first CLI :)

This commit is contained in:
Julian Hammer
2018-06-26 15:11:35 +02:00
parent 39659ee257
commit 0349de9e64
6 changed files with 111 additions and 64 deletions

0
asmjit/__init__.py Normal file
View File

33
asmjit/__main__.py Normal file
View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import argparse
from . import op, bench
def main():
    """Command line entry point of the benchmark toolkit.

    Parses the instruction declarations and benchmark parameters from the
    command line, initializes LLVM through ``bench.setup_llvm()``, runs
    ``bench.bench_instructions()`` and prints the resulting latency and
    throughput values.
    """
    cli = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit')
    # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput'])
    cli.add_argument(
        'instructions',
        metavar='INSTR',
        type=op.Instruction.from_string,
        nargs='+',
        help='instruction declaration, e.g., "add {src:i32:r} {srcdst:i32:r}"',
    )

    # Integer tuning knobs: (option flags, default value, help text).
    # They are registered in this order so the generated --help output
    # lists them in the same sequence.
    integer_options = (
        (('--latency-serial', '-l'), 8,
         'length of serial chain for each instruction in latency benchmark'),
        (('--parallel', '-p'), 4,
         'number of parallel instances of serial chains in throughput benchmark'),
        (('--throughput-serial', '-t'), 8,
         'length of serial instances of serial chains in throughput benchmark'),
    )
    for flags, default_value, help_text in integer_options:
        cli.add_argument(*flags, type=int, default=default_value, help=help_text)

    # Each additional -v raises the verbosity level by one.
    cli.add_argument("--verbose", "-v", action="count", default=0,
                     help="increase output verbosity")
    options = cli.parse_args()

    bench.setup_llvm()
    latency, throughput = bench.bench_instructions(
        options.instructions,
        serial_factor=options.latency_serial,
        parallel_factor=options.parallel,
        throughput_serial_factor=options.throughput_serial,
        verbosity=options.verbose,
    )
    print("Latency: {}\nThroughput: {}\n".format(latency, throughput))


if __name__ == "__main__":
    main()

View File

@@ -3,11 +3,20 @@ import ctypes
import time
import textwrap
import itertools
import re
from pprint import pprint
import llvmlite.binding as llvm
import psutil
from asmjit import op
from . import op
def setup_llvm():
    """Run the llvmlite initialization routines needed before benchmarking.

    Calls, in this order, ``llvm.initialize``,
    ``llvm.initialize_native_target``, ``llvm.initialize_native_asmprinter``
    and ``llvm.initialize_native_asmparser``.
    """
    initializer_names = (
        'initialize',
        'initialize_native_target',
        'initialize_native_asmprinter',
        'initialize_native_asmparser',
    )
    # Look each routine up right before calling it, so lookups and calls
    # happen strictly one after another.
    for initializer_name in initializer_names:
        getattr(llvm, initializer_name)()
def uniquify(l):
@@ -31,7 +40,7 @@ class Benchmark:
def prepare_arguments(previous_args=None, time_factor=1.0):
"""Build argument tuple, to be passed to low level function."""
if previous_args is None:
return 100,
return 10000000,
else:
return int(previous_args[0] * time_factor),
@@ -63,7 +72,10 @@ class Benchmark:
"""Compile and return assembly from LLVM module."""
tm = self.get_target_machine()
tm.set_asm_verbosity(0)
return tm.emit_assembly(self.get_llvm_module())
asm = tm.emit_assembly(self.get_llvm_module())
# Remove double comments
asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
return asm
def get_function_ctype(self):
return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
@@ -72,6 +84,7 @@ class Benchmark:
# Compile the module to machine code using MCJIT
tm = self.get_target_machine()
runtimes = []
return_values = []
args = self.prepare_arguments()
with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
ee.finalize_object()
@@ -88,11 +101,10 @@ class Benchmark:
# TODO replace time.perf_counter with a C implementation for less overhead
# TODO return result in machine readable format
fixed_args = False
results = []
for i in range(repeat):
while True:
start = time.perf_counter()
results.append(cfunc(*args))
ret = cfunc(*args)
end = time.perf_counter()
elapsed = end - start
if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
@@ -104,13 +116,14 @@ class Benchmark:
# After we have the right argument choice, we keep it.
fixed_args = True
break
return_values.append(ret)
runtimes.append(elapsed)
return {'iterations': self.get_iterations(args),
'arguments': args,
'runtimes': runtimes,
'frequency': psutil.cpu_freq().current * 1e6,
'results': results}
'returned': return_values}
class LoopBenchmark(Benchmark):
@@ -210,63 +223,62 @@ class IntegerLoopBenchmark(LoopBenchmark):
phi=textwrap.indent(self.get_phi_code(), ' '))
def bench_instruction(instruction, serial_factor=8, parallel_factor=8, parallel_serial_factor=8):
    """Benchmark a single instruction for latency and throughput.

    :param instruction: operation to benchmark; it is repeated inside
        ``op.Serialized`` chains.
    :param serial_factor: number of repetitions in the latency chain.
    :param parallel_factor: number of times the serial chain is listed in the
        ``op.Parallelized`` throughput kernel.
    :param parallel_serial_factor: number of repetitions in each serial chain
        of the throughput kernel.
    :return: tuple ``(lat, tp)``. Each is the smallest value, over all measured
        runtimes, of the runtime divided by the repetition factor(s), scaled by
        ``result['frequency'] / result['iterations']`` (presumably cycles per
        instruction -- confirm against ``Benchmark.get_iterations``).
    """
    def initial_values(operation):
        # One initial value per source register, chosen by its LLVM type.
        return [op.init_value_by_llvm_type[register.llvm_type]
                for register in operation.get_source_registers()]

    # Latency Benchmark: one dependency chain of the instruction.
    latency_kernel = op.Serialized([instruction] * serial_factor)
    latency_bench = IntegerLoopBenchmark(latency_kernel, initial_values(latency_kernel))
    latency_result = latency_bench.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
    latency_candidates = []
    for runtime in latency_result['runtimes']:
        latency_candidates.append(
            (runtime / serial_factor) * latency_result['frequency']
            / latency_result['iterations'])
    lat = min(*latency_candidates)

    # Throughput Benchmark: the same serial chain object listed
    # parallel_factor times.
    serial_chain = op.Serialized([instruction] * parallel_serial_factor)
    throughput_kernel = op.Parallelized([serial_chain] * parallel_factor)
    throughput_bench = IntegerLoopBenchmark(throughput_kernel, initial_values(throughput_kernel))
    throughput_result = throughput_bench.build_and_execute(
        repeat=4, min_elapsed=0.1, max_elapsed=0.2)
    throughput_candidates = []
    for runtime in throughput_result['runtimes']:
        throughput_candidates.append(
            (runtime / parallel_serial_factor / parallel_factor)
            * throughput_result['frequency'] / throughput_result['iterations'])
    tp = min(throughput_candidates)

    # Result compilation
    return lat, tp
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, parallel_serial_factor=8):
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
verbosity=0):
# Latency Benchmark
if verbosity > 0:
print('## Latency Benchmark')
p_instrs = []
for i in instructions:
p_instrs.append(op.Serialized([i] * serial_factor))
p = op.Parallelized(p_instrs)
init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in p.get_source_registers()]
b = IntegerLoopBenchmark(p, init_values)
if verbosity >= 3:
print('### LLVM IR')
print(b.build_ir())
if verbosity >= 2:
print('### Assembly')
print(b.get_assembly())
result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
for t in result['runtimes']])
if verbosity > 0:
print('### Detailed Results')
pprint(result)
print()
# Throughput Benchmark
if verbosity > 0:
print('## Throughput Benchmark')
p_instrs = []
for i in instructions:
p_instrs.append(op.Serialized([i] * parallel_serial_factor))
p_instrs.append(op.Serialized([i] * throughput_serial_factor))
p = op.Parallelized(p_instrs * parallel_factor)
init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
p.get_source_registers()]
b = IntegerLoopBenchmark(p, init_values)
if verbosity >= 3:
print('### LLVM IR')
print(b.build_ir())
if verbosity >= 2:
print('### Assembly')
print(b.get_assembly())
result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
tp = min(
[(t / parallel_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
[(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
for t in result['runtimes']])
if verbosity > 0:
print('### Detailed Results')
pprint(result)
print()
# Result compilation
return lat, tp
if __name__ == '__main__':
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
llvm.initialize_native_asmparser()
setup_llvm()
i1 = op.Instruction(
instruction='add $2, $0',
@@ -303,10 +315,10 @@ if __name__ == '__main__':
print(b.get_assembly())
print(b.build_and_execute())
print(bench_instruction(op.Instruction(
print(bench_instructions([op.Instruction(
instruction='add $2, $0',
destination_operand=op.Register('i64', 'r'),
source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])))
source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])]))
# if len(s.get_source_operand_types())
# b = IntegerLoopBenchmark(loop_body,

View File

@@ -1,10 +1,10 @@
#!/usr/bin/env python3
import re
# TODO use abc to force implementation of interface requirements
init_value_by_llvm_type = {'i' + bits: '1' for bits in ['1', '8', '16', '32', '64']}
init_value_by_llvm_type.update({fp_type: '1.0' for fp_type in ['float', 'double', 'fp128']})
init_value_by_llvm_type = {'i' + bits: '3' for bits in ['1', '8', '16', '32', '64']}
init_value_by_llvm_type.update({fp_type: '1.00023' for fp_type in ['float', 'double', 'fp128']})
init_value_by_llvm_type.update(
{'<{} x {}>'.format(vec, t): '<' + ', '.join([t + ' ' + v] * vec) + '>'
for t, v in init_value_by_llvm_type.items()
@@ -191,35 +191,37 @@ class Instruction(Operation):
Create Instruction object from string.
:param s: must have the form:
"asm_instruction_name ( (src|dst|srcdst):llvm_type:constraint_char)+"
"asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"
"""
instruction, operands = s.split(maxsplit=1)
instruction = s
# It is important that the match objects are in reverse order, to allow string replacements
# based on original match group locations
operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
# Destination indices start at 0, source indices at "number of destination operands"
dst_index, src_index = 0, ['dst' in o.group(1) for o in operands].count(True)
dst_ops = []
src_ops = []
src_index = list(['dst' in o.split()[0] for o in operands.split()]).count(True)
dst_index = 0
for o in operands.split():
direction, reg_options = o.split(':', maxsplit=1)
r = Register.from_string(reg_options)
valid = False
for m in operands:
direction, register_string = m.group(1, 2)
register = Register.from_string(register_string)
if 'src' in direction and not 'dst' in direction:
valid = True
src_ops.append(r)
instruction += " ${}".format(src_index)
src_ops.append(register)
# replace with index string
instruction = (instruction[:m.start()] + "${}".format(src_index)
+ instruction[m.end():])
src_index += 1
if 'dst' in direction:
valid = True
dst_ops.append(r)
instruction += " ${}".format(dst_index)
dst_ops.append(register)
# replace with index string
instruction = (instruction[:m.start()] + "${}".format(dst_index)
+ instruction[m.end():])
if 'src' in direction:
src_ops.append(Register(reg_options.split(':', 1)[0], str(dst_index)))
src_ops.append(Register(register_string.split(':', 1)[0], str(dst_index)))
dst_index += 1
if not valid:
raise ValueError("Invalid direction '{}', may only be src, dst or srcdst.".format(
direction))
assert len(dst_ops) == 1, "Instruction supports only single destinations."
return cls(instruction, dst_ops[0], src_ops)
if len(dst_ops) != 1:
raise ValueError("Instruction supports only single destinations.")
return cls(instruction, dst_ops[0], src_ops)
class Load(Operation):
@@ -404,4 +406,4 @@ if __name__ == '__main__':
s4 = Serialized([i1, i2, i3, i4, i5, i6])
print(s4.build_ir(['%out'], ['%in']), '\n')
print(Instruction.from_string("add src:i64:r srcdst:i64:r"))
print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))

View File

@@ -10,5 +10,5 @@ setup(
author='Julian Hammer',
author_email='julian.hammer@u-sys.org',
description='A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT',
install_requires=['llvmlite>=0.23.2'],
install_requires=['llvmlite>=0.23.2', 'psutil'],
)

View File

@@ -443,7 +443,7 @@ def main():
# Benchmark TP and Lat for each instruction
# for instr_name, instr_op in instructions.items():
# tp, lat = bench.bench_instruction(instr_op)
# tp, lat = bench.bench_instructions([instr_op])
# print("{:>12} {:>5.2f} {:>5.2f}".format(instr_name, tp, lat))
# Benchmark TP and Lat for all valid instruction pairs