From 0349de9e64ffa220771cb98601024601922262b7 Mon Sep 17 00:00:00 2001 From: Julian Hammer Date: Tue, 26 Jun 2018 15:11:35 +0200 Subject: [PATCH] first CLI :) --- asmjit/__init__.py | 0 asmjit/__main__.py | 33 +++++++++++++++++ asmjit/bench.py | 88 ++++++++++++++++++++++++++-------------------- asmjit/op.py | 50 +++++++++++++------------- setup.py | 2 +- tablegen.py | 2 +- 6 files changed, 111 insertions(+), 64 deletions(-) create mode 100644 asmjit/__init__.py create mode 100644 asmjit/__main__.py diff --git a/asmjit/__init__.py b/asmjit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/asmjit/__main__.py b/asmjit/__main__.py new file mode 100644 index 0000000..443b9e3 --- /dev/null +++ b/asmjit/__main__.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import argparse + +from . import op, bench + + +def main(): + parser = argparse.ArgumentParser(description='Assembly Instruction Benchmark Toolkit') + # parser.add_argument('mode', metavar='MODE', type=str, choices=['latency', 'throughput']) + parser.add_argument('instructions', metavar='INSTR', type=op.Instruction.from_string, nargs='+', + help='instruction declaration, e.g., "add {src:i32:r} {srcdst:i32:r}"') + parser.add_argument('--latency-serial', '-l', type=int, default=8, + help='length of serial chain for each instruction in latency benchmark') + parser.add_argument('--parallel', '-p',type=int, default=4, + help='number of parallel instances of serial chains in throughput ' + 'benchmark') + parser.add_argument('--throughput-serial', '-t', type=int, default=8, + help='length of serial instances of serial chains in throughput benchmark') + parser.add_argument("--verbose", "-v", action="count", default=0, + help="increase output verbosity") + args = parser.parse_args() + + bench.setup_llvm() + lat, tp = bench.bench_instructions(args.instructions, + serial_factor=args.latency_serial, + parallel_factor=args.parallel, + throughput_serial_factor=args.throughput_serial, + verbosity=args.verbose) + print("Latency: {}\nThroughput: {}\n".format(lat, tp)) + + +if __name__ == "__main__": + main() diff --git a/asmjit/bench.py b/asmjit/bench.py index 964ad56..e679f4f 100755 --- a/asmjit/bench.py +++ b/asmjit/bench.py @@ -3,11 +3,20 @@ import ctypes import time import textwrap import itertools +import re +from pprint import pprint import llvmlite.binding as llvm import psutil -from asmjit import op +from . import op + + +def setup_llvm(): + llvm.initialize() + llvm.initialize_native_target() + llvm.initialize_native_asmprinter() + llvm.initialize_native_asmparser() def uniquify(l): @@ -31,7 +40,7 @@ class Benchmark: def prepare_arguments(previous_args=None, time_factor=1.0): """Build argument tuple, to be passed to low level function.""" if previous_args is None: - return 100, + return 10000000, else: return int(previous_args[0] * time_factor), @@ -63,7 +72,10 @@ class Benchmark: """Compile and return assembly from LLVM module.""" tm = self.get_target_machine() tm.set_asm_verbosity(0) - return tm.emit_assembly(self.get_llvm_module()) + asm = tm.emit_assembly(self.get_llvm_module()) + # Remove double comments + asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm) + return asm def get_function_ctype(self): return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64) @@ -72,6 +84,7 @@ class Benchmark: # Compile the module to machine code using MCJIT tm = self.get_target_machine() runtimes = [] + return_values = [] args = self.prepare_arguments() with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee: ee.finalize_object() @@ -88,11 +101,10 @@ class Benchmark: # TODO replace time.clock with a C implemententation for less overhead # TODO return result in machine readable format fixed_args = False - results = [] for i in range(repeat): while True: start = time.perf_counter() - results.append(cfunc(*args)) + ret = cfunc(*args) end = time.perf_counter() elapsed = end - start if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed): @@ -104,13 +116,14 @@ class Benchmark: # After we have the right argument choice, we keep it. fixed_args = True break + return_values.append(ret) runtimes.append(elapsed) return {'iterations': self.get_iterations(args), 'arguments': args, 'runtimes': runtimes, 'frequency': psutil.cpu_freq().current * 1e6, - 'results': results} + 'returned': return_values} class LoopBenchmark(Benchmark): @@ -210,63 +223,62 @@ class IntegerLoopBenchmark(LoopBenchmark): phi=textwrap.indent(self.get_phi_code(), ' ')) -def bench_instruction(instruction, serial_factor=8, parallel_factor=8, parallel_serial_factor=8): - # Latency Benchmark - s = op.Serialized([instruction] * serial_factor) - init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in s.get_source_registers()] - b = IntegerLoopBenchmark(s, init_values) - result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) - lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations'] - for t in result['runtimes']]) - - # Throughput Benchmark - p = op.Parallelized([op.Serialized([instruction] * parallel_serial_factor)] * parallel_factor) - init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in - p.get_source_registers()] - b = IntegerLoopBenchmark(p, init_values) - result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) - tp = min( - [(t / parallel_serial_factor / parallel_factor) * result['frequency'] / result['iterations'] - for t in result['runtimes']]) - - # Result compilation - return lat, tp - - -def bench_instructions(instructions, serial_factor=8, parallel_factor=4, parallel_serial_factor=8): +def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8, + verbosity=0): # Latency Benchmark + if verbosity > 0: + print('## Latency Benchmark') p_instrs = [] for i in instructions: p_instrs.append(op.Serialized([i] * serial_factor)) p = op.Parallelized(p_instrs) init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in p.get_source_registers()] b = IntegerLoopBenchmark(p, init_values) + if verbosity >= 3: + print('### LLVM IR') + print(b.build_ir()) + if verbosity >= 2: + print('### Assembly') + print(b.get_assembly()) result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations'] for t in result['runtimes']]) + if verbosity > 0: + print('### Detailed Results') + pprint(result) + print() # Throughput Benchmark + if verbosity > 0: + print('## Throughput Benchmark') p_instrs = [] for i in instructions: - p_instrs.append(op.Serialized([i] * parallel_serial_factor)) + p_instrs.append(op.Serialized([i] * throughput_serial_factor)) p = op.Parallelized(p_instrs * parallel_factor) init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in p.get_source_registers()] b = IntegerLoopBenchmark(p, init_values) + if verbosity >= 3: + print('### LLVM IR') + print(b.build_ir()) + if verbosity >= 2: + print('### Assembly') + print(b.get_assembly()) result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) tp = min( - [(t / parallel_serial_factor / parallel_factor) * result['frequency'] / result['iterations'] + [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations'] for t in result['runtimes']]) + if verbosity > 0: + print('### Detailed Results') + pprint(result) + print() # Result compilation return lat, tp if __name__ == '__main__': - llvm.initialize() - llvm.initialize_native_target() - llvm.initialize_native_asmprinter() - llvm.initialize_native_asmparser() + setup_llvm() i1 = op.Instruction( instruction='add $2, $0', @@ -303,10 +315,10 @@ if __name__ == '__main__': print(b.get_assembly()) print(b.build_and_execute()) - print(bench_instruction(op.Instruction( + print(bench_instructions([op.Instruction( instruction='add $2, $0', destination_operand=op.Register('i64', 'r'), - source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')]))) + source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])])) # if len(s.get_source_operand_types()) # b = IntegerLoopBenchmark(loop_body, diff --git a/asmjit/op.py b/asmjit/op.py index 563521f..c60033d 100755 --- a/asmjit/op.py +++ b/asmjit/op.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 - +import re # TODO use abc to force implementation of interface requirements -init_value_by_llvm_type = {'i' + bits: '1' for bits in ['1', '8', '16', '32', '64']} -init_value_by_llvm_type.update({fp_type: '1.0' for fp_type in ['float', 'double', 'fp128']}) +init_value_by_llvm_type = {'i' + bits: '3' for bits in ['1', '8', '16', '32', '64']} +init_value_by_llvm_type.update({fp_type: '1.00023' for fp_type in ['float', 'double', 'fp128']}) init_value_by_llvm_type.update( {'<{} x {}>'.format(vec, t): '<' + ', '.join([t + ' ' + v] * vec) + '>' for t, v in init_value_by_llvm_type.items() @@ -191,35 +191,37 @@ class Instruction(Operation): Create Instruction object from string. :param s: must have the form: - "asm_instruction_name ( (src|dst|srcdst):llvm_type:constraint_char)+" + "asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+" """ - instruction, operands = s.split(maxsplit=1) + instruction = s + # It is important that the match objects are in reverse order, to allow string replacements + # based on original match group locations + operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s)))) + # Destination indices start at 0, source indices at "number of destination operands" + dst_index, src_index = 0, ['dst' in o.group(1) for o in operands].count(True) dst_ops = [] src_ops = [] - src_index = list(['dst' in o.split()[0] for o in operands.split()]).count(True) - dst_index = 0 - for o in operands.split(): - direction, reg_options = o.split(':', maxsplit=1) - r = Register.from_string(reg_options) - valid = False + for m in operands: + direction, register_string = m.group(1, 2) + register = Register.from_string(register_string) if 'src' in direction and not 'dst' in direction: - valid = True - src_ops.append(r) - instruction += " ${}".format(src_index) + src_ops.append(register) + # replace with index string + instruction = (instruction[:m.start()] + "${}".format(src_index) + + instruction[m.end():]) src_index += 1 if 'dst' in direction: - valid = True - dst_ops.append(r) - instruction += " ${}".format(dst_index) + dst_ops.append(register) + # replace with index string + instruction = (instruction[:m.start()] + "${}".format(dst_index) + + instruction[m.end():]) if 'src' in direction: - src_ops.append(Register(reg_options.split(':', 1)[0], str(dst_index))) + src_ops.append(Register(register_string.split(':', 1)[0], str(dst_index))) dst_index += 1 - if not valid: - raise ValueError("Invalid direction '{}', may only be src, dst or srcdst.".format( - direction)) - assert len(dst_ops) == 1, "Instruction supports only single destinations." - return cls(instruction, dst_ops[0], src_ops) + if len(dst_ops) != 1: + raise ValueError("Instruction supports only single destinations.") + return cls(instruction, dst_ops[0], src_ops) class Load(Operation): @@ -404,4 +406,4 @@ if __name__ == '__main__': s4 = Serialized([i1, i2, i3, i4, i5, i6]) print(s4.build_ir(['%out'], ['%in']), '\n') - print(Instruction.from_string("add src:i64:r srcdst:i64:r")) + print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}")) diff --git a/setup.py b/setup.py index b5f2854..6a48a82 100644 --- a/setup.py +++ b/setup.py @@ -10,5 +10,5 @@ setup( author='Julian Hammer', author_email='julian.hammer@u-sys.org', description='A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT', - install_requires=['llvmlite>=0.23.2'], + install_requires=['llvmlite>=0.23.2', 'psutil'], ) diff --git a/tablegen.py b/tablegen.py index 64fb61a..67f8fff 100755 --- a/tablegen.py +++ b/tablegen.py @@ -443,7 +443,7 @@ def main(): # Benchmark TP and Lat for each instruction # for instr_name, instr_op in instructions.items(): - # tp, lat = bench.bench_instruction(instr_op) + # tp, lat = bench.bench_instructions([instr_op]) # print("{:>12} {:>5.2f} {:>5.2f}".format(instr_name, tp, lat)) # Benchmark TP and Lat for all valid instruction pairs