diff --git a/asmjit/__main__.py b/asmjit/__main__.py index bd3878d..49effb9 100644 --- a/asmjit/__main__.py +++ b/asmjit/__main__.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 import argparse +import llvmlite.binding as llvm + from . import op, bench @@ -18,6 +20,9 @@ def main(): 'benchmark') parser.add_argument('--throughput-serial', '-t', type=int, default=8, help='length of serial instances of serial chains in throughput benchmark') + parser.add_argument('--iaca', type=str, default=None, + help='Compare throughput measurement with IACA analysis, pass ' + 'micro-architecuture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)') parser.add_argument("--verbose", "-v", action="count", default=0, help="increase output verbosity") args = parser.parse_args() @@ -28,12 +33,10 @@ def main(): parallel_factor=args.parallel, throughput_serial_factor=args.throughput_serial, serialize=args.serialize, - verbosity=args.verbose) + verbosity=args.verbose, + iaca_comparison=args.iaca) print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp)) - #b = bench.IntegerLoopBenchmark(args.instructions[0]) - #b.get_iaca_analysis() - if __name__ == "__main__": main() diff --git a/asmjit/bench.py b/asmjit/bench.py index ec1f3a8..a9cdcca 100755 --- a/asmjit/bench.py +++ b/asmjit/bench.py @@ -7,9 +7,14 @@ import re from pprint import pprint import tempfile import subprocess +import sys import llvmlite.binding as llvm import psutil +try: + from kerncraft import iaca +except ImportError: + iaca = None from . import op @@ -28,10 +33,6 @@ def uniquify(l): class Benchmark: - def __init__(self): - self._tm = None - self._llvm_module = None - def __repr__(self): return '{}({})'.format( self.__class__.__name__, @@ -47,34 +48,30 @@ class Benchmark: return int(previous_args[0] * time_factor), @staticmethod - def get_iterations(args): + def get_iterations(args) -> int: """Return number of iterations performed, based on lower level function arguments.""" return args[0] def build_ir(self): raise NotImplementedError() - def get_llvm_module(self): + def get_llvm_module(self, iaca_marker=False): """Build and return LLVM module from LLVM IR code.""" - if self._llvm_module is None: - self._llvm_module = llvm.parse_assembly(self.build_ir()) - self._llvm_module.verify() - return self._llvm_module + ir = self.build_ir(iaca_marker=iaca_marker) + return llvm.parse_assembly(ir) def get_target_machine(self): """Instantiate and return target machine.""" - if self._tm is None: - features = llvm.get_host_cpu_features().flatten() - cpu = llvm.get_host_cpu_name() - self._tm = llvm.Target.from_default_triple().create_target_machine( - cpu=cpu, features=features, opt=3) - return self._tm + features = llvm.get_host_cpu_features().flatten() + cpu = llvm.get_host_cpu_name() + return llvm.Target.from_default_triple().create_target_machine( + cpu=cpu, features=features, opt=3) - def get_assembly(self): + def get_assembly(self, iaca_marker=False): """Compile and return assembly from LLVM module.""" tm = self.get_target_machine() tm.set_asm_verbosity(0) - asm = tm.emit_assembly(self.get_llvm_module()) + asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker)) # Remove double comments asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm) return asm @@ -82,17 +79,15 @@ class Benchmark: def get_function_ctype(self): return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64) - def get_iaca_analysis(self): + def get_iaca_analysis(self, arch): """Compile and return IACA analysis.""" + if iaca is None: + raise ValueError("kerncraft not installed. IACA analysis is not supported.") tm = self.get_target_machine() tmpf = tempfile.NamedTemporaryFile("wb") - tmpf.write(tm.emit_object(self.get_llvm_module())) + tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True))) tmpf.flush() - - # assuming "iaca.sh" to be available - subprocess.check_output(['objdump', tmpf.name]) - # WORK IN PROGRESS - + return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch) def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3): # Compile the module to machine code using MCJIT @@ -141,10 +136,11 @@ class Benchmark: class LoopBenchmark(Benchmark): - def __init__(self, root_synth, init_values=None): + def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True): super().__init__() self.root_synth = root_synth self.init_values = init_values or root_synth.get_default_init_values() + self.loop_carried_dependencies = loop_carried_dependencies if len(root_synth.get_source_registers()) != len(self.init_values): raise ValueError("Number of init values and source registers do not match.") @@ -156,7 +152,9 @@ class LoopBenchmark(Benchmark): return ['%out.{}'.format(i) for i in range(len(self.root_synth.get_destination_registers()))] - def get_phi_code(self, latency=True): + def get_phi_code(self): + if not self.loop_carried_dependencies: + return '' # Compile loop carried dependencies lcd = [] # Change in naming (src <-> dst) is on purpose! @@ -210,8 +208,19 @@ class LoopBenchmark(Benchmark): class IntegerLoopBenchmark(LoopBenchmark): - def build_ir(self): - return textwrap.dedent('''\ + def build_ir(self, iaca_marker=False): + if iaca_marker: + iaca_start_marker = textwrap.dedent('''\ + call void asm "movl $$111,%ebx", ""() + call void asm ".byte 100,103,144", ""()''') + iaca_stop_marker = textwrap.dedent('''\ + call void asm "movl $$222,%ebx", ""() + call void asm ".byte 100,103,144", ""()''') + else: + iaca_start_marker = '' + iaca_stop_marker = '' + + ir = textwrap.dedent('''\ define i64 @"test"(i64 %"N") {{ entry: @@ -221,24 +230,30 @@ class IntegerLoopBenchmark(LoopBenchmark): loop: %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"] {phi} + {iaca_start_marker} {loop_body} %"loop_counter.1" = add i64 %"loop_counter", 1 %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N" br i1 %"loop_cond.1", label %"loop", label %"end" - + end: %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"] + {iaca_stop_marker} ret i64 %"ret" }} ''').format( loop_body=textwrap.indent( self.root_synth.build_ir(self.get_destination_names(), self.get_source_names()), ' '), - phi=textwrap.indent(self.get_phi_code(), ' ')) + phi=textwrap.indent(self.get_phi_code(), ' '), + iaca_start_marker=iaca_start_marker, + iaca_stop_marker=iaca_stop_marker) + + return ir def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8, - serialize=False, verbosity=0): + serialize=False, verbosity=0, iaca_comparison=None): not_serializable = False try: # Latency Benchmark @@ -261,6 +276,7 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations'] for t in result['runtimes']]) + result['latency'] = lat if verbosity > 0: print('### Detailed Results') pprint(result) @@ -294,10 +310,20 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through tp = min( [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations'] for t in result['runtimes']]) + result['throughput'] = tp + if iaca_comparison is not None: + iaca_analysis = b.get_iaca_analysis(iaca_comparison) + result['iaca throughput'] = iaca_analysis['throughput']/( + parallel_factor * throughput_serial_factor) if verbosity > 0: print('### Detailed Results') pprint(result) print() + if verbosity > 1 and iaca_comparison is not None: + print('### IACA Results') + print(iaca_analysis['output']) + print('!!! throughput_serial_factor={} and parallel_factor={}'.format( + throughput_serial_factor, parallel_factor)) # Result compilation return lat, tp diff --git a/asmjit/op.py b/asmjit/op.py index 418da11..f1c206b 100755 --- a/asmjit/op.py +++ b/asmjit/op.py @@ -205,9 +205,18 @@ class Instruction(Operation): self.source_operands = source_operands def get_source_registers(self): - return [sop for sop in self.source_operands if isinstance(sop, Register)] + \ - [r for mop in self.source_operands if isinstance(mop, MemoryReference) - for r in mop.get_registers()] + sop_types = set() + sr = [] + for sop in self.source_operands: + if isinstance(sop, Register): + t = (sop.llvm_type, sop.get_constraint_char()) + if t not in sop_types: + sop_types.add(t) + sr.append(sop) + elif isinstance(sop, MemoryReference): + sr += list(sop.get_registers()) + + return sr def get_destination_registers(self): if isinstance(self.destination_operand, Register): @@ -229,6 +238,7 @@ class Instruction(Operation): # Build argument string from operands and register names operands = [] + sop_types = {} i = 0 for sop in self.source_operands: if isinstance(sop, Immediate): @@ -236,10 +246,17 @@ class Instruction(Operation): type=sop.llvm_type, repr=sop.value)) elif isinstance(sop, Register): - operands.append('{type} {repr}'.format( - type=sop.llvm_type, - repr=src_reg_names[i])) - i += 1 + sop_t = (sop.llvm_type, sop.get_constraint_char()) + if sop_t in sop_types: + operands.append('{type} {repr}'.format( + type=sop.llvm_type, + repr=src_reg_names[sop_types[sop_t]])) + else: + sop_types[sop_t] = i + operands.append('{type} {repr}'.format( + type=sop.llvm_type, + repr=src_reg_names[i])) + i += 1 elif isinstance(sop, MemoryReference): operands.append('{type} {repr}'.format( type=sop.llvm_type, diff --git a/tablegen.py b/tablegen.py index cd6cea4..4535512 100755 --- a/tablegen.py +++ b/tablegen.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 import sys +import textwrap + import collections import re import itertools @@ -466,44 +468,89 @@ def main(): instructions_ret_type = collections.defaultdict(collections.OrderedDict) if args.verbosity > 0: for ret_type in rel_instruction_names: - print(ret_type, 'has', len(instrs), 'instructions') + print(ret_type, 'has', len(instructions_ret_type[ret_type]), 'instructions') # Benchmark random instruction sequences for instr_name, instr_op in instructions.items(): instructions_ret_type[instr_op.get_destination_registers()[0].llvm_type][ instr_name] = (instr_name, instr_op) # Constructing random benchmarks, one for each return type - random.seed(42) - parallel_factor = 8 - for t in sorted(instructions_ret_type): - valid = False - while not valid: - selected_names, selected_instrs = zip( - *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)]) + #random.seed(42) + #parallel_factor = 8 + #for t in sorted(instructions_ret_type): + # valid = False + # while not valid: + # selected_names, selected_instrs = zip( + # *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)]) + # + # if not all([can_serialize(i) for i in selected_instrs]): + # continue + # else: + # valid = True + # + # serial = op.Serialized(selected_instrs) + # p = op.Parallelized([serial] * parallel_factor) + # + # init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in + # p.get_source_registers()] + # b = bench.IntegerLoopBenchmark(p, init_values) + # print('## Selected Instructions') + # print(', '.join(selected_names)) + # print('## Generated Assembly ({}x parallel)'.format(parallel_factor)) + # print(b.get_assembly()) + # #pprint(selected_instrs) + # r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) + # r['parallel_factor'] = parallel_factor + # print('## Detailed Results') + # pprint(r) + # print("minimal throughput: {:.2f} cy".format( + # min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor)) - if not all([can_serialize(i) for i in selected_instrs]): - continue - else: - valid = True + # Reduce to 100 instructions: + #instructions = dict(list(instructions.items())[:100]) - serial = op.Serialized(selected_instrs) - p = op.Parallelized([serial] * parallel_factor) - - init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in - p.get_source_registers()] - b = bench.IntegerLoopBenchmark(p, init_values) - print('## Selected Instructions') - print(', '.join(selected_names)) - print('## Generated Assembly ({}x parallel)'.format(parallel_factor)) - print(b.get_assembly()) - #pprint(selected_instrs) - r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) - r['parallel_factor'] = parallel_factor - print('## Detailed Results') - pprint(r) - print("minimal throughput: {:.2f} cy".format( - min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor)) + # Reduce to set of instructions used in Stream Triad: + instructions = {k: v for k,v in instructions.items() if k in ['ADD32ri', 'ADD64ri32', 'INC64r', 'SUB32ri', 'VADDPDYrr', 'VADDSDrr', 'VADDSSrr', 'VCVTSI642SSrr', 'VFMADD213PDYr', 'VFMADD213PDr', 'VFMADD213PSYr', 'VFMADD213PSr', 'VFMADD213SDr', 'VFMADD213SSr', 'VINSERTF128rr', 'VMULPDYrr', 'VMULSDrr_Int', 'VMULSSrr_Int', 'VSUBSDrr_Int', 'VSUBSSrr_Int']} + random.seed(23) + instructions_per_run = 3 + parallel_factor = 4 + print(textwrap.dedent(""" + # This file contains example data measured on an Intel I7-6700HQ with 2.6GHz with Turbo mode + # disabled. + + # Comments are possible everywhere after hash symbols. + + # This part contains necessary configuration information. + configuration: + model: three-level # we assume that instructions are decomposed into uops + num_ports: 7 # our hardware has 4 execution ports + num_uops_per_insn: 4 # the maximal number of uops into which an instruction can be decomposed + slack_limit: 0.0 # relative margin of error for cycle measurements + + + # Here follows a list of experiments. + """)) + for i in range(100): + selected_names, selected_instrs = zip(*[random.choice(list(instructions.items())) + for i in range(instructions_per_run)]) + print("experiment:") + p = op.Parallelized(selected_instrs*parallel_factor) + b = bench.IntegerLoopBenchmark(p) + print(' instructions:') + print(' '+('\n '.join(selected_names))) + if args.verbosity > 0: + print(' ir:') + print(textwrap.indent(b.build_ir(), ' '*8)) + print(' asm:') + print(textwrap.indent(b.get_assembly(), ' '*8)) + r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2) + r['parallel_factor'] = parallel_factor + if args.verbosity > 0: + print(' detailed_result:') + pprint(r, indent=8) + print(" cycles: {:.2f}".format( + min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor)) def can_serialize(instr): if not any([so.llvm_type == instr.destination_operand.llvm_type and @@ -517,7 +564,7 @@ def can_serialize(instr): def combined_instructions(instructions, length): for instr_names in itertools.combinations(instructions, length): instrs = [instructions[n] for n in instr_names] - dst_types = list([i.get_destination_registers()[0].llvm_type ]) + dst_types = list([i.get_destination_registers()[0].llvm_type for i in instrs]) if not all([can_serialize(i) for i in instrs]) and dst_types[1:] == dst_types[:-1]: continue yield instrs