Add support for IACA analysis

2026-01-07 21:10:05 +01:00 · 2018-07-10 14:26:19 +02:00
parent 3dac7d6795
commit 6786c399ad
4 changed files with 166 additions and 73 deletions
--- a/asmjit/main.py
+++ b/asmjit/main.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 import argparse

+import llvmlite.binding as llvm
+
 from . import op, bench


@@ -18,6 +20,9 @@ def main():
                             'benchmark')
    parser.add_argument('--throughput-serial', '-t', type=int, default=8,
                        help='length of serial instances of serial chains in throughput benchmark')
+    parser.add_argument('--iaca', type=str, default=None,
+                        help='Compare throughput measurement with IACA analysis, pass '
+                             'micro-architecture abbreviation (e.g. SNB, IVB, HSW, SKL, SKX).')
    parser.add_argument("--verbose", "-v", action="count", default=0,
                        help="increase output verbosity")
    args = parser.parse_args()
@@ -28,12 +33,10 @@ def main():
                                       parallel_factor=args.parallel,
                                       throughput_serial_factor=args.throughput_serial,
                                       serialize=args.serialize,
-                                       verbosity=args.verbose)
+                                       verbosity=args.verbose,
+                                       iaca_comparison=args.iaca)
    print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))

-    #b = bench.IntegerLoopBenchmark(args.instructions[0])
-    #b.get_iaca_analysis()
-

 if __name__ == "__main__":
    main()
--- a/asmjit/bench.py
+++ b/asmjit/bench.py
@@ -7,9 +7,14 @@ import re
 from pprint import pprint
 import tempfile
 import subprocess
+import sys

 import llvmlite.binding as llvm
 import psutil
+try:
+    from kerncraft import iaca
+except ImportError:
+    iaca = None

 from . import op

@@ -28,10 +33,6 @@ def uniquify(l):


 class Benchmark:
-    def __init__(self):
-        self._tm = None
-        self._llvm_module = None
-
    def __repr__(self):
        return '{}({})'.format(
            self.__class__.__name__,
@@ -47,34 +48,30 @@ class Benchmark:
            return int(previous_args[0] * time_factor),

    @staticmethod
-    def get_iterations(args):
+    def get_iterations(args) -> int:
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def build_ir(self):
        raise NotImplementedError()

-    def get_llvm_module(self):
+    def get_llvm_module(self, iaca_marker=False):
        """Build and return LLVM module from LLVM IR code."""
-        if self._llvm_module is None:
-            self._llvm_module = llvm.parse_assembly(self.build_ir())
-            self._llvm_module.verify()
-        return self._llvm_module
+        ir = self.build_ir(iaca_marker=iaca_marker)
+        return llvm.parse_assembly(ir)

    def get_target_machine(self):
        """Instantiate and return target machine."""
-        if self._tm is None:
-            features = llvm.get_host_cpu_features().flatten()
-            cpu = llvm.get_host_cpu_name()
-            self._tm = llvm.Target.from_default_triple().create_target_machine(
-                cpu=cpu, features=features, opt=3)
-        return self._tm
+        features = llvm.get_host_cpu_features().flatten()
+        cpu = llvm.get_host_cpu_name()
+        return llvm.Target.from_default_triple().create_target_machine(
+             cpu=cpu, features=features, opt=3)

-    def get_assembly(self):
+    def get_assembly(self, iaca_marker=False):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
-        asm = tm.emit_assembly(self.get_llvm_module())
+        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
        # Remove double comments
        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
        return asm
@@ -82,17 +79,15 @@ class Benchmark:
    def get_function_ctype(self):
        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)

-    def get_iaca_analysis(self):
+    def get_iaca_analysis(self, arch):
        """Compile and return IACA analysis."""
+        if iaca is None:
+            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
        tm = self.get_target_machine()
        tmpf = tempfile.NamedTemporaryFile("wb")
-        tmpf.write(tm.emit_object(self.get_llvm_module()))
+        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
        tmpf.flush()
-
-        # assuming "iaca.sh" to be available
-        subprocess.check_output(['objdump', tmpf.name])
-        # WORK IN PROGRESS
-
+        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        # Compile the module to machine code using MCJIT
@@ -141,10 +136,11 @@ class Benchmark:


 class LoopBenchmark(Benchmark):
-    def __init__(self, root_synth, init_values=None):
+    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
        super().__init__()
        self.root_synth = root_synth
        self.init_values = init_values or root_synth.get_default_init_values()
+        self.loop_carried_dependencies = loop_carried_dependencies

        if len(root_synth.get_source_registers()) != len(self.init_values):
            raise ValueError("Number of init values and source registers do not match.")
@@ -156,7 +152,9 @@ class LoopBenchmark(Benchmark):
        return ['%out.{}'.format(i) for i in
                range(len(self.root_synth.get_destination_registers()))]

-    def get_phi_code(self, latency=True):
+    def get_phi_code(self):
+        if not self.loop_carried_dependencies:
+            return ''
        # Compile loop carried dependencies
        lcd = []
        # Change in naming (src <-> dst) is on purpose!
@@ -210,8 +208,19 @@ class LoopBenchmark(Benchmark):


 class IntegerLoopBenchmark(LoopBenchmark):
-    def build_ir(self):
-        return textwrap.dedent('''\
+    def build_ir(self, iaca_marker=False):
+        if iaca_marker:
+            iaca_start_marker = textwrap.dedent('''\
+                call void asm "movl    $$111,%ebx", ""()
+                call void asm ".byte   100,103,144", ""()''')
+            iaca_stop_marker = textwrap.dedent('''\
+                call void asm "movl    $$222,%ebx", ""()
+                call void asm ".byte   100,103,144", ""()''')
+        else:
+            iaca_start_marker = ''
+            iaca_stop_marker = ''
+
+        ir = textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
@@ -221,24 +230,30 @@ class IntegerLoopBenchmark(LoopBenchmark):
            loop:
              %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {phi}
+            {iaca_start_marker}
            {loop_body}
              %"loop_counter.1" = add i64 %"loop_counter", 1
              %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
              br i1 %"loop_cond.1", label %"loop", label %"end"
-
+            
            end:
              %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
+            {iaca_stop_marker}
              ret i64 %"ret"
            }}
            ''').format(
            loop_body=textwrap.indent(
                self.root_synth.build_ir(self.get_destination_names(),
                                         self.get_source_names()), '  '),
-            phi=textwrap.indent(self.get_phi_code(), '  '))
+            phi=textwrap.indent(self.get_phi_code(), '  '),
+            iaca_start_marker=iaca_start_marker,
+            iaca_stop_marker=iaca_stop_marker)
+
+        return ir


 def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
-                       serialize=False, verbosity=0):
+                       serialize=False, verbosity=0, iaca_comparison=None):
    not_serializable = False
    try:
        # Latency Benchmark
@@ -261,6 +276,7 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
        result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
        lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
                    for t in result['runtimes']])
+        result['latency'] = lat
        if verbosity > 0:
            print('### Detailed Results')
            pprint(result)
@@ -294,10 +310,20 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
    tp = min(
        [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
         for t in result['runtimes']])
+    result['throughput'] = tp
+    if iaca_comparison is not None:
+        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
+        result['iaca throughput'] = iaca_analysis['throughput']/(
+                parallel_factor * throughput_serial_factor)
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()
+    if verbosity > 1 and iaca_comparison is not None:
+        print('### IACA Results')
+        print(iaca_analysis['output'])
+        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
+            throughput_serial_factor, parallel_factor))

    # Result compilation
    return lat, tp
--- a/asmjit/op.py
+++ b/asmjit/op.py
@@ -205,9 +205,18 @@ class Instruction(Operation):
        self.source_operands = source_operands

    def get_source_registers(self):
-        return [sop for sop in self.source_operands if isinstance(sop, Register)] + \
-               [r for mop in self.source_operands if isinstance(mop, MemoryReference)
-                for r in mop.get_registers()]
+        sop_types = set()
+        sr = []
+        for sop in self.source_operands:
+            if isinstance(sop, Register):
+                t = (sop.llvm_type, sop.get_constraint_char())
+                if t not in sop_types:
+                    sop_types.add(t)
+                    sr.append(sop)
+            elif isinstance(sop, MemoryReference):
+                sr += list(sop.get_registers())
+
+        return sr

    def get_destination_registers(self):
        if isinstance(self.destination_operand, Register):
@@ -229,6 +238,7 @@ class Instruction(Operation):

        # Build argument string from operands and register names
        operands = []
+        sop_types = {}
        i = 0
        for sop in self.source_operands:
            if isinstance(sop, Immediate):
@@ -236,10 +246,17 @@ class Instruction(Operation):
                    type=sop.llvm_type,
                    repr=sop.value))
            elif isinstance(sop, Register):
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=src_reg_names[i]))
-                i += 1
+                sop_t = (sop.llvm_type, sop.get_constraint_char())
+                if sop_t in sop_types:
+                    operands.append('{type} {repr}'.format(
+                        type=sop.llvm_type,
+                        repr=src_reg_names[sop_types[sop_t]]))
+                else:
+                    sop_types[sop_t] = i
+                    operands.append('{type} {repr}'.format(
+                        type=sop.llvm_type,
+                        repr=src_reg_names[i]))
+                    i += 1
            elif isinstance(sop, MemoryReference):
                operands.append('{type} {repr}'.format(
                    type=sop.llvm_type,
--- a/tablegen.py
+++ b/tablegen.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3

 import sys
+import textwrap
+
 import collections
 import re
 import itertools
@@ -466,44 +468,89 @@ def main():
    instructions_ret_type = collections.defaultdict(collections.OrderedDict)
    if args.verbosity > 0:
        for ret_type in rel_instruction_names:
-            print(ret_type, 'has', len(instrs), 'instructions')
+            print(ret_type, 'has', len(instructions_ret_type[ret_type]), 'instructions')

    # Group instructions by the LLVM type of their first destination register
    for instr_name, instr_op in instructions.items():
        instructions_ret_type[instr_op.get_destination_registers()[0].llvm_type][
            instr_name] = (instr_name, instr_op)
    # Constructing random benchmarks, one for each return type
-    random.seed(42)
-    parallel_factor = 8
-    for t in sorted(instructions_ret_type):
-        valid = False
-        while not valid:
-            selected_names, selected_instrs = zip(
-                *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
+    #random.seed(42)
+    #parallel_factor = 8
+    #for t in sorted(instructions_ret_type):
+    #    valid = False
+    #    while not valid:
+    #        selected_names, selected_instrs = zip(
+    #            *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
+    #
+    #        if not all([can_serialize(i) for i in selected_instrs]):
+    #            continue
+    #        else:
+    #            valid = True
+    #
+    #        serial = op.Serialized(selected_instrs)
+    #        p = op.Parallelized([serial] * parallel_factor)
+    #
+    #        init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
+    #                       p.get_source_registers()]
+    #        b = bench.IntegerLoopBenchmark(p, init_values)
+    #        print('## Selected Instructions')
+    #        print(', '.join(selected_names))
+    #        print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
+    #        print(b.get_assembly())
+    #        #pprint(selected_instrs)
+    #        r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
+    #        r['parallel_factor'] = parallel_factor
+    #        print('## Detailed Results')
+    #        pprint(r)
+    #        print("minimal throughput: {:.2f} cy".format(
+    #            min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))

-            if not all([can_serialize(i) for i in selected_instrs]):
-                continue
-            else:
-                valid = True
+    # Reduce to 100 instructions:
+    #instructions = dict(list(instructions.items())[:100])

-            serial = op.Serialized(selected_instrs)
-            p = op.Parallelized([serial] * parallel_factor)
-
-            init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
-                           p.get_source_registers()]
-            b = bench.IntegerLoopBenchmark(p, init_values)
-            print('## Selected Instructions')
-            print(', '.join(selected_names))
-            print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
-            print(b.get_assembly())
-            #pprint(selected_instrs)
-            r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
-            r['parallel_factor'] = parallel_factor
-            print('## Detailed Results')
-            pprint(r)
-            print("minimal throughput: {:.2f} cy".format(
-                min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
+    # Reduce to set of instructions used in Stream Triad:
+    instructions = {k: v for k,v in instructions.items() if k in ['ADD32ri', 'ADD64ri32', 'INC64r', 'SUB32ri', 'VADDPDYrr', 'VADDSDrr', 'VADDSSrr', 'VCVTSI642SSrr', 'VFMADD213PDYr', 'VFMADD213PDr', 'VFMADD213PSYr', 'VFMADD213PSr', 'VFMADD213SDr', 'VFMADD213SSr', 'VINSERTF128rr', 'VMULPDYrr', 'VMULSDrr_Int', 'VMULSSrr_Int', 'VSUBSDrr_Int', 'VSUBSSrr_Int']}

+    random.seed(23)
+    instructions_per_run = 3
+    parallel_factor = 4
+    print(textwrap.dedent("""
+        # This file contains example data measured on an Intel I7-6700HQ with 2.6GHz with Turbo mode
+        # disabled.
+        
+        # Comments are possible everywhere after hash symbols.
+        
+        # This part contains necessary configuration information.
+        configuration:
+            model: three-level   # we assume that instructions are decomposed into uops
+            num_ports: 7         # our hardware has 4 execution ports
+            num_uops_per_insn: 4 # the maximal number of uops into which an instruction can be decomposed
+            slack_limit: 0.0     # relative margin of error for cycle measurements
+        
+        
+        # Here follows a list of experiments.
+        """))
+    for i in range(100):
+        selected_names, selected_instrs = zip(*[random.choice(list(instructions.items()))
+                                                for i in range(instructions_per_run)])
+        print("experiment:")
+        p = op.Parallelized(selected_instrs*parallel_factor)
+        b = bench.IntegerLoopBenchmark(p)
+        print('    instructions:')
+        print('        '+('\n        '.join(selected_names)))
+        if args.verbosity > 0:
+            print('    ir:')
+            print(textwrap.indent(b.build_ir(), ' '*8))
+            print('    asm:')
+            print(textwrap.indent(b.get_assembly(), ' '*8))
+        r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
+        r['parallel_factor'] = parallel_factor
+        if args.verbosity > 0:
+            print('    detailed_result:')
+            pprint(r, indent=8)
+        print("    cycles: {:.2f}".format(
+            min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))

 def can_serialize(instr):
    if not any([so.llvm_type == instr.destination_operand.llvm_type and
@@ -517,7 +564,7 @@ def can_serialize(instr):
 def combined_instructions(instructions, length):
    for instr_names in itertools.combinations(instructions, length):
        instrs = [instructions[n] for n in instr_names]
-        dst_types = list([i.get_destination_registers()[0].llvm_type ])
+        dst_types = list([i.get_destination_registers()[0].llvm_type for i in instrs])
        if not all([can_serialize(i) for i in instrs]) and dst_types[1:] == dst_types[:-1]:
            continue
        yield instrs