diff --git a/asmjit/__main__.py b/asmjit/__main__.py
index bd3878d..49effb9 100644
--- a/asmjit/__main__.py
+++ b/asmjit/__main__.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 import argparse
 
+import llvmlite.binding as llvm
+
 from . import op, bench
 
 
@@ -18,6 +20,9 @@ def main():
                              'benchmark')
     parser.add_argument('--throughput-serial', '-t', type=int, default=8,
                         help='length of serial instances of serial chains in throughput benchmark')
+    parser.add_argument('--iaca', type=str, default=None,
+                        help='Compare throughput measurement with IACA analysis, pass '
+                             'micro-architecture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)')
     parser.add_argument("--verbose", "-v", action="count", default=0,
                         help="increase output verbosity")
     args = parser.parse_args()
@@ -28,12 +33,10 @@ def main():
                                        parallel_factor=args.parallel,
                                        throughput_serial_factor=args.throughput_serial,
                                        serialize=args.serialize,
-                                       verbosity=args.verbose)
+                                       verbosity=args.verbose,
+                                       iaca_comparison=args.iaca)
     print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
 
-    #b = bench.IntegerLoopBenchmark(args.instructions[0])
-    #b.get_iaca_analysis()
-
 
 if __name__ == "__main__":
     main()
diff --git a/asmjit/bench.py b/asmjit/bench.py
index ec1f3a8..a9cdcca 100755
--- a/asmjit/bench.py
+++ b/asmjit/bench.py
@@ -7,9 +7,14 @@ import re
 from pprint import pprint
 import tempfile
 import subprocess
+import sys
 
 import llvmlite.binding as llvm
 import psutil
+try:
+    from kerncraft import iaca
+except ImportError:
+    iaca = None
 
 from . import op
 
@@ -28,10 +33,6 @@ def uniquify(l):
 
 
 class Benchmark:
-    def __init__(self):
-        self._tm = None
-        self._llvm_module = None
-
     def __repr__(self):
         return '{}({})'.format(
             self.__class__.__name__,
@@ -47,34 +48,30 @@ class Benchmark:
             return int(previous_args[0] * time_factor),
 
     @staticmethod
-    def get_iterations(args):
+    def get_iterations(args) -> int:
         """Return number of iterations performed, based on lower level function arguments."""
         return args[0]
 
     def build_ir(self):
         raise NotImplementedError()
 
-    def get_llvm_module(self):
+    def get_llvm_module(self, iaca_marker=False):
         """Build and return LLVM module from LLVM IR code."""
-        if self._llvm_module is None:
-            self._llvm_module = llvm.parse_assembly(self.build_ir())
-            self._llvm_module.verify()
-        return self._llvm_module
+        ir = self.build_ir(iaca_marker=iaca_marker)
+        return llvm.parse_assembly(ir)
 
     def get_target_machine(self):
         """Instantiate and return target machine."""
-        if self._tm is None:
-            features = llvm.get_host_cpu_features().flatten()
-            cpu = llvm.get_host_cpu_name()
-            self._tm = llvm.Target.from_default_triple().create_target_machine(
-                cpu=cpu, features=features, opt=3)
-        return self._tm
+        features = llvm.get_host_cpu_features().flatten()
+        cpu = llvm.get_host_cpu_name()
+        return llvm.Target.from_default_triple().create_target_machine(
+             cpu=cpu, features=features, opt=3)
 
-    def get_assembly(self):
+    def get_assembly(self, iaca_marker=False):
         """Compile and return assembly from LLVM module."""
         tm = self.get_target_machine()
         tm.set_asm_verbosity(0)
-        asm = tm.emit_assembly(self.get_llvm_module())
+        asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
         # Remove double comments
         asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
         return asm
@@ -82,17 +79,15 @@ class Benchmark:
     def get_function_ctype(self):
         return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
 
-    def get_iaca_analysis(self):
+    def get_iaca_analysis(self, arch):
         """Compile and return IACA analysis."""
+        if iaca is None:
+            raise ValueError("kerncraft not installed. IACA analysis is not supported.")
         tm = self.get_target_machine()
         tmpf = tempfile.NamedTemporaryFile("wb")
-        tmpf.write(tm.emit_object(self.get_llvm_module()))
+        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
         tmpf.flush()
-
-        # assuming "iaca.sh" to be available
-        subprocess.check_output(['objdump', tmpf.name])
-        # WORK IN PROGRESS
-
+        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
 
     def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
         # Compile the module to machine code using MCJIT
@@ -141,10 +136,11 @@ class Benchmark:
 
 
 class LoopBenchmark(Benchmark):
-    def __init__(self, root_synth, init_values=None):
+    def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
         super().__init__()
         self.root_synth = root_synth
         self.init_values = init_values or root_synth.get_default_init_values()
+        self.loop_carried_dependencies = loop_carried_dependencies
 
         if len(root_synth.get_source_registers()) != len(self.init_values):
             raise ValueError("Number of init values and source registers do not match.")
@@ -156,7 +152,9 @@ class LoopBenchmark(Benchmark):
         return ['%out.{}'.format(i) for i in
                 range(len(self.root_synth.get_destination_registers()))]
 
-    def get_phi_code(self, latency=True):
+    def get_phi_code(self):
+        if not self.loop_carried_dependencies:
+            return ''
         # Compile loop carried dependencies
         lcd = []
         # Change in naming (src <-> dst) is on purpose!
@@ -210,8 +208,19 @@ class LoopBenchmark(Benchmark):
 
 
 class IntegerLoopBenchmark(LoopBenchmark):
-    def build_ir(self):
-        return textwrap.dedent('''\
+    def build_ir(self, iaca_marker=False):
+        if iaca_marker:
+            iaca_start_marker = textwrap.dedent('''\
+                call void asm "movl    $$111,%ebx", ""()
+                call void asm ".byte   100,103,144", ""()''')
+            iaca_stop_marker = textwrap.dedent('''\
+                call void asm "movl    $$222,%ebx", ""()
+                call void asm ".byte   100,103,144", ""()''')
+        else:
+            iaca_start_marker = ''
+            iaca_stop_marker = ''
+
+        ir = textwrap.dedent('''\
             define i64 @"test"(i64 %"N")
             {{
             entry:
@@ -221,24 +230,30 @@ class IntegerLoopBenchmark(LoopBenchmark):
             loop:
               %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
             {phi}
+            {iaca_start_marker}
             {loop_body}
               %"loop_counter.1" = add i64 %"loop_counter", 1
               %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
               br i1 %"loop_cond.1", label %"loop", label %"end"
-
+            
             end:
               %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
+            {iaca_stop_marker}
               ret i64 %"ret"
             }}
             ''').format(
             loop_body=textwrap.indent(
                 self.root_synth.build_ir(self.get_destination_names(),
                                          self.get_source_names()), '  '),
-            phi=textwrap.indent(self.get_phi_code(), '  '))
+            phi=textwrap.indent(self.get_phi_code(), '  '),
+            iaca_start_marker=iaca_start_marker,
+            iaca_stop_marker=iaca_stop_marker)
+
+        return ir
 
 
 def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
-                       serialize=False, verbosity=0):
+                       serialize=False, verbosity=0, iaca_comparison=None):
     not_serializable = False
     try:
         # Latency Benchmark
@@ -261,6 +276,7 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
         result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
         lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
                     for t in result['runtimes']])
+        result['latency'] = lat
         if verbosity > 0:
             print('### Detailed Results')
             pprint(result)
@@ -294,10 +310,20 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
     tp = min(
         [(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
          for t in result['runtimes']])
+    result['throughput'] = tp
+    if iaca_comparison is not None:
+        iaca_analysis = b.get_iaca_analysis(iaca_comparison)
+        result['iaca throughput'] = iaca_analysis['throughput']/(
+                parallel_factor * throughput_serial_factor)
     if verbosity > 0:
         print('### Detailed Results')
         pprint(result)
         print()
+    if verbosity > 1 and iaca_comparison is not None:
+        print('### IACA Results')
+        print(iaca_analysis['output'])
+        print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
+            throughput_serial_factor, parallel_factor))
 
     # Result compilation
     return lat, tp
diff --git a/asmjit/op.py b/asmjit/op.py
index 418da11..f1c206b 100755
--- a/asmjit/op.py
+++ b/asmjit/op.py
@@ -205,9 +205,18 @@ class Instruction(Operation):
         self.source_operands = source_operands
 
     def get_source_registers(self):
-        return [sop for sop in self.source_operands if isinstance(sop, Register)] + \
-               [r for mop in self.source_operands if isinstance(mop, MemoryReference)
-                for r in mop.get_registers()]
+        sop_types = set()
+        sr = []
+        for sop in self.source_operands:
+            if isinstance(sop, Register):
+                t = (sop.llvm_type, sop.get_constraint_char())
+                if t not in sop_types:
+                    sop_types.add(t)
+                    sr.append(sop)
+            elif isinstance(sop, MemoryReference):
+                sr += list(sop.get_registers())
+
+        return sr
 
     def get_destination_registers(self):
         if isinstance(self.destination_operand, Register):
@@ -229,6 +238,7 @@ class Instruction(Operation):
 
         # Build argument string from operands and register names
         operands = []
+        sop_types = {}
         i = 0
         for sop in self.source_operands:
             if isinstance(sop, Immediate):
@@ -236,10 +246,17 @@ class Instruction(Operation):
                     type=sop.llvm_type,
                     repr=sop.value))
             elif isinstance(sop, Register):
-                operands.append('{type} {repr}'.format(
-                    type=sop.llvm_type,
-                    repr=src_reg_names[i]))
-                i += 1
+                sop_t = (sop.llvm_type, sop.get_constraint_char())
+                if sop_t in sop_types:
+                    operands.append('{type} {repr}'.format(
+                        type=sop.llvm_type,
+                        repr=src_reg_names[sop_types[sop_t]]))
+                else:
+                    sop_types[sop_t] = i
+                    operands.append('{type} {repr}'.format(
+                        type=sop.llvm_type,
+                        repr=src_reg_names[i]))
+                    i += 1
             elif isinstance(sop, MemoryReference):
                 operands.append('{type} {repr}'.format(
                     type=sop.llvm_type,
diff --git a/tablegen.py b/tablegen.py
index cd6cea4..4535512 100755
--- a/tablegen.py
+++ b/tablegen.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 
 import sys
+import textwrap
+
 import collections
 import re
 import itertools
@@ -466,44 +468,89 @@ def main():
     instructions_ret_type = collections.defaultdict(collections.OrderedDict)
     if args.verbosity > 0:
         for ret_type in rel_instruction_names:
-            print(ret_type, 'has', len(instrs), 'instructions')
+            print(ret_type, 'has', len(instructions_ret_type[ret_type]), 'instructions')
 
     # Benchmark random instruction sequences
     for instr_name, instr_op in instructions.items():
         instructions_ret_type[instr_op.get_destination_registers()[0].llvm_type][
             instr_name] = (instr_name, instr_op)
     # Constructing random benchmarks, one for each return type
-    random.seed(42)
-    parallel_factor = 8
-    for t in sorted(instructions_ret_type):
-        valid = False
-        while not valid:
-            selected_names, selected_instrs = zip(
-                *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
+    #random.seed(42)
+    #parallel_factor = 8
+    #for t in sorted(instructions_ret_type):
+    #    valid = False
+    #    while not valid:
+    #        selected_names, selected_instrs = zip(
+    #            *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
+    #
+    #        if not all([can_serialize(i) for i in selected_instrs]):
+    #            continue
+    #        else:
+    #            valid = True
+    #
+    #        serial = op.Serialized(selected_instrs)
+    #        p = op.Parallelized([serial] * parallel_factor)
+    #
+    #        init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
+    #                       p.get_source_registers()]
+    #        b = bench.IntegerLoopBenchmark(p, init_values)
+    #        print('## Selected Instructions')
+    #        print(', '.join(selected_names))
+    #        print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
+    #        print(b.get_assembly())
+    #        #pprint(selected_instrs)
+    #        r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
+    #        r['parallel_factor'] = parallel_factor
+    #        print('## Detailed Results')
+    #        pprint(r)
+    #        print("minimal throughput: {:.2f} cy".format(
+    #            min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
 
-            if not all([can_serialize(i) for i in selected_instrs]):
-                continue
-            else:
-                valid = True
+    # Reduce to 100 instructions:
+    #instructions = dict(list(instructions.items())[:100])
 
-            serial = op.Serialized(selected_instrs)
-            p = op.Parallelized([serial] * parallel_factor)
-
-            init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
-                           p.get_source_registers()]
-            b = bench.IntegerLoopBenchmark(p, init_values)
-            print('## Selected Instructions')
-            print(', '.join(selected_names))
-            print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
-            print(b.get_assembly())
-            #pprint(selected_instrs)
-            r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
-            r['parallel_factor'] = parallel_factor
-            print('## Detailed Results')
-            pprint(r)
-            print("minimal throughput: {:.2f} cy".format(
-                min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
+    # Reduce to set of instructions used in Stream Triad:
+    instructions = {k: v for k,v in instructions.items() if k in ['ADD32ri', 'ADD64ri32', 'INC64r', 'SUB32ri', 'VADDPDYrr', 'VADDSDrr', 'VADDSSrr', 'VCVTSI642SSrr', 'VFMADD213PDYr', 'VFMADD213PDr', 'VFMADD213PSYr', 'VFMADD213PSr', 'VFMADD213SDr', 'VFMADD213SSr', 'VINSERTF128rr', 'VMULPDYrr', 'VMULSDrr_Int', 'VMULSSrr_Int', 'VSUBSDrr_Int', 'VSUBSSrr_Int']}
 
+    random.seed(23)
+    instructions_per_run = 3
+    parallel_factor = 4
+    print(textwrap.dedent("""
+        # This file contains example data measured on an Intel I7-6700HQ with 2.6GHz with Turbo mode
+        # disabled.
+        
+        # Comments are possible everywhere after hash symbols.
+        
+        # This part contains necessary configuration information.
+        configuration:
+            model: three-level   # we assume that instructions are decomposed into uops
+            num_ports: 7         # number of execution ports of our hardware
+            num_uops_per_insn: 4 # the maximal number of uops into which an instruction can be decomposed
+            slack_limit: 0.0     # relative margin of error for cycle measurements
+        
+        
+        # Here follows a list of experiments.
+        """))
+    for i in range(100):
+        selected_names, selected_instrs = zip(*[random.choice(list(instructions.items()))
+                                                for i in range(instructions_per_run)])
+        print("experiment:")
+        p = op.Parallelized(selected_instrs*parallel_factor)
+        b = bench.IntegerLoopBenchmark(p)
+        print('    instructions:')
+        print('        '+('\n        '.join(selected_names)))
+        if args.verbosity > 0:
+            print('    ir:')
+            print(textwrap.indent(b.build_ir(), ' '*8))
+            print('    asm:')
+            print(textwrap.indent(b.get_assembly(), ' '*8))
+        r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
+        r['parallel_factor'] = parallel_factor
+        if args.verbosity > 0:
+            print('    detailed_result:')
+            pprint(r, indent=8)
+        print("    cycles: {:.2f}".format(
+            min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
 
 def can_serialize(instr):
     if not any([so.llvm_type == instr.destination_operand.llvm_type and
@@ -517,7 +564,7 @@ def can_serialize(instr):
 def combined_instructions(instructions, length):
     for instr_names in itertools.combinations(instructions, length):
         instrs = [instructions[n] for n in instr_names]
-        dst_types = list([i.get_destination_registers()[0].llvm_type ])
+        dst_types = list([i.get_destination_registers()[0].llvm_type for i in instrs])
         if not all([can_serialize(i) for i in instrs]) and dst_types[1:] == dst_types[:-1]:
             continue
         yield instrs