added support for iaca analysis

This commit is contained in:
Julian Hammer
2018-07-10 14:26:19 +02:00
parent 3dac7d6795
commit 6786c399ad
4 changed files with 166 additions and 73 deletions

View File

@@ -1,6 +1,8 @@
#!/usr/bin/env python3
import argparse
import llvmlite.binding as llvm
from . import op, bench
@@ -18,6 +20,9 @@ def main():
'benchmark')
parser.add_argument('--throughput-serial', '-t', type=int, default=8,
help='length of serial instances of serial chains in throughput benchmark')
parser.add_argument('--iaca', type=str, default=None,
help='Compare throughput measurement with IACA analysis, pass '
'micro-architecuture abbreviation. (i.e. SNB, IVB, HSW, SKL, SKX)')
parser.add_argument("--verbose", "-v", action="count", default=0,
help="increase output verbosity")
args = parser.parse_args()
@@ -28,12 +33,10 @@ def main():
parallel_factor=args.parallel,
throughput_serial_factor=args.throughput_serial,
serialize=args.serialize,
verbosity=args.verbose)
verbosity=args.verbose,
iaca_comparison=args.iaca)
print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
#b = bench.IntegerLoopBenchmark(args.instructions[0])
#b.get_iaca_analysis()
if __name__ == "__main__":
main()

View File

@@ -7,9 +7,14 @@ import re
from pprint import pprint
import tempfile
import subprocess
import sys
import llvmlite.binding as llvm
import psutil
try:
from kerncraft import iaca
except ImportError:
iaca = None
from . import op
@@ -28,10 +33,6 @@ def uniquify(l):
class Benchmark:
def __init__(self):
self._tm = None
self._llvm_module = None
def __repr__(self):
return '{}({})'.format(
self.__class__.__name__,
@@ -47,34 +48,30 @@ class Benchmark:
return int(previous_args[0] * time_factor),
@staticmethod
def get_iterations(args):
def get_iterations(args) -> int:
"""Return number of iterations performed, based on lower level function arguments."""
return args[0]
def build_ir(self, iaca_marker=False):
    """Return the LLVM IR of this benchmark as a string; subclasses must override.

    :param iaca_marker: accepted (and ignored here) because get_llvm_module() always
                        passes it by keyword; without it this stub would fail with a
                        TypeError instead of the intended NotImplementedError.
    :raises NotImplementedError: always, this base implementation builds nothing.
    """
    raise NotImplementedError()
def get_llvm_module(self):
def get_llvm_module(self, iaca_marker=False):
"""Build and return LLVM module from LLVM IR code."""
if self._llvm_module is None:
self._llvm_module = llvm.parse_assembly(self.build_ir())
self._llvm_module.verify()
return self._llvm_module
ir = self.build_ir(iaca_marker=iaca_marker)
return llvm.parse_assembly(ir)
def get_target_machine(self):
"""Instantiate and return target machine."""
if self._tm is None:
features = llvm.get_host_cpu_features().flatten()
cpu = llvm.get_host_cpu_name()
self._tm = llvm.Target.from_default_triple().create_target_machine(
cpu=cpu, features=features, opt=3)
return self._tm
features = llvm.get_host_cpu_features().flatten()
cpu = llvm.get_host_cpu_name()
return llvm.Target.from_default_triple().create_target_machine(
cpu=cpu, features=features, opt=3)
def get_assembly(self):
def get_assembly(self, iaca_marker=False):
"""Compile and return assembly from LLVM module."""
tm = self.get_target_machine()
tm.set_asm_verbosity(0)
asm = tm.emit_assembly(self.get_llvm_module())
asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
# Remove double comments
asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
return asm
@@ -82,17 +79,15 @@ class Benchmark:
def get_function_ctype(self):
    """Return the ctypes prototype used to call the benchmark function.

    The prototype takes a single 64-bit integer argument and returns a 64-bit integer.
    """
    result_type = ctypes.c_int64
    argument_types = (ctypes.c_int64,)
    return ctypes.CFUNCTYPE(result_type, *argument_types)
def get_iaca_analysis(self, arch):
    """Compile the benchmark with IACA markers and return the IACA analysis.

    :param arch: micro-architecture identifier, passed through unchanged to
                 ``iaca.iaca_analyse_instrumented_binary``.
    :return: whatever ``iaca.iaca_analyse_instrumented_binary`` returns; callers in
             this file read its ``'throughput'`` and ``'output'`` entries.
    :raises ValueError: if the ``iaca`` module from kerncraft could not be imported.
    """
    if iaca is None:
        raise ValueError("kerncraft not installed. IACA analysis is not supported.")

    tm = self.get_target_machine()
    # The with-block closes the temporary object file on every exit path, including
    # when emitting the object code or the analysis itself raises.
    with tempfile.NamedTemporaryFile("wb") as tmpf:
        tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
        # The analysis is handed the file *name*, so the buffered bytes must be on
        # disk before it runs.
        tmpf.flush()
        return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
# Compile the module to machine code using MCJIT
@@ -141,10 +136,11 @@ class Benchmark:
class LoopBenchmark(Benchmark):
def __init__(self, root_synth, init_values=None):
def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
super().__init__()
self.root_synth = root_synth
self.init_values = init_values or root_synth.get_default_init_values()
self.loop_carried_dependencies = loop_carried_dependencies
if len(root_synth.get_source_registers()) != len(self.init_values):
raise ValueError("Number of init values and source registers do not match.")
@@ -156,7 +152,9 @@ class LoopBenchmark(Benchmark):
return ['%out.{}'.format(i) for i in
range(len(self.root_synth.get_destination_registers()))]
def get_phi_code(self, latency=True):
def get_phi_code(self):
if not self.loop_carried_dependencies:
return ''
# Compile loop carried dependencies
lcd = []
# Change in naming (src <-> dst) is on purpose!
@@ -210,8 +208,19 @@ class LoopBenchmark(Benchmark):
class IntegerLoopBenchmark(LoopBenchmark):
def build_ir(self):
return textwrap.dedent('''\
def build_ir(self, iaca_marker=False):
if iaca_marker:
iaca_start_marker = textwrap.dedent('''\
call void asm "movl $$111,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')
iaca_stop_marker = textwrap.dedent('''\
call void asm "movl $$222,%ebx", ""()
call void asm ".byte 100,103,144", ""()''')
else:
iaca_start_marker = ''
iaca_stop_marker = ''
ir = textwrap.dedent('''\
define i64 @"test"(i64 %"N")
{{
entry:
@@ -221,24 +230,30 @@ class IntegerLoopBenchmark(LoopBenchmark):
loop:
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
{phi}
{iaca_start_marker}
{loop_body}
%"loop_counter.1" = add i64 %"loop_counter", 1
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
br i1 %"loop_cond.1", label %"loop", label %"end"
end:
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
{iaca_stop_marker}
ret i64 %"ret"
}}
''').format(
loop_body=textwrap.indent(
self.root_synth.build_ir(self.get_destination_names(),
self.get_source_names()), ' '),
phi=textwrap.indent(self.get_phi_code(), ' '))
phi=textwrap.indent(self.get_phi_code(), ' '),
iaca_start_marker=iaca_start_marker,
iaca_stop_marker=iaca_stop_marker)
return ir
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
serialize=False, verbosity=0):
serialize=False, verbosity=0, iaca_comparison=None):
not_serializable = False
try:
# Latency Benchmark
@@ -261,6 +276,7 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
for t in result['runtimes']])
result['latency'] = lat
if verbosity > 0:
print('### Detailed Results')
pprint(result)
@@ -294,10 +310,20 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
tp = min(
[(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
for t in result['runtimes']])
result['throughput'] = tp
if iaca_comparison is not None:
iaca_analysis = b.get_iaca_analysis(iaca_comparison)
result['iaca throughput'] = iaca_analysis['throughput']/(
parallel_factor * throughput_serial_factor)
if verbosity > 0:
print('### Detailed Results')
pprint(result)
print()
if verbosity > 1 and iaca_comparison is not None:
print('### IACA Results')
print(iaca_analysis['output'])
print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
throughput_serial_factor, parallel_factor))
# Result compilation
return lat, tp

View File

@@ -205,9 +205,18 @@ class Instruction(Operation):
self.source_operands = source_operands
def get_source_registers(self):
return [sop for sop in self.source_operands if isinstance(sop, Register)] + \
[r for mop in self.source_operands if isinstance(mop, MemoryReference)
for r in mop.get_registers()]
sop_types = set()
sr = []
for sop in self.source_operands:
if isinstance(sop, Register):
t = (sop.llvm_type, sop.get_constraint_char())
if t not in sop_types:
sop_types.add(t)
sr.append(sop)
elif isinstance(sop, MemoryReference):
sr += list(sop.get_registers())
return sr
def get_destination_registers(self):
if isinstance(self.destination_operand, Register):
@@ -229,6 +238,7 @@ class Instruction(Operation):
# Build argument string from operands and register names
operands = []
sop_types = {}
i = 0
for sop in self.source_operands:
if isinstance(sop, Immediate):
@@ -236,10 +246,17 @@ class Instruction(Operation):
type=sop.llvm_type,
repr=sop.value))
elif isinstance(sop, Register):
operands.append('{type} {repr}'.format(
type=sop.llvm_type,
repr=src_reg_names[i]))
i += 1
sop_t = (sop.llvm_type, sop.get_constraint_char())
if sop_t in sop_types:
operands.append('{type} {repr}'.format(
type=sop.llvm_type,
repr=src_reg_names[sop_types[sop_t]]))
else:
sop_types[sop_t] = i
operands.append('{type} {repr}'.format(
type=sop.llvm_type,
repr=src_reg_names[i]))
i += 1
elif isinstance(sop, MemoryReference):
operands.append('{type} {repr}'.format(
type=sop.llvm_type,

View File

@@ -1,6 +1,8 @@
#!/usr/bin/env python3
import sys
import textwrap
import collections
import re
import itertools
@@ -466,44 +468,89 @@ def main():
instructions_ret_type = collections.defaultdict(collections.OrderedDict)
if args.verbosity > 0:
for ret_type in rel_instruction_names:
print(ret_type, 'has', len(instrs), 'instructions')
print(ret_type, 'has', len(instructions_ret_type[ret_type]), 'instructions')
# Benchmark random instruction sequences
for instr_name, instr_op in instructions.items():
instructions_ret_type[instr_op.get_destination_registers()[0].llvm_type][
instr_name] = (instr_name, instr_op)
# Constructing random benchmarks, one for each return type
random.seed(42)
parallel_factor = 8
for t in sorted(instructions_ret_type):
valid = False
while not valid:
selected_names, selected_instrs = zip(
*[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
#random.seed(42)
#parallel_factor = 8
#for t in sorted(instructions_ret_type):
# valid = False
# while not valid:
# selected_names, selected_instrs = zip(
# *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
#
# if not all([can_serialize(i) for i in selected_instrs]):
# continue
# else:
# valid = True
#
# serial = op.Serialized(selected_instrs)
# p = op.Parallelized([serial] * parallel_factor)
#
# init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
# p.get_source_registers()]
# b = bench.IntegerLoopBenchmark(p, init_values)
# print('## Selected Instructions')
# print(', '.join(selected_names))
# print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
# print(b.get_assembly())
# #pprint(selected_instrs)
# r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
# r['parallel_factor'] = parallel_factor
# print('## Detailed Results')
# pprint(r)
# print("minimal throughput: {:.2f} cy".format(
# min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
if not all([can_serialize(i) for i in selected_instrs]):
continue
else:
valid = True
# Reduce to 100 instructions:
#instructions = dict(list(instructions.items())[:100])
serial = op.Serialized(selected_instrs)
p = op.Parallelized([serial] * parallel_factor)
init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
p.get_source_registers()]
b = bench.IntegerLoopBenchmark(p, init_values)
print('## Selected Instructions')
print(', '.join(selected_names))
print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
print(b.get_assembly())
#pprint(selected_instrs)
r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
r['parallel_factor'] = parallel_factor
print('## Detailed Results')
pprint(r)
print("minimal throughput: {:.2f} cy".format(
min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
# Reduce to set of instructions used in Stream Triad:
instructions = {k: v for k,v in instructions.items() if k in ['ADD32ri', 'ADD64ri32', 'INC64r', 'SUB32ri', 'VADDPDYrr', 'VADDSDrr', 'VADDSSrr', 'VCVTSI642SSrr', 'VFMADD213PDYr', 'VFMADD213PDr', 'VFMADD213PSYr', 'VFMADD213PSr', 'VFMADD213SDr', 'VFMADD213SSr', 'VINSERTF128rr', 'VMULPDYrr', 'VMULSDrr_Int', 'VMULSSrr_Int', 'VSUBSDrr_Int', 'VSUBSSrr_Int']}
random.seed(23)
instructions_per_run = 3
parallel_factor = 4
print(textwrap.dedent("""
# This file contains example data measured on an Intel I7-6700HQ with 2.6GHz with Turbo mode
# disabled.
# Comments are possible everywhere after hash symbols.
# This part contains necessary configuration information.
configuration:
model: three-level # we assume that instructions are decomposed into uops
num_ports: 7 # our hardware has 4 execution ports
num_uops_per_insn: 4 # the maximal number of uops into which an instruction can be decomposed
slack_limit: 0.0 # relative margin of error for cycle measurements
# Here follows a list of experiments.
"""))
for i in range(100):
selected_names, selected_instrs = zip(*[random.choice(list(instructions.items()))
for i in range(instructions_per_run)])
print("experiment:")
p = op.Parallelized(selected_instrs*parallel_factor)
b = bench.IntegerLoopBenchmark(p)
print(' instructions:')
print(' '+('\n '.join(selected_names)))
if args.verbosity > 0:
print(' ir:')
print(textwrap.indent(b.build_ir(), ' '*8))
print(' asm:')
print(textwrap.indent(b.get_assembly(), ' '*8))
r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
r['parallel_factor'] = parallel_factor
if args.verbosity > 0:
print(' detailed_result:')
pprint(r, indent=8)
print(" cycles: {:.2f}".format(
min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
def can_serialize(instr):
if not any([so.llvm_type == instr.destination_operand.llvm_type and
@@ -517,7 +564,7 @@ def can_serialize(instr):
def combined_instructions(instructions, length):
    """Yield combinations of instructions that all serialize and share a destination type.

    :param instructions: mapping from instruction name to instruction object.
    :param length: number of instructions per combination.
    :return: generator of lists holding ``length`` instruction objects each.
    """
    for instr_names in itertools.combinations(instructions, length):
        instrs = [instructions[n] for n in instr_names]
        dst_types = [i.get_destination_registers()[0].llvm_type for i in instrs]
        # Comparing the list against itself shifted by one is an "all elements equal" test.
        same_dst_type = dst_types[1:] == dst_types[:-1]
        # NOTE(review): this used to read `not all(...) and same_dst_type`, which binds
        # as `(not all(...)) and same_dst_type` and therefore let combinations with
        # mixed destination types through. Skipping unless BOTH conditions hold is
        # presumably what was meant -- confirm against the callers.
        if not (all(can_serialize(i) for i in instrs) and same_dst_type):
            continue
        yield instrs