Files
asmbench/asmjit/bench.py
Julian Hammer 0349de9e64 first CLI :)
2018-06-26 15:11:35 +02:00

328 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
import ctypes
import time
import textwrap
import itertools
import re
from pprint import pprint
import llvmlite.binding as llvm
import psutil
from . import op
def setup_llvm():
    """Initialize LLVM and its native target, assembly printer and assembly parser.

    Call this once before building or executing any benchmark.
    """
    initializers = (
        llvm.initialize,
        llvm.initialize_native_target,
        llvm.initialize_native_asmprinter,
        llvm.initialize_native_asmparser,
    )
    # Order matters: core initialization first, then the native components.
    for initialize in initializers:
        initialize()
def uniquify(l):
    """Return a list of the items of *l* with duplicates removed, preserving order.

    The first occurrence of every item is kept. Items must be hashable.
    """
    seen = set()
    unique = []
    for item in l:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
class Benchmark:
    """Base class for benchmarks that are JIT-compiled from LLVM IR and timed.

    Subclasses must implement :meth:`build_ir`, returning LLVM IR that defines
    a function named ``test`` matching :meth:`get_function_ctype`.
    """

    def __init__(self):
        # Both are created lazily and cached, see get_target_machine() and
        # get_llvm_module().
        self._tm = None
        self._llvm_module = None

    def __repr__(self):
        # Only public attributes are shown; the cached LLVM objects are skipped.
        public = ', '.join('{}={!r}'.format(k, v) for k, v in self.__dict__.items()
                           if not k.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public)

    @staticmethod
    def prepare_arguments(previous_args=None, time_factor=1.0):
        """Build argument tuple, to be passed to low level function.

        Without *previous_args* the default of 10000000 iterations is used.
        Otherwise the previous iteration count is scaled by *time_factor*,
        but never below one iteration: a count of zero could not be scaled
        up again and would stall runtime calibration.
        """
        if previous_args is None:
            return 10000000,
        return max(1, int(previous_args[0] * time_factor)),

    @staticmethod
    def get_iterations(args):
        """Return number of iterations performed, based on lower level function arguments."""
        return args[0]

    def build_ir(self):
        """Return the LLVM IR of the benchmark as a string (implemented by subclasses)."""
        raise NotImplementedError()

    def get_llvm_module(self):
        """Build and return LLVM module from LLVM IR code."""
        if self._llvm_module is None:
            self._llvm_module = llvm.parse_assembly(self.build_ir())
            self._llvm_module.verify()
        return self._llvm_module

    def get_target_machine(self):
        """Instantiate and return target machine."""
        if self._tm is None:
            # Target the host CPU with all of its features enabled.
            features = llvm.get_host_cpu_features().flatten()
            cpu = llvm.get_host_cpu_name()
            self._tm = llvm.Target.from_default_triple().create_target_machine(
                cpu=cpu, features=features, opt=3)
        return self._tm

    def get_assembly(self):
        """Compile and return assembly from LLVM module."""
        tm = self.get_target_machine()
        tm.set_asm_verbosity(0)
        asm = tm.emit_assembly(self.get_llvm_module())
        # Remove double comments
        asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
        return asm

    def get_function_ctype(self):
        """Return the ctypes prototype of ``test``: one int64 argument, int64 result."""
        return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)

    def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
        """JIT-compile the benchmark and time it *repeat* times.

        During the first measurement the arguments are rescaled until one run
        takes between *min_elapsed* and *max_elapsed* seconds; those arguments
        are then kept for all remaining measurements. Calibration also stops
        when rescaling can no longer change the arguments.

        Returns a dict with the keys ``'iterations'``, ``'arguments'``,
        ``'runtimes'`` (seconds per measurement), ``'frequency'`` and
        ``'returned'`` (return value of each measured call).
        """
        # Compile the module to machine code using MCJIT
        tm = self.get_target_machine()
        runtimes = []
        return_values = []
        args = self.prepare_arguments()
        # Calibration aims for a runtime in the lower third of the window.
        target_elapsed = 2 / 3 * min_elapsed + 1 / 3 * max_elapsed
        with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
            ee.finalize_object()
            # Obtain a pointer to the compiled 'test' function - it's the
            # address of its JITed code in memory.
            cfptr = ee.get_function_address('test')
            # To convert an address to an actual callable thing we have to use
            # CFUNCTYPE, and specify the arguments & return type.
            cfunc = self.get_function_ctype()(cfptr)
            # Now 'cfunc' is an actual callable we can invoke
            # TODO replace time.perf_counter with a C implementation for less overhead
            # TODO return result in machine readable format
            fixed_args = False
            for _ in range(repeat):
                while True:
                    start = time.perf_counter()
                    ret = cfunc(*args)
                    end = time.perf_counter()
                    elapsed = end - start
                    if fixed_args or min_elapsed <= elapsed <= max_elapsed:
                        break
                    if elapsed > 0:
                        factor = target_elapsed / elapsed
                    else:
                        # Run was too short for the timer; grow by a fixed
                        # factor instead of dividing by zero.
                        factor = 10.0
                    new_args = self.prepare_arguments(previous_args=args, time_factor=factor)
                    if new_args == args:
                        # Rescaling has no effect anymore; accept this
                        # measurement instead of retrying forever.
                        break
                    args = new_args
                # After we have the right argument choice, we keep it.
                fixed_args = True
                return_values.append(ret)
                runtimes.append(elapsed)
        return {'iterations': self.get_iterations(args),
                'arguments': args,
                'runtimes': runtimes,
                # psutil reports the frequency in MHz; scaled by 1e6 to Hz.
                'frequency': psutil.cpu_freq().current * 1e6,
                'returned': return_values}
class LoopBenchmark(Benchmark):
    """Benchmark whose body is executed in a loop with loop-carried dependencies.

    *root_synth* must provide ``get_source_registers()`` and
    ``get_destination_registers()``; the returned registers must have an
    ``llvm_type`` attribute. *init_values* holds one initial value per source
    register.
    """

    def __init__(self, root_synth, init_values=None):
        super().__init__()
        self.root_synth = root_synth
        self.init_values = init_values or []
        if len(root_synth.get_source_registers()) != len(self.init_values):
            raise ValueError("Number of init values and source registers do not match.")

    def get_source_names(self):
        """Return the IR value names (``%in.<i>``) of all source registers."""
        return ['%in.{}'.format(i) for i in range(len(self.root_synth.get_source_registers()))]

    def get_destination_names(self):
        """Return the IR value names (``%out.<i>``) of all destination registers."""
        return ['%out.{}'.format(i) for i in
                range(len(self.root_synth.get_destination_registers()))]

    def get_phi_code(self):
        """Return the phi instructions defining every ``%in.<i>`` of the loop body.

        Each loop input is fed, where possible, from a loop output of the same
        LLVM type, which creates the loop-carried dependency. Inputs without a
        matching output get a phi that yields their init value on every
        iteration.

        Raises ValueError if not a single input could be matched to an output.
        """
        # Change in naming (src <-> dst) is on purpose! The outputs of one
        # iteration are the sources of the phi nodes, the inputs their targets.
        srcs = self.root_synth.get_destination_registers()
        dsts = self.root_synth.get_source_registers()
        in_names = self.get_source_names()
        out_names = self.get_destination_names()

        # cycle iterator is used to not only reuse a single destination, but go through all of them
        srcs_it = itertools.cycle(enumerate(srcs))
        last_match_idx = len(srcs) - 1
        # Compile loop carried dependencies as (input index, output name).
        # Tracking by index (not by register object) guarantees that every
        # input gets exactly one phi, even if a register occurs repeatedly.
        carried = []
        for dst_idx, dst in enumerate(dsts):
            for src_idx, src in srcs_it:
                if src.llvm_type == dst.llvm_type:
                    carried.append((dst_idx, out_names[src_idx]))
                    last_match_idx = src_idx
                    break
                # since srcs_it is an infinite iterator, we need to abort after a complete cycle
                if src_idx == last_match_idx:
                    break
        if not carried:
            raise ValueError("Unable to match source to any destination.")

        lines = []
        for dst_idx, src_name in carried:
            lines.append(('{dst_name} = phi {llvm_type} [{init_value}, %"entry"], '
                          '[{src_name}, %"loop"]\n').format(
                llvm_type=dsts[dst_idx].llvm_type,
                dst_name=in_names[dst_idx],
                init_value=self.init_values[dst_idx],
                src_name=src_name))

        # Add extra phi for constant values. Assuming LLVM will optimize them "away"
        carried_indices = {dst_idx for dst_idx, _ in carried}
        for dst_idx, dst in enumerate(dsts):
            if dst_idx not in carried_indices:
                lines.append(('{dst_reg} = phi {llvm_type} [{init_value}, %"entry"], '
                              '[{init_value}, %"loop"]\n').format(
                    llvm_type=dst.llvm_type,
                    dst_reg=in_names[dst_idx],
                    init_value=self.init_values[dst_idx]))
        return ''.join(lines)

    def build_ir(self):
        """Return the LLVM IR of the benchmark as a string (implemented by subclasses)."""
        raise NotImplementedError()
class IntegerLoopBenchmark(LoopBenchmark):
    """Loop benchmark driven by an i64 loop counter running from 0 up to N."""

    def build_ir(self):
        """Return LLVM IR defining ``test(i64 N)`` with the loop body run in a counted loop.

        The phi code and the IR generated by ``root_synth`` are inserted into
        the ``loop`` block of the template.
        """
        template = textwrap.dedent('''\
            define i64 @"test"(i64 %"N")
            {{
            entry:
            %"loop_cond" = icmp slt i64 0, %"N"
            br i1 %"loop_cond", label %"loop", label %"end"
            loop:
            %"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
            {phi}
            {loop_body}
            %"loop_counter.1" = add i64 %"loop_counter", 1
            %"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
            br i1 %"loop_cond.1", label %"loop", label %"end"
            end:
            %"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
            ret i64 %"ret"
            }}
            ''')
        # The loop body reads the %in.* values and defines the %out.* values.
        body_ir = self.root_synth.build_ir(self.get_destination_names(),
                                           self.get_source_names())
        body_ir = textwrap.indent(body_ir, ' ')
        # The phi nodes define the %in.* values from init values and %out.* values.
        phi_ir = textwrap.indent(self.get_phi_code(), ' ')
        return template.format(loop_body=body_ir, phi=phi_ir)
def _run_loop_benchmark(synth, verbosity):
    """Build an IntegerLoopBenchmark around *synth*, run it and return its result dict."""
    init_values = [op.init_value_by_llvm_type[reg.llvm_type]
                   for reg in synth.get_source_registers()]
    b = IntegerLoopBenchmark(synth, init_values)
    if verbosity >= 3:
        print('### LLVM IR')
        print(b.build_ir())
    if verbosity >= 2:
        print('### Assembly')
        print(b.get_assembly())
    return b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)


def _print_detailed_results(result):
    """Pretty-print a benchmark result dict followed by an empty line."""
    print('### Detailed Results')
    pprint(result)
    print()


def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
                       verbosity=0):
    """Measure latency and throughput of *instructions*.

    Latency benchmark: every instruction is repeated *serial_factor* times in
    a serial chain; the chains of all instructions run in parallel.

    Throughput benchmark: every instruction is repeated
    *throughput_serial_factor* times in a serial chain and the resulting set
    of chains is replicated *parallel_factor* times.

    *verbosity* controls printing: > 0 prints headings and detailed results,
    >= 2 additionally the assembly, >= 3 additionally the LLVM IR.

    Returns the tuple ``(lat, tp)``. Each value is the smallest measured
    runtime, multiplied by the reported frequency and divided by the number
    of loop iterations and by the respective repetition factors.
    """
    # Latency Benchmark
    if verbosity > 0:
        print('## Latency Benchmark')
    p = op.Parallelized([op.Serialized([i] * serial_factor) for i in instructions])
    result = _run_loop_benchmark(p, verbosity)
    # min() takes the iterable directly; unpacking it would fail for a
    # single measurement.
    lat = min((t / serial_factor) * result['frequency'] / result['iterations']
              for t in result['runtimes'])
    if verbosity > 0:
        _print_detailed_results(result)

    # Throughput Benchmark
    if verbosity > 0:
        print('## Throughput Benchmark')
    p_instrs = [op.Serialized([i] * throughput_serial_factor) for i in instructions]
    p = op.Parallelized(p_instrs * parallel_factor)
    result = _run_loop_benchmark(p, verbosity)
    tp = min((t / throughput_serial_factor / parallel_factor)
             * result['frequency'] / result['iterations']
             for t in result['runtimes'])
    if verbosity > 0:
        _print_detailed_results(result)

    # Result compilation
    return lat, tp
if __name__ == '__main__':
    # Demo: build a small synthetic benchmark, print its IR and assembly, run
    # it, and finally measure latency and throughput of a single add.
    setup_llvm()

    def _i64_instruction(mnemonic, second_source):
        """Build ``<mnemonic> $2, $0`` on i64 with a tied register as first source."""
        return op.Instruction(
            instruction='{} $2, $0'.format(mnemonic),
            destination_operand=op.Register('i64', 'r'),
            source_operands=[op.Register('i64', '0'), second_source])

    i1 = _i64_instruction('add', op.Immediate('i64', '1'))
    i2 = _i64_instruction('sub', op.Immediate('i64', '1'))
    i3 = _i64_instruction('add', op.Register('i64', 'r'))
    i4 = _i64_instruction('sub', op.Immediate('i64', '23'))
    i5 = _i64_instruction('add', op.Immediate('i64', '23'))
    i6 = _i64_instruction('add', op.Register('i64', 'r'))

    # Three parallel strands: i6 alone, (i1, i2) followed by i3, and i4 followed by i5.
    s1 = op.Serialized([i1, i2])
    s2 = op.Serialized([s1, i3])
    s3 = op.Serialized([i4, i5])
    p1 = op.Parallelized([i6, s2, s3])

    init_values = ['1' for r in p1.get_source_registers()]
    b = IntegerLoopBenchmark(p1, init_values)
    print(b.build_ir())
    print(b.get_assembly())
    print(b.build_and_execute())

    print(bench_instructions([_i64_instruction('add', op.Immediate('i64', '1'))]))
# if len(s.get_source_operand_types())
# b = IntegerLoopBenchmark(loop_body,
# [(type_, dst_reg, '1', src_reg)
# # for type_, dst_reg, src_reg in zip(s.get_last_destination_type(), )])
# print(b.get_ir())