mirror of
https://github.com/RRZE-HPC/asmbench.git
synced 2026-01-06 20:40:07 +01:00
first CLI :)
This commit is contained in:
0
asmjit/__init__.py
Normal file
0
asmjit/__init__.py
Normal file
33
asmjit/__main__.py
Normal file
33
asmjit/__main__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
from . import op, bench
|
||||
|
||||
|
||||
def main():
    """Command line entry point of the benchmark toolkit.

    Parses the instruction declarations and the benchmark shape options from
    the command line, initializes LLVM through ``bench.setup_llvm`` and prints
    the latency and throughput values returned by
    ``bench.bench_instructions``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Assembly Instruction Benchmark Toolkit')
    # arg_parser.add_argument('mode', metavar='MODE', type=str,
    #                         choices=['latency', 'throughput'])
    arg_parser.add_argument(
        'instructions',
        metavar='INSTR',
        type=op.Instruction.from_string,
        nargs='+',
        help='instruction declaration, e.g., "add {src:i32:r} {srcdst:i32:r}"')
    arg_parser.add_argument(
        '--latency-serial', '-l',
        type=int,
        default=8,
        help='length of serial chain for each instruction in latency benchmark')
    arg_parser.add_argument(
        '--parallel', '-p',
        type=int,
        default=4,
        help='number of parallel instances of serial chains in throughput '
             'benchmark')
    arg_parser.add_argument(
        '--throughput-serial', '-t',
        type=int,
        default=8,
        help='length of serial instances of serial chains in throughput benchmark')
    arg_parser.add_argument(
        "--verbose", "-v",
        action="count",
        default=0,
        help="increase output verbosity")
    options = arg_parser.parse_args()

    bench.setup_llvm()

    # Map the command line options onto the keyword arguments of the benchmark driver.
    benchmark_settings = {
        'serial_factor': options.latency_serial,
        'parallel_factor': options.parallel,
        'throughput_serial_factor': options.throughput_serial,
        'verbosity': options.verbose,
    }
    lat, tp = bench.bench_instructions(options.instructions, **benchmark_settings)
    print("Latency: {}\nThroughput: {}\n".format(lat, tp))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -3,11 +3,20 @@ import ctypes
|
||||
import time
|
||||
import textwrap
|
||||
import itertools
|
||||
import re
|
||||
from pprint import pprint
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
import psutil
|
||||
|
||||
from asmjit import op
|
||||
from . import op
|
||||
|
||||
|
||||
def setup_llvm():
    """Run the llvmlite initialization routines needed by the benchmarks.

    Calls, in this order, ``initialize``, ``initialize_native_target``,
    ``initialize_native_asmprinter`` and ``initialize_native_asmparser`` of
    ``llvmlite.binding``.
    """
    initializer_names = (
        'initialize',
        'initialize_native_target',
        'initialize_native_asmprinter',
        'initialize_native_asmparser',
    )
    for initializer_name in initializer_names:
        # Look up and call one routine at a time to keep the original call order.
        getattr(llvm, initializer_name)()
|
||||
|
||||
|
||||
def uniquify(l):
|
||||
@@ -31,7 +40,7 @@ class Benchmark:
|
||||
def prepare_arguments(previous_args=None, time_factor=1.0):
    """Build argument tuple, to be passed to low level function.

    :param previous_args: argument tuple returned by an earlier call, or None
                          when no arguments have been chosen yet
    :param time_factor: factor by which the first previous argument is scaled
    :return: one-element tuple; the start value if previous_args is None,
             otherwise the first previous argument multiplied by time_factor
             and converted with int()
    """
    if previous_args is None:
        # Start value used when there are no previous arguments to scale.
        return 10000000,
    return int(previous_args[0] * time_factor),
|
||||
|
||||
@@ -63,7 +72,10 @@ class Benchmark:
|
||||
"""Compile and return assembly from LLVM module."""
|
||||
tm = self.get_target_machine()
|
||||
tm.set_asm_verbosity(0)
|
||||
return tm.emit_assembly(self.get_llvm_module())
|
||||
asm = tm.emit_assembly(self.get_llvm_module())
|
||||
# Remove double comments
|
||||
asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
|
||||
return asm
|
||||
|
||||
def get_function_ctype(self):
    """Return the ctypes prototype taking one c_int64 and returning a c_int64."""
    int64 = ctypes.c_int64
    # CFUNCTYPE takes the result type first, followed by the argument types.
    return ctypes.CFUNCTYPE(int64, int64)
|
||||
@@ -72,6 +84,7 @@ class Benchmark:
|
||||
# Compile the module to machine code using MCJIT
|
||||
tm = self.get_target_machine()
|
||||
runtimes = []
|
||||
return_values = []
|
||||
args = self.prepare_arguments()
|
||||
with llvm.create_mcjit_compiler(self.get_llvm_module(), tm) as ee:
|
||||
ee.finalize_object()
|
||||
@@ -88,11 +101,10 @@ class Benchmark:
|
||||
# TODO replace time.perf_counter with a C implementation for less overhead
|
||||
# TODO return result in machine readable format
|
||||
fixed_args = False
|
||||
results = []
|
||||
for i in range(repeat):
|
||||
while True:
|
||||
start = time.perf_counter()
|
||||
results.append(cfunc(*args))
|
||||
ret = cfunc(*args)
|
||||
end = time.perf_counter()
|
||||
elapsed = end - start
|
||||
if not fixed_args and (elapsed < min_elapsed or elapsed > max_elapsed):
|
||||
@@ -104,13 +116,14 @@ class Benchmark:
|
||||
# After we have the right argument choice, we keep it.
|
||||
fixed_args = True
|
||||
break
|
||||
return_values.append(ret)
|
||||
runtimes.append(elapsed)
|
||||
|
||||
return {'iterations': self.get_iterations(args),
|
||||
'arguments': args,
|
||||
'runtimes': runtimes,
|
||||
'frequency': psutil.cpu_freq().current * 1e6,
|
||||
'results': results}
|
||||
'returned': return_values}
|
||||
|
||||
|
||||
class LoopBenchmark(Benchmark):
|
||||
@@ -210,63 +223,62 @@ class IntegerLoopBenchmark(LoopBenchmark):
|
||||
phi=textwrap.indent(self.get_phi_code(), ' '))
|
||||
|
||||
|
||||
def bench_instruction(instruction, serial_factor=8, parallel_factor=8, parallel_serial_factor=8):
    """Benchmark latency and throughput of a single instruction.

    :param instruction: operation to benchmark; it is repeated inside
                        ``op.Serialized`` chains
    :param serial_factor: number of copies in the serial chain of the latency
                          benchmark
    :param parallel_factor: number of serial chains combined with
                            ``op.Parallelized`` in the throughput benchmark
    :param parallel_serial_factor: number of copies in each serial chain of
                                   the throughput benchmark
    :return: tuple ``(lat, tp)``; each is the minimum over all measured
             runtimes of ``runtime * frequency / iterations`` divided by the
             number of instruction copies per chain (and, for ``tp``, by the
             number of chains)
    """
    # Latency Benchmark
    s = op.Serialized([instruction] * serial_factor)
    init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in s.get_source_registers()]
    b = IntegerLoopBenchmark(s, init_values)
    result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
    # min() over an iterable also works for a single runtime, unlike min(*[...]).
    lat = min((t / serial_factor) * result['frequency'] / result['iterations']
              for t in result['runtimes'])

    # Throughput Benchmark
    p = op.Parallelized([op.Serialized([instruction] * parallel_serial_factor)] * parallel_factor)
    init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
                   p.get_source_registers()]
    b = IntegerLoopBenchmark(p, init_values)
    result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
    tp = min(
        (t / parallel_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
        for t in result['runtimes'])

    # Result compilation
    return lat, tp
|
||||
|
||||
|
||||
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
                       verbosity=0):
    """Benchmark latency and throughput of a group of instructions.

    For the latency benchmark every instruction is repeated ``serial_factor``
    times in its own ``op.Serialized`` chain and all chains are combined with
    ``op.Parallelized``. For the throughput benchmark every instruction is
    repeated ``throughput_serial_factor`` times per chain and the list of
    chains is repeated ``parallel_factor`` times.

    :param instructions: iterable of operations to benchmark together
    :param serial_factor: chain length in the latency benchmark
    :param parallel_factor: number of repetitions of the chain list in the
                            throughput benchmark
    :param throughput_serial_factor: chain length in the throughput benchmark
    :param verbosity: 0 prints nothing; > 0 prints section headers and the
                      detailed result dictionaries; >= 2 additionally prints
                      the assembly; >= 3 additionally prints the LLVM IR
    :return: tuple ``(lat, tp)``; each is the minimum over all measured
             runtimes of ``runtime * frequency / iterations`` divided by the
             chain length (and, for ``tp``, by ``parallel_factor``)
    """
    # Latency Benchmark
    if verbosity > 0:
        print('## Latency Benchmark')
    p = op.Parallelized([op.Serialized([i] * serial_factor) for i in instructions])
    result = _build_and_measure(p, verbosity)
    # min() over an iterable also works for a single runtime, unlike min(*[...]).
    lat = min((t / serial_factor) * result['frequency'] / result['iterations']
              for t in result['runtimes'])
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()

    # Throughput Benchmark
    if verbosity > 0:
        print('## Throughput Benchmark')
    p_instrs = [op.Serialized([i] * throughput_serial_factor) for i in instructions]
    p = op.Parallelized(p_instrs * parallel_factor)
    result = _build_and_measure(p, verbosity)
    tp = min(
        (t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
        for t in result['runtimes'])
    if verbosity > 0:
        print('### Detailed Results')
        pprint(result)
        print()

    # Result compilation
    return lat, tp


def _build_and_measure(operation, verbosity):
    """Wrap *operation* in an IntegerLoopBenchmark, print its code if requested, and run it."""
    init_values = [op.init_value_by_llvm_type[reg.llvm_type]
                   for reg in operation.get_source_registers()]
    b = IntegerLoopBenchmark(operation, init_values)
    if verbosity >= 3:
        print('### LLVM IR')
        print(b.build_ir())
    if verbosity >= 2:
        print('### Assembly')
        print(b.get_assembly())
    return b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
llvm.initialize()
|
||||
llvm.initialize_native_target()
|
||||
llvm.initialize_native_asmprinter()
|
||||
llvm.initialize_native_asmparser()
|
||||
setup_llvm()
|
||||
|
||||
i1 = op.Instruction(
|
||||
instruction='add $2, $0',
|
||||
@@ -303,10 +315,10 @@ if __name__ == '__main__':
|
||||
print(b.get_assembly())
|
||||
print(b.build_and_execute())
|
||||
|
||||
print(bench_instruction(op.Instruction(
|
||||
print(bench_instructions([op.Instruction(
|
||||
instruction='add $2, $0',
|
||||
destination_operand=op.Register('i64', 'r'),
|
||||
source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])))
|
||||
source_operands=[op.Register('i64', '0'), op.Immediate('i64', '1')])]))
|
||||
|
||||
# if len(s.get_source_operand_types())
|
||||
# b = IntegerLoopBenchmark(loop_body,
|
||||
|
||||
50
asmjit/op.py
50
asmjit/op.py
@@ -1,10 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
|
||||
# TODO use abc to force implementation of interface requirements
|
||||
|
||||
init_value_by_llvm_type = {'i' + bits: '1' for bits in ['1', '8', '16', '32', '64']}
|
||||
init_value_by_llvm_type.update({fp_type: '1.0' for fp_type in ['float', 'double', 'fp128']})
|
||||
init_value_by_llvm_type = {'i' + bits: '3' for bits in ['1', '8', '16', '32', '64']}
|
||||
init_value_by_llvm_type.update({fp_type: '1.00023' for fp_type in ['float', 'double', 'fp128']})
|
||||
init_value_by_llvm_type.update(
|
||||
{'<{} x {}>'.format(vec, t): '<' + ', '.join([t + ' ' + v] * vec) + '>'
|
||||
for t, v in init_value_by_llvm_type.items()
|
||||
@@ -191,35 +191,37 @@ class Instruction(Operation):
|
||||
Create Instruction object from string.
|
||||
|
||||
:param s: must have the form:
|
||||
"asm_instruction_name ( (src|dst|srcdst):llvm_type:constraint_char)+"
|
||||
"asm_instruction_name ({(src|dst|srcdst):llvm_type:constraint_char})+"
|
||||
"""
|
||||
instruction, operands = s.split(maxsplit=1)
|
||||
instruction = s
|
||||
# It is important that the match objects are in reverse order, to allow string replacements
|
||||
# based on original match group locations
|
||||
operands = list(reversed(list(re.finditer(r"\{((?:src|dst)+):([^\}]+)\}", s))))
|
||||
# Destination indices start at 0, source indices at "number of destination operands"
|
||||
dst_index, src_index = 0, ['dst' in o.group(1) for o in operands].count(True)
|
||||
dst_ops = []
|
||||
src_ops = []
|
||||
src_index = list(['dst' in o.split()[0] for o in operands.split()]).count(True)
|
||||
dst_index = 0
|
||||
for o in operands.split():
|
||||
direction, reg_options = o.split(':', maxsplit=1)
|
||||
r = Register.from_string(reg_options)
|
||||
valid = False
|
||||
for m in operands:
|
||||
direction, register_string = m.group(1, 2)
|
||||
register = Register.from_string(register_string)
|
||||
if 'src' in direction and not 'dst' in direction:
|
||||
valid = True
|
||||
src_ops.append(r)
|
||||
instruction += " ${}".format(src_index)
|
||||
src_ops.append(register)
|
||||
# replace with index string
|
||||
instruction = (instruction[:m.start()] + "${}".format(src_index)
|
||||
+ instruction[m.end():])
|
||||
src_index += 1
|
||||
if 'dst' in direction:
|
||||
valid = True
|
||||
dst_ops.append(r)
|
||||
instruction += " ${}".format(dst_index)
|
||||
dst_ops.append(register)
|
||||
# replace with index string
|
||||
instruction = (instruction[:m.start()] + "${}".format(dst_index)
|
||||
+ instruction[m.end():])
|
||||
if 'src' in direction:
|
||||
src_ops.append(Register(reg_options.split(':', 1)[0], str(dst_index)))
|
||||
src_ops.append(Register(register_string.split(':', 1)[0], str(dst_index)))
|
||||
dst_index += 1
|
||||
if not valid:
|
||||
raise ValueError("Invalid direction '{}', may only be src, dst or srcdst.".format(
|
||||
direction))
|
||||
assert len(dst_ops) == 1, "Instruction supports only single destinations."
|
||||
return cls(instruction, dst_ops[0], src_ops)
|
||||
|
||||
if len(dst_ops) != 1:
|
||||
raise ValueError("Instruction supports only single destinations.")
|
||||
return cls(instruction, dst_ops[0], src_ops)
|
||||
|
||||
|
||||
class Load(Operation):
|
||||
@@ -404,4 +406,4 @@ if __name__ == '__main__':
|
||||
s4 = Serialized([i1, i2, i3, i4, i5, i6])
|
||||
print(s4.build_ir(['%out'], ['%in']), '\n')
|
||||
|
||||
print(Instruction.from_string("add src:i64:r srcdst:i64:r"))
|
||||
print(Instruction.from_string("add {src:i64:r} {srcdst:i64:r}"))
|
||||
|
||||
2
setup.py
2
setup.py
@@ -10,5 +10,5 @@ setup(
|
||||
author='Julian Hammer',
|
||||
author_email='julian.hammer@u-sys.org',
|
||||
description='A Benchmark Toolkit for Assembly Instructions Using the LLVM JIT',
|
||||
install_requires=['llvmlite>=0.23.2'],
|
||||
install_requires=['llvmlite>=0.23.2', 'psutil'],
|
||||
)
|
||||
|
||||
@@ -443,7 +443,7 @@ def main():
|
||||
|
||||
# Benchmark TP and Lat for each instruction
|
||||
# for instr_name, instr_op in instructions.items():
|
||||
# tp, lat = bench.bench_instruction(instr_op)
|
||||
# tp, lat = bench.bench_instructions([instr_op])
|
||||
# print("{:>12} {:>5.2f} {:>5.2f}".format(instr_name, tp, lat))
|
||||
|
||||
# Benchmark TP and Lat for all valid instruction pairs
|
||||
|
||||
Reference in New Issue
Block a user