mirror of
https://github.com/RRZE-HPC/asmbench.git
synced 2026-01-07 21:10:05 +01:00
added support for iaca analysis
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
|
||||
from . import op, bench
|
||||
|
||||
|
||||
@@ -18,6 +20,9 @@ def main():
|
||||
'benchmark')
|
||||
parser.add_argument('--throughput-serial', '-t', type=int, default=8,
|
||||
help='length of serial instances of serial chains in throughput benchmark')
|
||||
parser.add_argument('--iaca', type=str, default=None,
|
||||
help='Compare throughput measurement with IACA analysis, pass '
|
||||
'micro-architecture abbreviation. (e.g. SNB, IVB, HSW, SKL, SKX)')
|
||||
parser.add_argument("--verbose", "-v", action="count", default=0,
|
||||
help="increase output verbosity")
|
||||
args = parser.parse_args()
|
||||
@@ -28,12 +33,10 @@ def main():
|
||||
parallel_factor=args.parallel,
|
||||
throughput_serial_factor=args.throughput_serial,
|
||||
serialize=args.serialize,
|
||||
verbosity=args.verbose)
|
||||
verbosity=args.verbose,
|
||||
iaca_comparison=args.iaca)
|
||||
print("Latency: {:.2f} cycle\nThroughput: {:.2f} cycle\n".format(lat, tp))
|
||||
|
||||
#b = bench.IntegerLoopBenchmark(args.instructions[0])
|
||||
#b.get_iaca_analysis()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -7,9 +7,14 @@ import re
|
||||
from pprint import pprint
|
||||
import tempfile
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import llvmlite.binding as llvm
|
||||
import psutil
|
||||
try:
|
||||
from kerncraft import iaca
|
||||
except ImportError:
|
||||
iaca = None
|
||||
|
||||
from . import op
|
||||
|
||||
@@ -28,10 +33,6 @@ def uniquify(l):
|
||||
|
||||
|
||||
class Benchmark:
|
||||
def __init__(self):
|
||||
self._tm = None
|
||||
self._llvm_module = None
|
||||
|
||||
def __repr__(self):
|
||||
return '{}({})'.format(
|
||||
self.__class__.__name__,
|
||||
@@ -47,34 +48,30 @@ class Benchmark:
|
||||
return int(previous_args[0] * time_factor),
|
||||
|
||||
@staticmethod
|
||||
def get_iterations(args):
|
||||
def get_iterations(args) -> int:
|
||||
"""Return number of iterations performed, based on lower level function arguments."""
|
||||
return args[0]
|
||||
|
||||
def build_ir(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_llvm_module(self):
|
||||
def get_llvm_module(self, iaca_marker=False):
|
||||
"""Build and return LLVM module from LLVM IR code."""
|
||||
if self._llvm_module is None:
|
||||
self._llvm_module = llvm.parse_assembly(self.build_ir())
|
||||
self._llvm_module.verify()
|
||||
return self._llvm_module
|
||||
ir = self.build_ir(iaca_marker=iaca_marker)
|
||||
return llvm.parse_assembly(ir)
|
||||
|
||||
def get_target_machine(self):
|
||||
"""Instantiate and return target machine."""
|
||||
if self._tm is None:
|
||||
features = llvm.get_host_cpu_features().flatten()
|
||||
cpu = llvm.get_host_cpu_name()
|
||||
self._tm = llvm.Target.from_default_triple().create_target_machine(
|
||||
cpu=cpu, features=features, opt=3)
|
||||
return self._tm
|
||||
features = llvm.get_host_cpu_features().flatten()
|
||||
cpu = llvm.get_host_cpu_name()
|
||||
return llvm.Target.from_default_triple().create_target_machine(
|
||||
cpu=cpu, features=features, opt=3)
|
||||
|
||||
def get_assembly(self):
|
||||
def get_assembly(self, iaca_marker=False):
|
||||
"""Compile and return assembly from LLVM module."""
|
||||
tm = self.get_target_machine()
|
||||
tm.set_asm_verbosity(0)
|
||||
asm = tm.emit_assembly(self.get_llvm_module())
|
||||
asm = tm.emit_assembly(self.get_llvm_module(iaca_marker=iaca_marker))
|
||||
# Remove double comments
|
||||
asm = re.sub(r'## InlineAsm End\n\s*## InlineAsm Start\n\s*', '', asm)
|
||||
return asm
|
||||
@@ -82,17 +79,15 @@ class Benchmark:
|
||||
def get_function_ctype(self):
|
||||
return ctypes.CFUNCTYPE(ctypes.c_int64, ctypes.c_int64)
|
||||
|
||||
def get_iaca_analysis(self):
|
||||
def get_iaca_analysis(self, arch):
|
||||
"""Compile and return IACA analysis."""
|
||||
if iaca is None:
|
||||
raise ValueError("kerncraft not installed. IACA analysis is not supported.")
|
||||
tm = self.get_target_machine()
|
||||
tmpf = tempfile.NamedTemporaryFile("wb")
|
||||
tmpf.write(tm.emit_object(self.get_llvm_module()))
|
||||
tmpf.write(tm.emit_object(self.get_llvm_module(iaca_marker=True)))
|
||||
tmpf.flush()
|
||||
|
||||
# assuming "iaca.sh" to be available
|
||||
subprocess.check_output(['objdump', tmpf.name])
|
||||
# WORK IN PROGRESS
|
||||
|
||||
return iaca.iaca_analyse_instrumented_binary(tmpf.name, arch)
|
||||
|
||||
def build_and_execute(self, repeat=10, min_elapsed=0.1, max_elapsed=0.3):
|
||||
# Compile the module to machine code using MCJIT
|
||||
@@ -141,10 +136,11 @@ class Benchmark:
|
||||
|
||||
|
||||
class LoopBenchmark(Benchmark):
|
||||
def __init__(self, root_synth, init_values=None):
|
||||
def __init__(self, root_synth, init_values=None, loop_carried_dependencies=True):
|
||||
super().__init__()
|
||||
self.root_synth = root_synth
|
||||
self.init_values = init_values or root_synth.get_default_init_values()
|
||||
self.loop_carried_dependencies = loop_carried_dependencies
|
||||
|
||||
if len(root_synth.get_source_registers()) != len(self.init_values):
|
||||
raise ValueError("Number of init values and source registers do not match.")
|
||||
@@ -156,7 +152,9 @@ class LoopBenchmark(Benchmark):
|
||||
return ['%out.{}'.format(i) for i in
|
||||
range(len(self.root_synth.get_destination_registers()))]
|
||||
|
||||
def get_phi_code(self, latency=True):
|
||||
def get_phi_code(self):
|
||||
if not self.loop_carried_dependencies:
|
||||
return ''
|
||||
# Compile loop carried dependencies
|
||||
lcd = []
|
||||
# Change in naming (src <-> dst) is on purpose!
|
||||
@@ -210,8 +208,19 @@ class LoopBenchmark(Benchmark):
|
||||
|
||||
|
||||
class IntegerLoopBenchmark(LoopBenchmark):
|
||||
def build_ir(self):
|
||||
return textwrap.dedent('''\
|
||||
def build_ir(self, iaca_marker=False):
|
||||
if iaca_marker:
|
||||
iaca_start_marker = textwrap.dedent('''\
|
||||
call void asm "movl $$111,%ebx", ""()
|
||||
call void asm ".byte 100,103,144", ""()''')
|
||||
iaca_stop_marker = textwrap.dedent('''\
|
||||
call void asm "movl $$222,%ebx", ""()
|
||||
call void asm ".byte 100,103,144", ""()''')
|
||||
else:
|
||||
iaca_start_marker = ''
|
||||
iaca_stop_marker = ''
|
||||
|
||||
ir = textwrap.dedent('''\
|
||||
define i64 @"test"(i64 %"N")
|
||||
{{
|
||||
entry:
|
||||
@@ -221,24 +230,30 @@ class IntegerLoopBenchmark(LoopBenchmark):
|
||||
loop:
|
||||
%"loop_counter" = phi i64 [0, %"entry"], [%"loop_counter.1", %"loop"]
|
||||
{phi}
|
||||
{iaca_start_marker}
|
||||
{loop_body}
|
||||
%"loop_counter.1" = add i64 %"loop_counter", 1
|
||||
%"loop_cond.1" = icmp slt i64 %"loop_counter.1", %"N"
|
||||
br i1 %"loop_cond.1", label %"loop", label %"end"
|
||||
|
||||
|
||||
end:
|
||||
%"ret" = phi i64 [0, %"entry"], [%"loop_counter", %"loop"]
|
||||
{iaca_stop_marker}
|
||||
ret i64 %"ret"
|
||||
}}
|
||||
''').format(
|
||||
loop_body=textwrap.indent(
|
||||
self.root_synth.build_ir(self.get_destination_names(),
|
||||
self.get_source_names()), ' '),
|
||||
phi=textwrap.indent(self.get_phi_code(), ' '))
|
||||
phi=textwrap.indent(self.get_phi_code(), ' '),
|
||||
iaca_start_marker=iaca_start_marker,
|
||||
iaca_stop_marker=iaca_stop_marker)
|
||||
|
||||
return ir
|
||||
|
||||
|
||||
def bench_instructions(instructions, serial_factor=8, parallel_factor=4, throughput_serial_factor=8,
|
||||
serialize=False, verbosity=0):
|
||||
serialize=False, verbosity=0, iaca_comparison=None):
|
||||
not_serializable = False
|
||||
try:
|
||||
# Latency Benchmark
|
||||
@@ -261,6 +276,7 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
|
||||
result = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
|
||||
lat = min(*[(t / serial_factor) * result['frequency'] / result['iterations']
|
||||
for t in result['runtimes']])
|
||||
result['latency'] = lat
|
||||
if verbosity > 0:
|
||||
print('### Detailed Results')
|
||||
pprint(result)
|
||||
@@ -294,10 +310,20 @@ def bench_instructions(instructions, serial_factor=8, parallel_factor=4, through
|
||||
tp = min(
|
||||
[(t / throughput_serial_factor / parallel_factor) * result['frequency'] / result['iterations']
|
||||
for t in result['runtimes']])
|
||||
result['throughput'] = tp
|
||||
if iaca_comparison is not None:
|
||||
iaca_analysis = b.get_iaca_analysis(iaca_comparison)
|
||||
result['iaca throughput'] = iaca_analysis['throughput']/(
|
||||
parallel_factor * throughput_serial_factor)
|
||||
if verbosity > 0:
|
||||
print('### Detailed Results')
|
||||
pprint(result)
|
||||
print()
|
||||
if verbosity > 1 and iaca_comparison is not None:
|
||||
print('### IACA Results')
|
||||
print(iaca_analysis['output'])
|
||||
print('!!! throughput_serial_factor={} and parallel_factor={}'.format(
|
||||
throughput_serial_factor, parallel_factor))
|
||||
|
||||
# Result compilation
|
||||
return lat, tp
|
||||
|
||||
31
asmjit/op.py
31
asmjit/op.py
@@ -205,9 +205,18 @@ class Instruction(Operation):
|
||||
self.source_operands = source_operands
|
||||
|
||||
def get_source_registers(self):
|
||||
return [sop for sop in self.source_operands if isinstance(sop, Register)] + \
|
||||
[r for mop in self.source_operands if isinstance(mop, MemoryReference)
|
||||
for r in mop.get_registers()]
|
||||
sop_types = set()
|
||||
sr = []
|
||||
for sop in self.source_operands:
|
||||
if isinstance(sop, Register):
|
||||
t = (sop.llvm_type, sop.get_constraint_char())
|
||||
if t not in sop_types:
|
||||
sop_types.add(t)
|
||||
sr.append(sop)
|
||||
elif isinstance(sop, MemoryReference):
|
||||
sr += list(sop.get_registers())
|
||||
|
||||
return sr
|
||||
|
||||
def get_destination_registers(self):
|
||||
if isinstance(self.destination_operand, Register):
|
||||
@@ -229,6 +238,7 @@ class Instruction(Operation):
|
||||
|
||||
# Build argument string from operands and register names
|
||||
operands = []
|
||||
sop_types = {}
|
||||
i = 0
|
||||
for sop in self.source_operands:
|
||||
if isinstance(sop, Immediate):
|
||||
@@ -236,10 +246,17 @@ class Instruction(Operation):
|
||||
type=sop.llvm_type,
|
||||
repr=sop.value))
|
||||
elif isinstance(sop, Register):
|
||||
operands.append('{type} {repr}'.format(
|
||||
type=sop.llvm_type,
|
||||
repr=src_reg_names[i]))
|
||||
i += 1
|
||||
sop_t = (sop.llvm_type, sop.get_constraint_char())
|
||||
if sop_t in sop_types:
|
||||
operands.append('{type} {repr}'.format(
|
||||
type=sop.llvm_type,
|
||||
repr=src_reg_names[sop_types[sop_t]]))
|
||||
else:
|
||||
sop_types[sop_t] = i
|
||||
operands.append('{type} {repr}'.format(
|
||||
type=sop.llvm_type,
|
||||
repr=src_reg_names[i]))
|
||||
i += 1
|
||||
elif isinstance(sop, MemoryReference):
|
||||
operands.append('{type} {repr}'.format(
|
||||
type=sop.llvm_type,
|
||||
|
||||
107
tablegen.py
107
tablegen.py
@@ -1,6 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
import collections
|
||||
import re
|
||||
import itertools
|
||||
@@ -466,44 +468,89 @@ def main():
|
||||
instructions_ret_type = collections.defaultdict(collections.OrderedDict)
|
||||
if args.verbosity > 0:
|
||||
for ret_type in rel_instruction_names:
|
||||
print(ret_type, 'has', len(instrs), 'instructions')
|
||||
print(ret_type, 'has', len(instructions_ret_type[ret_type]), 'instructions')
|
||||
|
||||
# Benchmark random instruction sequences
|
||||
for instr_name, instr_op in instructions.items():
|
||||
instructions_ret_type[instr_op.get_destination_registers()[0].llvm_type][
|
||||
instr_name] = (instr_name, instr_op)
|
||||
# Constructing random benchmarks, one for each return type
|
||||
random.seed(42)
|
||||
parallel_factor = 8
|
||||
for t in sorted(instructions_ret_type):
|
||||
valid = False
|
||||
while not valid:
|
||||
selected_names, selected_instrs = zip(
|
||||
*[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
|
||||
#random.seed(42)
|
||||
#parallel_factor = 8
|
||||
#for t in sorted(instructions_ret_type):
|
||||
# valid = False
|
||||
# while not valid:
|
||||
# selected_names, selected_instrs = zip(
|
||||
# *[random.choice(list(instructions_ret_type[t].values())) for i in range(10)])
|
||||
#
|
||||
# if not all([can_serialize(i) for i in selected_instrs]):
|
||||
# continue
|
||||
# else:
|
||||
# valid = True
|
||||
#
|
||||
# serial = op.Serialized(selected_instrs)
|
||||
# p = op.Parallelized([serial] * parallel_factor)
|
||||
#
|
||||
# init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
|
||||
# p.get_source_registers()]
|
||||
# b = bench.IntegerLoopBenchmark(p, init_values)
|
||||
# print('## Selected Instructions')
|
||||
# print(', '.join(selected_names))
|
||||
# print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
|
||||
# print(b.get_assembly())
|
||||
# #pprint(selected_instrs)
|
||||
# r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
|
||||
# r['parallel_factor'] = parallel_factor
|
||||
# print('## Detailed Results')
|
||||
# pprint(r)
|
||||
# print("minimal throughput: {:.2f} cy".format(
|
||||
# min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
|
||||
|
||||
if not all([can_serialize(i) for i in selected_instrs]):
|
||||
continue
|
||||
else:
|
||||
valid = True
|
||||
# Reduce to 100 instructions:
|
||||
#instructions = dict(list(instructions.items())[:100])
|
||||
|
||||
serial = op.Serialized(selected_instrs)
|
||||
p = op.Parallelized([serial] * parallel_factor)
|
||||
|
||||
init_values = [op.init_value_by_llvm_type[reg.llvm_type] for reg in
|
||||
p.get_source_registers()]
|
||||
b = bench.IntegerLoopBenchmark(p, init_values)
|
||||
print('## Selected Instructions')
|
||||
print(', '.join(selected_names))
|
||||
print('## Generated Assembly ({}x parallel)'.format(parallel_factor))
|
||||
print(b.get_assembly())
|
||||
#pprint(selected_instrs)
|
||||
r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
|
||||
r['parallel_factor'] = parallel_factor
|
||||
print('## Detailed Results')
|
||||
pprint(r)
|
||||
print("minimal throughput: {:.2f} cy".format(
|
||||
min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
|
||||
# Reduce to set of instructions used in Stream Triad:
|
||||
instructions = {k: v for k,v in instructions.items() if k in ['ADD32ri', 'ADD64ri32', 'INC64r', 'SUB32ri', 'VADDPDYrr', 'VADDSDrr', 'VADDSSrr', 'VCVTSI642SSrr', 'VFMADD213PDYr', 'VFMADD213PDr', 'VFMADD213PSYr', 'VFMADD213PSr', 'VFMADD213SDr', 'VFMADD213SSr', 'VINSERTF128rr', 'VMULPDYrr', 'VMULSDrr_Int', 'VMULSSrr_Int', 'VSUBSDrr_Int', 'VSUBSSrr_Int']}
|
||||
|
||||
random.seed(23)
|
||||
instructions_per_run = 3
|
||||
parallel_factor = 4
|
||||
print(textwrap.dedent("""
|
||||
# This file contains example data measured on an Intel I7-6700HQ with 2.6GHz with Turbo mode
|
||||
# disabled.
|
||||
|
||||
# Comments are possible everywhere after hash symbols.
|
||||
|
||||
# This part contains necessary configuration information.
|
||||
configuration:
|
||||
model: three-level # we assume that instructions are decomposed into uops
|
||||
num_ports: 7 # our hardware has 4 execution ports
|
||||
num_uops_per_insn: 4 # the maximal number of uops into which an instruction can be decomposed
|
||||
slack_limit: 0.0 # relative margin of error for cycle measurements
|
||||
|
||||
|
||||
# Here follows a list of experiments.
|
||||
"""))
|
||||
for i in range(100):
|
||||
selected_names, selected_instrs = zip(*[random.choice(list(instructions.items()))
|
||||
for i in range(instructions_per_run)])
|
||||
print("experiment:")
|
||||
p = op.Parallelized(selected_instrs*parallel_factor)
|
||||
b = bench.IntegerLoopBenchmark(p)
|
||||
print(' instructions:')
|
||||
print(' '+('\n '.join(selected_names)))
|
||||
if args.verbosity > 0:
|
||||
print(' ir:')
|
||||
print(textwrap.indent(b.build_ir(), ' '*8))
|
||||
print(' asm:')
|
||||
print(textwrap.indent(b.get_assembly(), ' '*8))
|
||||
r = b.build_and_execute(repeat=4, min_elapsed=0.1, max_elapsed=0.2)
|
||||
r['parallel_factor'] = parallel_factor
|
||||
if args.verbosity > 0:
|
||||
print(' detailed_result:')
|
||||
pprint(r, indent=8)
|
||||
print(" cycles: {:.2f}".format(
|
||||
min(r['runtimes'])/r['iterations']*r['frequency']/parallel_factor))
|
||||
|
||||
def can_serialize(instr):
|
||||
if not any([so.llvm_type == instr.destination_operand.llvm_type and
|
||||
@@ -517,7 +564,7 @@ def can_serialize(instr):
|
||||
def combined_instructions(instructions, length):
|
||||
for instr_names in itertools.combinations(instructions, length):
|
||||
instrs = [instructions[n] for n in instr_names]
|
||||
dst_types = list([i.get_destination_registers()[0].llvm_type ])
|
||||
dst_types = list([i.get_destination_registers()[0].llvm_type for i in instrs])
|
||||
if not all([can_serialize(i) for i in instrs]) and dst_types[1:] == dst_types[:-1]:
|
||||
continue
|
||||
yield instrs
|
||||
|
||||
Reference in New Issue
Block a user