Files
OSACA/osaca/osaca.py
2019-01-18 14:19:39 +01:00

829 lines
30 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import collections
import sys
import os
import io
import re
import subprocess
from datetime import datetime
from pprint import pprint
import pandas as pd
import numpy as np
from osaca.param import Register, MemAddr, Parameter
from osaca.eu_sched import Scheduler
from osaca.testcase import Testcase
# Per-user directory; the architecture CSV files are looked up in its 'data/' subdirectory
DATA_DIR = os.path.expanduser('~') + '/.osaca/'
# Matches every variation of the IACA start marker:
# a mov of 111 (0x6f) into %ebx followed by the bytes 100, 103, 144 (on one or several
# .byte lines) or by a (fs addr32) nop
IACA_START_MARKER = re.compile(r'\s*movl?[ \t]+\$(?:111|0x6f)[ \t]*,[ \t]*%ebx.*\n\s*'
                               r'(?:\.byte[ \t]+100.*((,[ \t]*103.*((,[ \t]*144)|'
                               r'(\n\s*\.byte[ \t]+144)))|'
                               r'(\n\s*\.byte[ \t]+103.*((,[ \t]*144)|'
                               r'(\n\s*\.byte[ \t]+144))))|(?:fs addr32 )?nop)')
# Matches every variation of the IACA end marker:
# same layout as the start marker, but with 222 as immediate. The hexadecimal spelling of
# 222 is 0xde (the previously used 0x1f3 equals 499 and could never denote this marker).
IACA_END_MARKER = re.compile(r'\s*movl?[ \t]+\$(?:222|0xde)[ \t]*,[ \t]*%ebx.*\n\s*'
                             r'(?:\.byte[ \t]+100.*((,[ \t]*103.*((,[ \t]*144)|'
                             r'(\n\s*\.byte[ \t]+144)))|'
                             r'(\n\s*\.byte[ \t]+103.*((,[ \t]*144)|'
                             r'(\n\s*\.byte[ \t]+144))))|(?:fs addr32 )?nop)')
def flatten(l):
    """
    Flatten a nested list of strings.

    The traversal is iterative (explicit stack of iterators), so neither long nor deeply
    nested inputs can exhaust the interpreter's recursion limit.

    Parameters
    ----------
    l : [[...[str]]]
        Nested list of strings

    Returns
    -------
    [str]
        List of strings
    """
    if not l:
        # Falsy input (e.g. an empty list) is handed back unchanged
        return l
    flat = []
    # The iterator of the innermost list currently being walked sits on top of the stack
    stack = [iter(l)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend into the sub-list; the outer iterator is resumed afterwards
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # Current list is exhausted, continue with its parent
            stack.pop()
    return flat
def get_assembly_from_binary(bin_path):
    """
    Disassemble binary with objdump and transform into a canonical form.

    The following transformations are applied to the disassembly:
      * jump and call target offsets are replaced with labels,
      * a trailing size suffix (l, q, w, b) is removed from mnemonics not starting with 'j',
      * hexadecimal literals are replaced by their decimal value,
      * a trailing ",1)" in memory operands is shortened to ")".

    :param bin_path: path to binary file to disassemble
    :return assembly string
    """
    asm_lines = subprocess.run(
        ['objdump', '-d', '--no-show-raw-insn', bin_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE).stdout.decode('utf-8').split('\n')
    # A line is either a bare label ("name:") or an instruction ("  offset:  instr")
    line_pattern = re.compile(r'^(?:(?P<label>[0-9a-zA-Z_\.]+):|'
                              r'\s*(?P<offset>[0-9a-fA-F]+):(?:\s*(?P<instr>.*)))$')
    asm = []
    # Separate label, offsets and instructions
    # Store offset with each label (thus iterate in reverse)
    label_offsets = {}
    last_offset = None
    for l in reversed(asm_lines):
        m = line_pattern.match(l)
        if not m:
            continue
        d = m.groupdict()
        if d['offset'] is not None:
            d['offset'] = int(d['offset'], base=16)
            last_offset = d['offset']
        elif last_offset is not None:
            # A label without any instruction behind it has no offset and is not recorded
            label_offsets[d['label']] = last_offset
        asm.append(d)
    # Lines were collected back to front, restore the original order
    asm.reverse()
    # Find all jump locations and replace with labels
    target_pattern = re.compile(r'[\-]?[0-9a-fA-F]+ <(?P<label>[0-9a-zA-Z_\.]+)'
                                r'(?:\+(?P<offset>0x[0-9a-fA-F]+))?>')
    new_labels = {}
    for a in asm:
        if a['instr'] is None:
            continue
        m = target_pattern.search(a['instr'])
        if m and m.group('label') in label_offsets:
            target = label_offsets[m.group('label')]
            label_name = m.group('label')
            if m.group('offset') is not None:
                # Need to create new label at target + offset
                target += int(m.group('offset'), base=16)
                label_name += '_' + str(m.group('offset'))
                new_labels[label_name] = target
            # replace reference with new name
            a['instr'] = (a['instr'][:m.start()] +
                          '{}'.format(label_name) +
                          a['instr'][m.end():])
    # Find instruction at target and insert label before
    # NOTE: if no instruction sits exactly at the target offset, the loop runs to its end
    # and the label is placed in front of the last entry.
    for label, target in new_labels.items():
        for i, a in enumerate(asm):
            if target == a['offset']:
                break
        asm.insert(i, {'label': label, 'offset': None, 'instr': None})
    # Remove trailing suffixes (lqwb) from instructions
    # FIXME this falsely removed b from jb and potentially others as well
    suffix_pattern = re.compile(r'^(?P<instr>[^j][a-z0-9]+)[lqwb](?P<tail>\s+.+|$)')
    for a in asm:
        if a['instr'] is not None:
            m = suffix_pattern.match(a['instr'])
            if m:
                a['instr'] = m.group('instr') + m.group('tail')
    # Return instructions and labels in canonical assembly
    parts = []
    for a in asm:
        if a['label'] is not None:
            parts.append(a['label'] + ':\n')
        elif a['instr'] is not None:
            parts.append(a['instr'] + '\n')
    assembly = ''.join(parts)
    # Replace all hexadecimals with decimals (single pass over the text)
    assembly = re.sub(r'0x[0-9a-fA-F]+', lambda hexa: str(int(hexa.group(0), base=16)),
                      assembly)
    # Remove trailing ",1)" from offsets; str.replace returns a new string, so the
    # result has to be assigned
    assembly = assembly.replace(',1)', ')')
    return assembly
def create_sequences(end=101):
    """
    Build the integers 1 .. end-1 together with their reciprocals.

    Parameters
    ----------
    end : int
        Exclusive upper bound of the integer sequence (default 101)

    Returns
    -------
    [int]
        cyc_list, the integers from 1 up to (but not including) end
    [float]
        reci_list, the reciprocal of every entry of cyc_list, in the same order
    """
    integers = list(range(1, end))
    reciprocals = [1 / value for value in integers]
    return integers, reciprocals
def validate_val(clk_cyc, instr, is_tp, cyc_list, reci_list):
    """
    Validate given clock cycle clk_cyc and return rounded value in case of
    success.

    A succeeded validation means the clock cycle clk_cyc is less than 5% higher or
    lower than an integer value from cyc_list or - if clk_cyc is a throughput
    value - less than 5% higher or lower than a reciprocal from the reci_list.

    Parameters
    ----------
    clk_cyc : float
        Clock cycle to validate
    instr : str
        Instruction, only used for the error message
    is_tp : bool
        True if a throughput value is to check, False for a latency value
    cyc_list : [int]
        Cycle list for validating
    reci_list : [float]
        Reciprocal cycle list for validating (only read if is_tp is True)

    Returns
    -------
    int or float
        The matching entry of cyc_list or reci_list

    Raises
    ------
    ValueError
        If clk_cyc is not close to any candidate value, i.e. the measurement is
        assumed to be incorrect
    """
    measured = float(clk_cyc)
    for i, cycles in enumerate(cyc_list):
        if cycles * 1.05 > measured > cycles * 0.95:
            # Value is probably correct, so round it to the estimated value
            return cycles
        # Check reciprocal only if it is a throughput value
        if is_tp and reci_list[i] * 1.05 > measured > reci_list[i] * 0.95:
            # Value is probably correct, so round it to the estimated value
            return reci_list[i]
    # No value close to an integer or its reciprocal found, we assume the
    # measurement is incorrect
    column = 'TP' if is_tp else 'LT'
    raise ValueError('Your measurement for {} ({}) is probably wrong. '
                     'Please inspect your benchmark!'.format(instr, column))
def include_ibench(arch, ibench_output):
    """
    Read ibench output and include it in the architecture specific csv file.

    Every non-empty line that does not contain 'Using frequency' is treated as one
    measurement: the first whitespace separated field is the instruction name followed by
    one trailing character (presumably ':' - verify against ibench), the second field is
    the measured value. Lines containing 'TP' are throughput values whose name carries a
    three character suffix ('-TP'); all other lines are latency values.

    :param arch: architecture abbreviation selecting the csv file
    :param ibench_output: path to the file containing the ibench output
    :return: number of values added to the data file
    :raises ValueError: if a measurement is implausible (see validate_val) or differs by
        more than 5% from an already stored value; the csv file is not written then
    """
    df = read_csv(arch)
    # Create sequence of numbers and their reciprocals for validate the measurements
    cyc_list, reci_list = create_sequences()
    new_data = []
    added_vals = 0
    with open(ibench_output) as f:
        # All lines are needed (readline() would only deliver the first one)
        source = f.readlines()
    for line in source:
        # Lines still carry their newline, so test for blank lines after stripping
        if 'Using frequency' in line or not line.strip():
            continue
        fields = line.split()
        column = 'LT'
        instr = fields[0][:-1]
        if 'TP' in line:
            # We found a command with a throughput value. Get instruction and the number of
            # clock cycles and remove the '-TP' suffix.
            column = 'TP'
            instr = instr[:-3]
        # Otherwise it is a latency value. Nothing to do.
        clk_cyc = validate_val(float(fields[1]), instr, column == 'TP', cyc_list, reci_list)
        val = -2
        new = False
        try:
            entry = df.loc[df.instr == instr, column]
            val = entry.values[0]
            # If val is -1 (= not filled with a valid value) add it immediately
            if val == -1:
                df.at[entry.index[0], column] = clk_cyc
                added_vals += 1
                continue
        except IndexError:
            # Instruction not in database yet --> add it
            new = True
            # First check if LT or TP value has already been added before
            for i, item in enumerate(new_data):
                if instr in item:
                    if column == 'TP':
                        new_data[i][1] = clk_cyc
                    elif column == 'LT':
                        new_data[i][2] = clk_cyc
                    new = False
                    break
            if new and column == 'TP':
                new_data.append([instr, clk_cyc, '-1', (-1,)])
            elif new and column == 'LT':
                new_data.append([instr, '-1', clk_cyc, (-1,)])
            # Mark as handled so that the consistency check below is skipped
            new = True
            added_vals += 1
        if not new and abs((val / np.float64(clk_cyc)) - 1) > 0.05:
            raise ValueError(
                "Different measurement for {} ({}): {}(old) vs. {}(new)\n"
                "Please check for correctness "
                "(no changes were made).".format(instr, column, val, clk_cyc))
    # Now merge the DataFrames and write new csv file
    df = pd.concat([df, pd.DataFrame(new_data, columns=['instr', 'TP', 'LT', 'ports'])],
                   ignore_index=True)
    write_csv(arch, df)
    return added_vals
def extract_marked_section(assembly):
    """
    Cut out the part of the assembly enclosed by the IACA markers.

    The returned text starts right behind the first match of the start marker and ends
    right before the first match of the end marker.

    :param assembly: assembly string
    :return: assembly string between the markers
    :raises ValueError: if the start marker or the end marker is missing
    """
    start_match = IACA_START_MARKER.search(assembly)
    end_match = IACA_END_MARKER.search(assembly)
    if start_match is None or end_match is None:
        raise ValueError("Could not find start and end markers.")
    begin = start_match.end()
    stop = end_match.start()
    return assembly[begin:stop]
def strip_assembly(assembly):
    """
    Drop comments, surrounding whitespace and blank lines from assembly.

    Everything from the first '#' of a line up to its end counts as comment.

    :param assembly: assembly string
    :return: assembly string without comments nor any unnecessary whitespaces
    """
    cleaned = []
    for raw_line in assembly.split('\n'):
        # Keep only the part in front of a comment, without outer whitespace
        code = raw_line.split('#', 1)[0].strip()
        if code:
            cleaned.append(code)
    return '\n'.join(cleaned)
# TODO replacement for instr_forms entries in OSACA
# class InstructionForm:
# def __init__(self, mnemonic, parameters, line=None):
# self.mnemonic = mnemonic
# self.parameters = parameters
# self.line = line
#
# @classmethod
# def from_assembly(cls, line):
# # Skip clang padding bytes
# while line.startswith('data32 '):
# line = line[7:]
#
# line_split = line.split()
# mnemonic = line_split[0]
# if len(line_split) > 1:
# parameters = line_split[1:]
# else:
# parameters = None
#
# return cls(mnemonic, parameters, line)
#
# def __str__(self):
# return line
#
# def __repr__(self):
# return '{}({!r}, {!r}, {!r})'.format(
# self.__class__.__name__, self.mnemonic, self.parameters, self.line)
class OSACA(object):
    """
    A single OSACA analysis.

    On construction the assembly is reduced to the relevant instruction lines, every
    instruction is classified into an instruction form (mnemonic plus operand types) and
    a Scheduler is created for the collected instruction forms.
    """
    srcCode = None
    tp_list = False
    # Variables for checking lines
    numSeps = 0
    indentChar = ''
    sem = 0
    # Variables for creating output
    longestInstr = 30
    machine_readable = False
    VALID_ARCHS = Scheduler.arch_dict

    def __init__(self, arch, assembly, extract_with_markers=True):
        """
        Create and run analysis on assembly for architecture.

        :param arch: architecture abbreviation, must be contained in VALID_ARCHS
        :param assembly: assembly code as string
        :param extract_with_markers: if True, use markers to isolate relevant section
        :raises ValueError: if arch is not a valid architecture
        """
        # Check architecture
        if arch not in self.VALID_ARCHS:
            raise ValueError("Invalid architecture ({!r}), must be one of {}.".format(
                arch, self.VALID_ARCHS))
        self.arch = arch
        if extract_with_markers:
            assembly = extract_marked_section(assembly)
        self.assembly = strip_assembly(assembly).split('\n')
        self.instr_forms = []
        # Check if data files are already in usr dir, otherwise create them
        if not os.path.isdir(os.path.join(DATA_DIR, 'data')):
            os.makedirs(os.path.join(DATA_DIR, 'data'))
            # Copy the data directory located next to this file into DATA_DIR
            subprocess.call(['cp', '-r',
                             '/'.join(os.path.realpath(__file__).split('/')[:-1]) + '/data',
                             DATA_DIR])
        # Check for database for the chosen architecture
        self.df = read_csv(arch)
        # Run analysis and populate instr_forms
        self.inspect()
        # Create schedule
        self.schedule = Scheduler(self.arch, self.instr_forms)

    def inspect(self):
        """
        Run analysis.

        Every line of the stripped assembly that is not a label is handed to check_instr.
        """
        for line in self.assembly:
            # TODO potential replacement for instr_forms entries in OSACA
            # InstructionForm.from_assembly(line)
            # Ignore labels
            if re.match(r'^[a-zA-Z0-9\_\.]+:$', line):
                continue
            self.check_instr(line)

    def check_instr(self, instr):
        """
        Inspect instruction for its parameters and add it to the instruction forms
        pool instr_form.

        Lines without any token (e.g. an empty line) and lines consisting of a single
        two-digit hex byte are skipped.

        Parameters
        ----------
        instr : str
            Instruction as string
        """
        # Check for strange clang padding bytes
        while instr.startswith('data32'):
            instr = instr[7:]
        # Separate mnemonic and operands
        tokens = instr.split()
        if not tokens:
            # Nothing left to analyse (an empty marked section yields one empty line)
            return
        mnemonic = tokens[0]
        params = tokens[1:]
        # Check if line is not only a byte
        empty_byte = re.compile(r'[0-9a-f]{2}')
        if re.match(empty_byte, mnemonic) and len(mnemonic) == 2:
            return
        # Check if there's one or more operands and store all in a list
        param_list = flatten(self._separate_params(params))
        param_list_types = list(param_list)
        # Check operands and separate them by IMMEDIATE (IMD), REGISTER (REG),
        # MEMORY (MEM) or LABEL(LBL)
        for i, op in enumerate(param_list):
            if len(op) <= 0:
                op = Parameter('NONE')
            elif op[0] == '$':
                op = Parameter('IMD')
            elif op[0] == '%' and '(' not in op:
                j = len(op)
                opmask = False
                if '{' in op:
                    # Text from '{' on is cut off and the register is flagged via opmask
                    j = op.index('{')
                    opmask = True
                op = Register(op[1:j].strip(" ,"), opmask)
            elif '<' in op or re.match(r'^([a-zA-Z\._]+[a-zA-Z0-9_\.]*)+$', op):
                op = Parameter('LBL')
            else:
                op = MemAddr(op)
            param_list[i] = str(op)
            param_list_types[i] = op
        # Add to list
        instr = instr.rstrip()
        if len(instr) > self.longestInstr:
            self.longestInstr = len(instr)
        instr_form = [mnemonic] + list(reversed(param_list_types)) + [instr]
        self.instr_forms.append(instr_form)
        # If flag is set, create testcase for instruction form
        # Do this in reversed param list order, due to the fact it's intel syntax
        # Only create benchmark if no label (LBL) is part of the operands
        if 'LBL' in param_list or '' in param_list:
            return
        tc = Testcase(mnemonic, list(reversed(param_list_types)), '32')
        # Only write a testcase if it not already exists or already in data file
        writeTP, writeLT = tc.is_in_dir()
        inDB = len(self.df.loc[lambda df: df.instr == tc.get_entryname()])
        if inDB == 0:
            tc.write_testcase(not writeTP, not writeLT)

    def _separate_params(self, params):
        """
        Delete comments, separates parameters and return them as a list.

        Parameters
        ----------
        params : str or [str]
            Line after the mnemonic. check_instr passes the whitespace separated tokens
            as a list; the membership and index operations used here work on both a
            string and a list.

        Returns
        -------
        [[...[str]]]
            Nested list of strings. The number of nest levels depend on the
            number of parameters given.
        """
        param_list = [params]
        if ',' in params:
            if ')' in params:
                if params.index(')') < len(params) - 1 and params[params.index(')') + 1] == ',':
                    # Split at the comma directly behind the closing parenthesis
                    i = params.index(')') + 1
                elif params.index('(') < params.index(','):
                    # The first comma lies behind the opening parenthesis: do not split
                    return param_list
                else:
                    i = params.index(',')
            else:
                i = params.index(',')
            param_list = [params[:i], self._separate_params(params[i + 1:])]
        elif '#' in params:
            # Cut off a trailing comment
            i = params.index('#')
            param_list = [params[:i]]
        return param_list

    def create_output(self, tp_list=False, pr_sched=True, machine_readable=False):
        """
        Creates output of analysed file.
        Used to interface with Kerncraft.

        Parameters
        ----------
        tp_list : bool
            Boolean for indicating the need for the throughput list as output
            (default False)
        pr_sched : bool
            Boolean for indicating the need for predicting a scheduling
            (default True)
        machine_readable : bool
            If True (and pr_sched is True), only the schedule output produced by the
            scheduler for machine_readable mode is returned (default False)

        Returns
        -------
        str
            OSACA output
        """
        # Check the output alignment depending on the longest instruction
        if self.longestInstr > 70:
            self.longestInstr = 70
        horiz_line = self.create_horiz_sep()
        # Write general information about the benchmark
        output = '--{}\n| Architecture:\t\t{}\n|\n'.format(
            horiz_line, self.arch)
        if tp_list:
            output += self.create_tp_list(horiz_line)
        if pr_sched:
            output += '\n\n'
            sched_output, port_binding = self.schedule.new_schedule(machine_readable)
            # if machine_readable, we're already done here
            if machine_readable:
                return sched_output
            binding = self.schedule.get_port_binding(port_binding)
            output += self.schedule.get_report_info() + '\n' + binding + '\n\n' + sched_output
            block_tp = round(max(port_binding), 2)
            output += 'Total number of estimated throughput: {}\n'.format(block_tp)
        return output

    def get_port_occupation_cycles(self):
        """
        Build dict with port names and cycles they are occupied during one block execution
        Used to interface with Kerncraft.

        :return: dictionary of ports and cycles
        """
        _, port_binding = self.schedule.new_schedule()
        return collections.OrderedDict([
            (port_name, port_binding[i])
            for i, port_name in enumerate(self.schedule.get_port_naming())])

    def get_unmatched_instruction_ratio(self):
        """
        Calculate ratio of unmatched vs total instructions

        Unmatched instructions are counted via the '| X ' marker in the schedule output.

        :return: float, 0.0 if there are no instruction forms at all
        """
        if not self.instr_forms:
            # Avoid a division by zero for an empty analysis
            return 0.0
        sched_output, _ = self.schedule.new_schedule()
        return sched_output.count('| X ') / len(self.instr_forms)

    def get_total_throughput(self):
        """
        Return total cycles estimated per block execution. Including (potential) penalties.
        Used to interface with Kerncraft.

        :return: float of cycles
        """
        return max(self.get_port_occupation_cycles().values())

    def create_horiz_sep(self):
        """
        Calculate and return horizontal separator line.

        Returns
        -------
        str
            Horizontal separator line
        """
        return '-' * (self.longestInstr + 8)

    def create_tp_list(self, horiz_line):
        """
        Create list of instruction forms with the proper throughput value.

        For every instruction form the throughput is looked up in the data file. If the
        exact form is not stored, the lookup is repeated with memory operands replaced by
        register operands. Forms without any value are listed with throughput 0 and
        marked with '*'.

        Parameter
        ---------
        horiz_line : str
            Calculated horizontal line for nice alignment

        Returns
        -------
        str
            Throughput list output for printing
        """
        # Suppress the pandas warning about match groups in str.contains patterns;
        # setting the filter once is sufficient for all lookups below
        import warnings
        warnings.filterwarnings("ignore", 'This pattern has match groups')
        warning = False
        ws = ' ' * (len(horiz_line) - 23)
        output = '\n| INSTRUCTION{}CLOCK CYCLES\n| {}\n|\n'.format(ws, horiz_line)
        # Check for the throughput data in CSV
        for elem in self.instr_forms:
            # elem is [mnemonic, operand, ..., instruction line]; build operand suffix
            op_ext = []
            for i in range(1, len(elem) - 1):
                if isinstance(elem[i], Register) and elem[i].reg_type == 'GPR':
                    optmp = 'r' + str(elem[i].size)
                elif isinstance(elem[i], MemAddr):
                    optmp = 'mem'
                else:
                    optmp = str(elem[i]).lower()
                op_ext.append(optmp)
            operands = '_'.join(op_ext)
            # Now look up the value in the dataframe
            # Check if there is a stored throughput value in database
            series = self.df['instr'].str.contains(elem[0] + '-' + operands)
            if True in series.values:
                # It's a match!
                not_found = False
                try:
                    tp = self.df[self.df.instr == elem[0] + '-' + operands].TP.values[0]
                except IndexError:
                    # Only a partial match exists; the instruction is left out of the list
                    continue
            # Did not found the exact instruction form.
            # Try to find the instruction form for register operands only
            else:
                op_ext_regs = []
                for operand in op_ext:
                    try:
                        # Create Register only to see if it is one
                        Register(operand)
                        op_ext_regs.append(True)
                    except KeyError:
                        op_ext_regs.append(False)
                if True not in op_ext_regs:
                    # No register in whole instr form. How can I find out what regsize we need?
                    tp = 0
                    warning = True
                    num_whitespaces = self.longestInstr - len(elem[-1])
                    ws = ' ' * num_whitespaces + '| '
                    n_f = ' ' * (5 - len(str(tp))) + '*'
                    data = '| ' + elem[-1] + ws + str(tp) + n_f + '\n'
                    output += data
                    continue
                if op_ext_regs[0] is False:
                    # Instruction stores result in memory. Check for storing in register instead.
                    if len(op_ext) > 1:
                        if op_ext_regs[1] is True:
                            op_ext[0] = op_ext[1]
                        elif len(op_ext) > 2:
                            if op_ext_regs[2] is True:
                                op_ext[0] = op_ext[2]
                if len(op_ext_regs) == 2 and op_ext_regs[1] is False:
                    # Instruction loads value from memory and has only two operands. Check for
                    # loading from register instead
                    if op_ext_regs[0] is True:
                        op_ext[1] = op_ext[0]
                if len(op_ext_regs) == 3 and op_ext_regs[2] is False:
                    # Instruction loads value from memory and has three operands. Check for loading
                    # from register instead
                    op_ext[2] = op_ext[0]
                operands = '_'.join(op_ext)
                # Check for register equivalent instruction
                series = self.df['instr'].str.contains(elem[0] + '-' + operands)
                if True in series.values:
                    # It's a match!
                    not_found = False
                    try:
                        tp = self.df[self.df.instr == elem[0] + '-' + operands].TP.values[0]
                    except IndexError:
                        # Only a partial match exists; the instruction is left out of the list
                        continue
                # Did not found the register instruction form. Set warning and go on with
                # throughput 0
                else:
                    tp = 0
                    not_found = True
                    warning = True
            # Check the alignment again
            num_whitespaces = self.longestInstr - len(elem[-1])
            ws = ' ' * num_whitespaces + '| '
            n_f = ''
            if not_found:
                n_f = ' ' * (5 - len(str(tp))) + '*'
            data = '| ' + elem[-1] + ws + '{:3.2f}'.format(tp) + n_f + '\n'
            output += data
        # Finally end the list of throughput values
        output += '| ' + horiz_line + '\n'
        if warning:
            output += ('\n\n* There was no throughput value found for the specific instruction '
                       'form.\n  Please create a testcase via the create_testcase-method or add a '
                       'value manually.')
        return output

    def generate_text_output(self):
        """Generate and return an output string showing the analysis results."""
        output = self.create_output(self.tp_list, True, self.machine_readable)
        return output
def read_csv(arch):
    """
    Load the CSV of the given architecture from the data directory.

    The file is expected at DATA_DIR + 'data/<arch in lower case>_data.csv'.

    Returns
    -------
    DataFrame
        CSV as DataFrame object
    """
    csv_path = DATA_DIR + 'data/' + arch.lower() + '_data.csv'
    return pd.read_csv(csv_path)
def write_csv(arch, df):
    """
    Store the architecture DataFrame as CSV (without index column) in the data directory.

    The file DATA_DIR + 'data/<arch in lower case>_data.csv' is overwritten.
    """
    target = DATA_DIR + 'data/' + arch.lower() + '_data.csv'
    # Serialize first so that the file is only opened once the content exists
    serialized = df.to_csv(index=False)
    with open(target, 'w') as csv_file:
        csv_file.write(serialized)
# Stolen from pip
def __read(*names, **kwargs):
    """
    Return the text content of a file given relative to the directory of this module.

    :param names: path components, joined below the directory containing this file
    :param kwargs: optional 'encoding' of the file (default "utf8")
    :return: file content as string
    """
    path = os.path.join(os.path.dirname(__file__), *names)
    encoding = kwargs.get("encoding", "utf8")
    with io.open(path, encoding=encoding) as fp:
        return fp.read()
# Stolen from pip
def __find_version(*file_paths):
    """
    Extract the value assigned to __version__ in the given file.

    :param file_paths: path components of the file, relative to this module's directory
    :return: version string
    :raises RuntimeError: if the file contains no line starting with a __version__
        assignment
    """
    content = __read(*file_paths)
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", content, re.M)
    if found is None:
        raise RuntimeError('Unable to find version string.')
    return found.group(1)
def main():
    """
    Command line entry point.

    Parses the arguments, then either includes ibench measurements in the data file
    (--include-ibench, stand alone) or analyses the given assembly/binary file and prints
    the result.
    """
    # Parse args
    parser = argparse.ArgumentParser(description='Analyzes a marked innermost loop snippet '
                                                 'for a given architecture type and prints out the '
                                                 'estimated average throughput.')
    parser.add_argument('-V', '--version', action='version',
                        version='%(prog)s ' + __find_version('__init__.py'))
    parser.add_argument('--arch', type=str, required=True,
                        help='define architecture (SNB, IVB, HSW, BDW, SKL, ZEN)')
    parser.add_argument('--binary', '-b', action='store_true',
                        help='binary file must be disassembled first')
    parser.add_argument('--tp-list', action='store_true',
                        help='print an additional list of all throughput values for the kernel')
    parser.add_argument('-i', '--include-ibench', action='store_true',
                        help='includes the given values in form of the output of ibench in the '
                             'data file')
    parser.add_argument('--insert-marker', '-m', action='store_true',
                        help='try to find blocks probably corresponding to loops in assembly and '
                             'insert IACA marker')
    parser.add_argument('-l', '--list-output', dest='machine_readable', action='store_true',
                        help='returns output as machine readable list of lists')
    parser.add_argument('filepath', type=str, help='path to object (Binary, ASM, CSV)')
    args = parser.parse_args()
    # --include-ibench acts stand alone, ignoring everything else
    if args.include_ibench:
        # filepath points to the ibench output in this mode
        added_values = include_ibench(args.arch, args.filepath)
        print("Successfully added {} value(s)".format(added_values))
        return
    if args.binary:
        # Read disassembled binary
        assembly = get_assembly_from_binary(args.filepath)
    else:
        # read assembly directly
        with open(args.filepath) as f:
            assembly = f.read()
    if args.insert_marker:
        if args.binary:
            raise NotImplementedError("Marker insertion is unsupported for binary input files.")
        # Insert markers using kerncraft
        try:
            from kerncraft import iaca
        except ImportError:
            print("Module kerncraft not installed. Use 'pip install --user "
                  "kerncraft' for installation.\nFor more information see "
                  "https://github.com/RRZE-HPC/kerncraft", file=sys.stderr)
            sys.exit(1)
        # Change due to newer kerncraft version (hopefully temporary)
        # iaca.iaca_instrumentation(input_file=filepath, output_file=filepath,
        #                           block_selection='manual', pointer_increment=1)
        unmarked_assembly = io.StringIO(assembly)
        marked_assembly = io.StringIO()
        iaca.iaca_instrumentation(input_file=unmarked_assembly, output_file=marked_assembly,
                                  block_selection='manual', pointer_increment=1)
        # Rewind and read the complete instrumented assembly (read(0) would return '')
        marked_assembly.seek(0)
        assembly = marked_assembly.read()
    osaca = OSACA(args.arch, assembly)
    # Hand the output related command line switches to the analysis; they are read by
    # generate_text_output
    osaca.tp_list = args.tp_list
    osaca.machine_readable = args.machine_readable
    print(osaca.generate_text_output())


if __name__ == '__main__':
    main()