applied flake8 and black rules

This commit is contained in:
JanLJL
2021-08-26 16:58:19 +02:00
parent 34523e1b23
commit d418c16f4a
23 changed files with 781 additions and 471 deletions

View File

@@ -7,7 +7,8 @@ import re
def __read(*names, **kwargs): def __read(*names, **kwargs):
"""Reads in file""" """Reads in file"""
with io.open( with io.open(
os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8") os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8"),
) as fp: ) as fp:
return fp.read() return fp.read()

View File

@@ -88,7 +88,7 @@ class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
comment = None comment = None
if load: if load:
if 'ymm' in operand_types: if "ymm" in operand_types:
port2D3D_pressure = 2 port2D3D_pressure = 2
else: else:
port2D3D_pressure = 1 port2D3D_pressure = 1
@@ -96,7 +96,7 @@ class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
latency += 4 latency += 4
comment = "with load" comment = "with load"
if store: if store:
if 'ymm' in operand_types: if "ymm" in operand_types:
port4_pressure = 2 port4_pressure = 2
else: else:
port4_pressure = 1 port4_pressure = 1
@@ -716,14 +716,14 @@ skx_mov_instructions = list(
# ('movapd xmm xmm', ('1*p5', 1)), # ('movapd xmm xmm', ('1*p5', 1)),
# ('vmovapd xmm xmm', ('1*p5', 1)), # ('vmovapd xmm xmm', ('1*p5', 1)),
# ('vmovapd ymm ymm', ('1*p5', 1)), # ('vmovapd ymm ymm', ('1*p5', 1)),
('vmovapd zmm zmm', ('', 0)), ("vmovapd zmm zmm", ("", 0)),
# https://www.felixcloutier.com/x86/movaps # https://www.felixcloutier.com/x86/movaps
# TODO with masking! # TODO with masking!
# TODO the following may eliminate or be bound to 1*p0156: # TODO the following may eliminate or be bound to 1*p0156:
# ('movaps xmm xmm', ('1*p5', 1)), # ('movaps xmm xmm', ('1*p5', 1)),
# ('vmovaps xmm xmm', ('1*p5', 1)), # ('vmovaps xmm xmm', ('1*p5', 1)),
# ('vmovaps ymm ymm', ('1*p5', 1)), # ('vmovaps ymm ymm', ('1*p5', 1)),
('vmovaps zmm zmm', ('', 0)), ("vmovaps zmm zmm", ("", 0)),
# https://www.felixcloutier.com/x86/movbe # https://www.felixcloutier.com/x86/movbe
("movbe gpr mem", ("1*p15", 4)), ("movbe gpr mem", ("1*p15", 4)),
("movbe mem gpr", ("1*p15", 4)), ("movbe mem gpr", ("1*p15", 4)),

View File

@@ -140,9 +140,11 @@ def extract_model(tree, arch, skip_mem=True):
print("Couldn't find port utilization, skip: ", iform, file=sys.stderr) print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
continue continue
# skip if measured TP is smaller than computed # skip if measured TP is smaller than computed
if [float(x.attrib["TP_ports"]) > min(float(x.attrib["TP_loop"]), if [
float(x.attrib["TP_unrolled"])) float(x.attrib["TP_ports"])
for x in arch_tag.findall("measurement")][0]: > min(float(x.attrib["TP_loop"]), float(x.attrib["TP_unrolled"]))
for x in arch_tag.findall("measurement")
][0]:
print( print(
"Calculated TP is greater than measured TP.", "Calculated TP is greater than measured TP.",
iform, iform,
@@ -160,13 +162,15 @@ def extract_model(tree, arch, skip_mem=True):
throughput = float(measurement_tag.attrib["TP_ports"]) throughput = float(measurement_tag.attrib["TP_ports"])
else: else:
throughput = min( throughput = min(
measurement_tag.attrib.get("TP_loop", float('inf')), measurement_tag.attrib.get("TP_loop", float("inf")),
measurement_tag.attrib.get("TP_unroll", float('inf')), measurement_tag.attrib.get("TP_unroll", float("inf")),
measurement_tag.attrib.get("TP", float('inf')), measurement_tag.attrib.get("TP", float("inf")),
) )
if throughput == float('inf'): if throughput == float("inf"):
throughput = None throughput = None
uops = int(measurement_tag.attrib["uops"]) if "uops" in measurement_tag.attrib else None uops = (
int(measurement_tag.attrib["uops"]) if "uops" in measurement_tag.attrib else None
)
if "ports" in measurement_tag.attrib: if "ports" in measurement_tag.attrib:
port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib)) port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
latencies = [ latencies = [
@@ -202,7 +206,11 @@ def extract_model(tree, arch, skip_mem=True):
# Check if all are equal # Check if all are equal
if port_pressure: if port_pressure:
if port_pressure[1:] != port_pressure[:-1]: if port_pressure[1:] != port_pressure[:-1]:
print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr) print(
"Contradicting port occupancies, using latest IACA:",
iform,
file=sys.stderr,
)
port_pressure = port_pressure[-1] port_pressure = port_pressure[-1]
else: else:
# print("No data available for this architecture:", mnemonic, file=sys.stderr) # print("No data available for this architecture:", mnemonic, file=sys.stderr)
@@ -222,9 +230,11 @@ def extract_model(tree, arch, skip_mem=True):
port_4 = True port_4 = True
# Add (x, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4) # Add (x, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
if port_23 and not port_4: if port_23 and not port_4:
if arch.upper() in ["SNB", "IVB"] and any( if (
[p.get('name', '') == 'ymm' for p in parameters]) and \ arch.upper() in ["SNB", "IVB"]
not '128' in mnemonic: and any([p.get("name", "") == "ymm" for p in parameters])
and not ("128" in mnemonic)
):
# x = 2 if SNB or IVB and ymm regiser in any operand and not '128' in # x = 2 if SNB or IVB and ymm regiser in any operand and not '128' in
# instruction name # instruction name
port2D3D_pressure = 2 port2D3D_pressure = 2

View File

@@ -125,7 +125,10 @@ def _get_asmbench_output(input_data, isa):
db_entries = {} db_entries = {}
for i in range(0, len(input_data), 4): for i in range(0, len(input_data), 4):
if input_data[i + 3].strip() != "": if input_data[i + 3].strip() != "":
print("asmbench output not in the correct format! Format must be: ", file=sys.stderr) print(
"asmbench output not in the correct format! Format must be: ",
file=sys.stderr,
)
print( print(
"-------------\nMNEMONIC[-OP1[_OP2][...]]\nLatency: X cycles\n" "-------------\nMNEMONIC[-OP1[_OP2][...]]\nLatency: X cycles\n"
"Throughput: Y cycles\n\n-------------", "Throughput: Y cycles\n\n-------------",
@@ -540,7 +543,16 @@ def _get_sanity_report(
def _get_sanity_report_verbose( def _get_sanity_report_verbose(
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa, bad_operands, colors=False total,
m_tp,
m_l,
m_pp,
suspic_instr,
dup_arch,
dup_isa,
only_isa,
bad_operands,
colors=False,
): ):
"""Get the verbose part of the sanity report with all missing instruction forms.""" """Get the verbose part of the sanity report with all missing instruction forms."""
BRIGHT_CYAN = "\033[1;36;1m" if colors else "" BRIGHT_CYAN = "\033[1;36;1m" if colors else ""

View File

@@ -202,7 +202,12 @@ class Frontend(object):
) )
def combined_view( def combined_view(
self, kernel, cp_kernel: KernelDG, dep_dict, ignore_unknown=False, show_cmnts=True self,
kernel,
cp_kernel: KernelDG,
dep_dict,
ignore_unknown=False,
show_cmnts=True,
): ):
""" """
Build combined view of kernel including port pressure (TP), a CP column and a Build combined view of kernel including port pressure (TP), a CP column and a
@@ -238,8 +243,8 @@ class Frontend(object):
lcd_sum = 0.0 lcd_sum = 0.0
lcd_lines = {} lcd_lines = {}
if dep_dict: if dep_dict:
longest_lcd = max(dep_dict, key=lambda ln: dep_dict[ln]['latency']) longest_lcd = max(dep_dict, key=lambda ln: dep_dict[ln]["latency"])
lcd_sum = dep_dict[longest_lcd]['latency'] lcd_sum = dep_dict[longest_lcd]["latency"]
lcd_lines = { lcd_lines = {
instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"] instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"]
} }

View File

@@ -10,7 +10,13 @@ from functools import lru_cache
from osaca.db_interface import import_benchmark_output, sanity_check from osaca.db_interface import import_benchmark_output, sanity_check
from osaca.frontend import Frontend from osaca.frontend import Frontend
from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
from osaca.semantics import INSTR_FLAGS, ArchSemantics, KernelDG, MachineModel, reduce_to_section from osaca.semantics import (
INSTR_FLAGS,
ArchSemantics,
KernelDG,
MachineModel,
reduce_to_section,
)
SUPPORTED_ARCHS = [ SUPPORTED_ARCHS = [
@@ -37,7 +43,8 @@ DEFAULT_ARCHS = {
def __read(*names, **kwargs): def __read(*names, **kwargs):
"""Reads in file""" """Reads in file"""
with io.open( with io.open(
os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8") os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8"),
) as fp: ) as fp:
return fp.read() return fp.read()
@@ -79,7 +86,10 @@ def create_parser(parser=None):
# Add arguments # Add arguments
parser.add_argument( parser.add_argument(
"-V", "--version", action="version", version="%(prog)s " + __find_version("__init__.py") "-V",
"--version",
action="version",
version="%(prog)s " + __find_version("__init__.py"),
) )
parser.add_argument( parser.add_argument(
"--arch", "--arch",
@@ -167,7 +177,9 @@ def create_parser(parser=None):
help="Write analysis to this file (default to stdout).", help="Write analysis to this file (default to stdout).",
) )
parser.add_argument( parser.add_argument(
"file", type=argparse.FileType("r"), help="Path to object (ASM or instruction file)." "file",
type=argparse.FileType("r"),
help="Path to object (ASM or instruction file).",
) )
return parser return parser
@@ -347,7 +359,10 @@ def run(args, output_file=sys.stdout):
# Sanity check on DB # Sanity check on DB
verbose = True if args.verbose > 0 else False verbose = True if args.verbose > 0 else False
sanity_check( sanity_check(
args.arch, verbose=verbose, internet_check=args.internet_check, output_file=output_file args.arch,
verbose=verbose,
internet_check=args.internet_check,
output_file=output_file,
) )
elif "import_data" in args: elif "import_data" in args:
# Import microbench output file into DB # Import microbench output file into DB

View File

@@ -26,9 +26,9 @@ class ParserAArch64(BaseParser):
pp.ZeroOrMore(pp.Word(pp.printables)) pp.ZeroOrMore(pp.Word(pp.printables))
).setResultsName(self.COMMENT_ID) ).setResultsName(self.COMMENT_ID)
# Define ARM assembly identifier # Define ARM assembly identifier
decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)).setResultsName( decimal_number = pp.Combine(
"value" pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)
) ).setResultsName("value")
hex_number = pp.Combine(pp.Literal("0x") + pp.Word(pp.hexnums)).setResultsName("value") hex_number = pp.Combine(pp.Literal("0x") + pp.Word(pp.hexnums)).setResultsName("value")
relocation = pp.Combine(pp.Literal(":") + pp.Word(pp.alphanums + "_") + pp.Literal(":")) relocation = pp.Combine(pp.Literal(":") + pp.Word(pp.alphanums + "_") + pp.Literal(":"))
first = pp.Word(pp.alphas + "_.", exact=1) first = pp.Word(pp.alphas + "_.", exact=1)
@@ -152,7 +152,9 @@ class ParserAArch64(BaseParser):
pp.Literal("{") pp.Literal("{")
+ ( + (
pp.delimitedList(pp.Combine(self.list_element), delim=",").setResultsName("list") pp.delimitedList(pp.Combine(self.list_element), delim=",").setResultsName("list")
^ pp.delimitedList(pp.Combine(self.list_element), delim="-").setResultsName("range") ^ pp.delimitedList(pp.Combine(self.list_element), delim="-").setResultsName(
"range"
)
) )
+ pp.Literal("}") + pp.Literal("}")
+ pp.Optional(index) + pp.Optional(index)
@@ -256,9 +258,7 @@ class ParserAArch64(BaseParser):
# 2. Parse label # 2. Parse label
if result is None: if result is None:
try: try:
result = self.process_operand( result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
self.label.parseString(line, parseAll=True).asDict()
)
result = AttrDict.convert_dict(result) result = AttrDict.convert_dict(result)
instruction_form[self.LABEL_ID] = result[self.LABEL_ID].name instruction_form[self.LABEL_ID] = result[self.LABEL_ID].name
if self.COMMENT_ID in result[self.LABEL_ID]: if self.COMMENT_ID in result[self.LABEL_ID]:
@@ -293,7 +293,9 @@ class ParserAArch64(BaseParser):
try: try:
result = self.parse_instruction(line) result = self.parse_instruction(line)
except (pp.ParseException, KeyError) as e: except (pp.ParseException, KeyError) as e:
raise ValueError("Unable to parse {!r} on line {}".format(line, line_number)) from e raise ValueError(
"Unable to parse {!r} on line {}".format(line, line_number)
) from e
instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID] instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID] instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID] instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]
@@ -390,9 +392,9 @@ class ParserAArch64(BaseParser):
new_dict["pre_indexed"] = True new_dict["pre_indexed"] = True
if "post_indexed" in memory_address: if "post_indexed" in memory_address:
if "value" in memory_address["post_indexed"]: if "value" in memory_address["post_indexed"]:
new_dict["post_indexed"] = {"value": int( new_dict["post_indexed"] = {
memory_address["post_indexed"]["value"], 0 "value": int(memory_address["post_indexed"]["value"], 0)
)} }
else: else:
new_dict["post_indexed"] = memory_address["post_indexed"] new_dict["post_indexed"] = memory_address["post_indexed"]
return AttrDict({self.MEMORY_ID: new_dict}) return AttrDict({self.MEMORY_ID: new_dict})
@@ -408,27 +410,27 @@ class ParserAArch64(BaseParser):
Resolve range or list register operand to list of registers. Resolve range or list register operand to list of registers.
Returns None if neither list nor range Returns None if neither list nor range
""" """
if 'register' in operand: if "register" in operand:
if 'list' in operand.register: if "list" in operand.register:
index = operand.register.get('index') index = operand.register.get("index")
range_list = [] range_list = []
for reg in operand.register.list: for reg in operand.register.list:
reg = deepcopy(reg) reg = deepcopy(reg)
if index is not None: if index is not None:
reg['index'] = int(index, 0) reg["index"] = int(index, 0)
range_list.append(AttrDict({self.REGISTER_ID: reg})) range_list.append(AttrDict({self.REGISTER_ID: reg}))
return range_list return range_list
elif 'range' in operand.register: elif "range" in operand.register:
base_register = operand.register.range[0] base_register = operand.register.range[0]
index = operand.register.get('index') index = operand.register.get("index")
range_list = [] range_list = []
start_name = base_register.name start_name = base_register.name
end_name = operand.register.range[1].name end_name = operand.register.range[1].name
for name in range(int(start_name), int(end_name) + 1): for name in range(int(start_name), int(end_name) + 1):
reg = deepcopy(base_register) reg = deepcopy(base_register)
if index is not None: if index is not None:
reg['index'] = int(index, 0) reg["index"] = int(index, 0)
reg['name'] = str(name) reg["name"] = str(name)
range_list.append(AttrDict({self.REGISTER_ID: reg})) range_list.append(AttrDict({self.REGISTER_ID: reg}))
return range_list return range_list
# neither register list nor range, return unmodified # neither register list nor range, return unmodified
@@ -482,10 +484,12 @@ class ParserAArch64(BaseParser):
return AttrDict({self.IMMEDIATE_ID: immediate}) return AttrDict({self.IMMEDIATE_ID: immediate})
else: else:
# change 'mantissa' key to 'value' # change 'mantissa' key to 'value'
return AttrDict({ return AttrDict(
self.IMMEDIATE_ID: AttrDict({ {
"value": immediate[dict_name]["mantissa"], self.IMMEDIATE_ID: AttrDict(
"type": dict_name})} {"value": immediate[dict_name]["mantissa"], "type": dict_name}
)
}
) )
def process_label(self, label): def process_label(self, label):

View File

@@ -23,9 +23,9 @@ class ParserX86ATT(BaseParser):
def construct_parser(self): def construct_parser(self):
"""Create parser for ARM AArch64 ISA.""" """Create parser for ARM AArch64 ISA."""
decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)).setResultsName( decimal_number = pp.Combine(
"value" pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)
) ).setResultsName("value")
hex_number = pp.Combine( hex_number = pp.Combine(
pp.Optional(pp.Literal("-")) + pp.Literal("0x") + pp.Word(pp.hexnums) pp.Optional(pp.Literal("-")) + pp.Literal("0x") + pp.Word(pp.hexnums)
).setResultsName("value") ).setResultsName("value")
@@ -41,7 +41,8 @@ class ParserX86ATT(BaseParser):
identifier = pp.Group( identifier = pp.Group(
pp.Optional(id_offset).setResultsName("offset") pp.Optional(id_offset).setResultsName("offset")
+ pp.Combine( + pp.Combine(
pp.delimitedList(pp.Combine(first + pp.Optional(rest)), delim="::"), joinString="::" pp.delimitedList(pp.Combine(first + pp.Optional(rest)), delim="::"),
joinString="::",
).setResultsName("name") ).setResultsName("name")
+ pp.Optional(relocation).setResultsName("relocation") + pp.Optional(relocation).setResultsName("relocation")
).setResultsName("identifier") ).setResultsName("identifier")
@@ -443,7 +444,12 @@ class ParserX86ATT(BaseParser):
"""Check if register is a vector register""" """Check if register is a vector register"""
if register is None: if register is None:
return False return False
if register["name"].rstrip(string.digits).lower() in ["mm", "xmm", "ymm", "zmm"]: if register["name"].rstrip(string.digits).lower() in [
"mm",
"xmm",
"ymm",
"zmm",
]:
return True return True
return False return False

View File

@@ -47,7 +47,9 @@ class ArchSemantics(ISASemantics):
indices = [port_list.index(p) for p in ports] indices = [port_list.index(p) for p in ports]
# check if port sum of used ports for uop are unbalanced # check if port sum of used ports for uop are unbalanced
port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel))) port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
instr_ports = self._to_list(itemgetter(*indices)(instruction_form["port_pressure"])) instr_ports = self._to_list(
itemgetter(*indices)(instruction_form["port_pressure"])
)
if len(set(port_sums)) > 1: if len(set(port_sums)) > 1:
# balance ports # balance ports
# init list for keeping track of the current change # init list for keeping track of the current change
@@ -270,7 +272,8 @@ class ArchSemantics(ISASemantics):
reg_type reg_type
] ]
st_data_port_pressure = [ st_data_port_pressure = [
pp * multiplier for pp in st_data_port_pressure] pp * multiplier for pp in st_data_port_pressure
]
data_port_pressure = [ data_port_pressure = [
sum(x) for x in zip(data_port_pressure, st_data_port_pressure) sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
] ]
@@ -343,7 +346,9 @@ class ArchSemantics(ISASemantics):
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags): def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
"""Apply performance data to instruction if it was found in the archDB""" """Apply performance data to instruction if it was found in the archDB"""
throughput = instruction_data["throughput"] throughput = instruction_data["throughput"]
port_pressure = self._machine_model.average_port_pressure(instruction_data["port_pressure"]) port_pressure = self._machine_model.average_port_pressure(
instruction_data["port_pressure"]
)
instruction_form["port_uops"] = instruction_data["port_pressure"] instruction_form["port_uops"] = instruction_data["port_pressure"]
try: try:
assert isinstance(port_pressure, list) assert isinstance(port_pressure, list)

View File

@@ -1,20 +1,19 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import hashlib
import os import os
import pickle import pickle
import re import re
import string import string
from collections import defaultdict
from copy import deepcopy from copy import deepcopy
from itertools import product from itertools import product
import hashlib
from pathlib import Path from pathlib import Path
from collections import defaultdict
import ruamel.yaml import ruamel.yaml
from ruamel.yaml.compat import StringIO
from osaca import __version__, utils from osaca import __version__, utils
from osaca.parser import ParserX86ATT from osaca.parser import ParserX86ATT
from ruamel.yaml.compat import StringIO
class MachineModel(object): class MachineModel(object):
@@ -37,7 +36,13 @@ class MachineModel(object):
"hidden_loads": None, "hidden_loads": None,
"load_latency": {}, "load_latency": {},
"load_throughput": [ "load_throughput": [
{"base": b, "index": i, "offset": o, "scale": s, "port_pressure": []} {
"base": b,
"index": i,
"offset": o,
"scale": s,
"port_pressure": [],
}
for b, i, o, s in product(["gpr"], ["gpr", None], ["imd", None], [1, 8]) for b, i, o, s in product(["gpr"], ["gpr", None], ["imd", None], [1, 8])
], ],
"load_throughput_default": [], "load_throughput_default": [],
@@ -128,7 +133,8 @@ class MachineModel(object):
instruction_form instruction_form
for instruction_form in name_matched_iforms for instruction_form in name_matched_iforms
if self._match_operands( if self._match_operands(
instruction_form["operands"] if "operands" in instruction_form else [], operands instruction_form["operands"] if "operands" in instruction_form else [],
operands,
) )
) )
except StopIteration: except StopIteration:
@@ -150,7 +156,13 @@ class MachineModel(object):
return average_pressure return average_pressure
def set_instruction( def set_instruction(
self, name, operands=None, latency=None, port_pressure=None, throughput=None, uops=None self,
name,
operands=None,
latency=None,
port_pressure=None,
throughput=None,
uops=None,
): ):
"""Import instruction form information.""" """Import instruction form information."""
# If it already exists. Overwrite information. # If it already exists. Overwrite information.
@@ -500,7 +512,11 @@ class MachineModel(object):
"""Check if the types of operand ``i_operand`` and ``operand`` match.""" """Check if the types of operand ``i_operand`` and ``operand`` match."""
# check for wildcard # check for wildcard
if self.WILDCARD in operand: if self.WILDCARD in operand:
if "class" in i_operand and i_operand["class"] == "register" or "register" in i_operand: if (
"class" in i_operand
and i_operand["class"] == "register"
or "register" in i_operand
):
return True return True
else: else:
return False return False
@@ -527,20 +543,27 @@ class MachineModel(object):
return self._is_AArch64_mem_type(i_operand, operand["memory"]) return self._is_AArch64_mem_type(i_operand, operand["memory"])
# immediate # immediate
if i_operand["class"] == "immediate" and i_operand["imd"] == self.WILDCARD: if i_operand["class"] == "immediate" and i_operand["imd"] == self.WILDCARD:
return "value" in operand or \ return "value" in operand or (
("immediate" in operand and "value" in operand["immediate"]) "immediate" in operand and "value" in operand["immediate"]
)
if i_operand["class"] == "immediate" and i_operand["imd"] == "int": if i_operand["class"] == "immediate" and i_operand["imd"] == "int":
return ("value" in operand and operand.get("type", None) == "int") or \ return ("value" in operand and operand.get("type", None) == "int") or (
("immediate" in operand and "value" in operand["immediate"] and "immediate" in operand
operand["immediate"].get("type", None) == "int") and "value" in operand["immediate"]
and operand["immediate"].get("type", None) == "int"
)
if i_operand["class"] == "immediate" and i_operand["imd"] == "float": if i_operand["class"] == "immediate" and i_operand["imd"] == "float":
return ("float" in operand and operand.get("type", None) == "float") or \ return ("float" in operand and operand.get("type", None) == "float") or (
("immediate" in operand and "float" in operand["immediate"] and "immediate" in operand
operand["immediate"].get("type", None) == "float") and "float" in operand["immediate"]
and operand["immediate"].get("type", None) == "float"
)
if i_operand["class"] == "immediate" and i_operand["imd"] == "double": if i_operand["class"] == "immediate" and i_operand["imd"] == "double":
return ("double" in operand and operand.get("type", None) == "double") or \ return ("double" in operand and operand.get("type", None) == "double") or (
("immediate" in operand and "double" in operand["immediate"] and "immediate" in operand
operand["immediate"].get("type", None) == "double") and "double" in operand["immediate"]
and operand["immediate"].get("type", None) == "double"
)
# identifier # identifier
if "identifier" in operand or ( if "identifier" in operand or (
"immediate" in operand and "identifier" in operand["immediate"] "immediate" in operand and "identifier" in operand["immediate"]
@@ -577,7 +600,10 @@ class MachineModel(object):
def _compare_db_entries(self, operand_1, operand_2): def _compare_db_entries(self, operand_1, operand_2):
"""Check if operand types in DB format (i.e., not parsed) match.""" """Check if operand types in DB format (i.e., not parsed) match."""
operand_attributes = list( operand_attributes = list(
filter(lambda x: True if x != "source" and x != "destination" else False, operand_1) filter(
lambda x: True if x != "source" and x != "destination" else False,
operand_1,
)
) )
for key in operand_attributes: for key in operand_attributes:
try: try:

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from itertools import chain from itertools import chain
from copy import deepcopy
from osaca import utils from osaca import utils
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
@@ -100,40 +99,51 @@ class ISASemantics(object):
# post-process pre- and post-indexing for aarch64 memory operands # post-process pre- and post-indexing for aarch64 memory operands
if self._isa == "aarch64": if self._isa == "aarch64":
for operand in [op for op in op_dict["source"] if "memory" in op]: for operand in [op for op in op_dict["source"] if "memory" in op]:
post_indexed = ("post_indexed" in operand["memory"] and post_indexed = (
operand["memory"]["post_indexed"]) "post_indexed" in operand["memory"] and operand["memory"]["post_indexed"]
pre_indexed = ("pre_indexed" in operand["memory"] and )
operand["memory"]["pre_indexed"]) pre_indexed = (
"pre_indexed" in operand["memory"] and operand["memory"]["pre_indexed"]
)
if post_indexed or pre_indexed: if post_indexed or pre_indexed:
op_dict["src_dst"].append( op_dict["src_dst"].append(
AttrDict.convert_dict({ AttrDict.convert_dict(
"register": operand["memory"]["base"], {
"pre_indexed": pre_indexed, "register": operand["memory"]["base"],
"post_indexed": post_indexed}) "pre_indexed": pre_indexed,
"post_indexed": post_indexed,
}
)
) )
for operand in [op for op in op_dict["destination"] if "memory" in op]: for operand in [op for op in op_dict["destination"] if "memory" in op]:
post_indexed = ("post_indexed" in operand["memory"] and post_indexed = (
operand["memory"]["post_indexed"]) "post_indexed" in operand["memory"] and operand["memory"]["post_indexed"]
pre_indexed = ("pre_indexed" in operand["memory"] and )
operand["memory"]["pre_indexed"]) pre_indexed = (
"pre_indexed" in operand["memory"] and operand["memory"]["pre_indexed"]
)
if post_indexed or pre_indexed: if post_indexed or pre_indexed:
op_dict["src_dst"].append( op_dict["src_dst"].append(
AttrDict.convert_dict({ AttrDict.convert_dict(
"register": operand["memory"]["base"], {
"pre_indexed": pre_indexed, "register": operand["memory"]["base"],
"post_indexed": post_indexed}) "pre_indexed": pre_indexed,
"post_indexed": post_indexed,
}
)
) )
# store operand list in dict and reassign operand key/value pair # store operand list in dict and reassign operand key/value pair
instruction_form["semantic_operands"] = AttrDict.convert_dict(op_dict) instruction_form["semantic_operands"] = AttrDict.convert_dict(op_dict)
# assign LD/ST flags # assign LD/ST flags
instruction_form["flags"] = instruction_form["flags"] if "flags" in instruction_form else [] instruction_form["flags"] = (
instruction_form["flags"] if "flags" in instruction_form else []
)
if self._has_load(instruction_form): if self._has_load(instruction_form):
instruction_form["flags"] += [INSTR_FLAGS.HAS_LD] instruction_form["flags"] += [INSTR_FLAGS.HAS_LD]
if self._has_store(instruction_form): if self._has_store(instruction_form):
instruction_form["flags"] += [INSTR_FLAGS.HAS_ST] instruction_form["flags"] += [INSTR_FLAGS.HAS_ST]
def get_reg_changes(self, instruction_form, only_postindexed=False): def get_reg_changes(self, instruction_form, only_postindexed=False):
""" """
Returns register changes, as dict, for insruction_form, based on operation defined in isa. Returns register changes, as dict, for insruction_form, based on operation defined in isa.
@@ -141,12 +151,16 @@ class ISASemantics(object):
Empty dict if no changes of registers occured. None for registers with unknown changes. Empty dict if no changes of registers occured. None for registers with unknown changes.
If only_postindexed is True, only considers changes due to post_indexed memory references. If only_postindexed is True, only considers changes due to post_indexed memory references.
""" """
if instruction_form.get('instruction') is None: if instruction_form.get("instruction") is None:
return {} return {}
dest_reg_names = [op.register.get('prefix', '') + op.register.name dest_reg_names = [
for op in chain(instruction_form.semantic_operands.destination, op.register.get("prefix", "") + op.register.name
instruction_form.semantic_operands.src_dst) for op in chain(
if 'register' in op] instruction_form.semantic_operands.destination,
instruction_form.semantic_operands.src_dst,
)
if "register" in op
]
isa_data = self._isa_model.get_instruction( isa_data = self._isa_model.get_instruction(
instruction_form["instruction"], instruction_form["operands"] instruction_form["instruction"], instruction_form["operands"]
) )
@@ -162,50 +176,50 @@ class ISASemantics(object):
if only_postindexed: if only_postindexed:
for o in instruction_form.operands: for o in instruction_form.operands:
if 'post_indexed' in o.get('memory', {}): if "post_indexed" in o.get("memory", {}):
base_name = o.memory.base.get('prefix', '') + o.memory.base.name base_name = o.memory.base.get("prefix", "") + o.memory.base.name
return {base_name: { return {
'name': o.memory.base.get('prefix', '') + o.memory.base.name, base_name: {
'value': o.memory.post_indexed.value "name": o.memory.base.get("prefix", "") + o.memory.base.name,
}} "value": o.memory.post_indexed.value,
}
}
return {} return {}
reg_operand_names = {} # e.g., {'rax': 'op1'} reg_operand_names = {} # e.g., {'rax': 'op1'}
operand_state = {} # e.g., {'op1': {'name': 'rax', 'value': 0}} 0 means unchanged operand_state = {} # e.g., {'op1': {'name': 'rax', 'value': 0}} 0 means unchanged
for o in instruction_form.operands: for o in instruction_form.operands:
if 'pre_indexed' in o.get('memory', {}): if "pre_indexed" in o.get("memory", {}):
# Assuming no isa_data.operation # Assuming no isa_data.operation
if isa_data.get("operation", None) is not None: if isa_data.get("operation", None) is not None:
raise ValueError( raise ValueError(
"ISA information for pre-indexed instruction {!r} has operation set." "ISA information for pre-indexed instruction {!r} has operation set."
"This is currently not supprted.".format(instruction_form.line)) "This is currently not supprted.".format(instruction_form.line)
base_name = o.memory.base.get('prefix', '') + o.memory.base.name )
reg_operand_names = {base_name: 'op1'} base_name = o.memory.base.get("prefix", "") + o.memory.base.name
operand_state = {'op1': { reg_operand_names = {base_name: "op1"}
'name': base_name, operand_state = {"op1": {"name": base_name, "value": o.memory.offset.value}}
'value': o.memory.offset.value
}}
if isa_data is not None and 'operation' in isa_data: if isa_data is not None and "operation" in isa_data:
for i, o in enumerate(instruction_form.operands): for i, o in enumerate(instruction_form.operands):
operand_name = "op{}".format(i + 1) operand_name = "op{}".format(i + 1)
if "register" in o: if "register" in o:
o_reg_name = o["register"].get('prefix', '') + o["register"]["name"] o_reg_name = o["register"].get("prefix", "") + o["register"]["name"]
reg_operand_names[o_reg_name] = operand_name reg_operand_names[o_reg_name] = operand_name
operand_state[operand_name] = { operand_state[operand_name] = {"name": o_reg_name, "value": 0}
'name': o_reg_name,
'value': 0}
elif "immediate" in o: elif "immediate" in o:
operand_state[operand_name] = {'value': o["immediate"]["value"]} operand_state[operand_name] = {"value": o["immediate"]["value"]}
elif "memory" in o: elif "memory" in o:
# TODO lea needs some thinking about # TODO lea needs some thinking about
pass pass
operand_changes = exec(isa_data['operation'], {}, operand_state) exec(isa_data["operation"], {}, operand_state)
change_dict = {reg_name: operand_state.get(reg_operand_names.get(reg_name)) change_dict = {
for reg_name in dest_reg_names} reg_name: operand_state.get(reg_operand_names.get(reg_name))
for reg_name in dest_reg_names
}
return change_dict return change_dict
def _apply_found_ISA_data(self, isa_data, operands): def _apply_found_ISA_data(self, isa_data, operands):
@@ -231,8 +245,10 @@ class ISASemantics(object):
if "hidden_operands" in isa_data: if "hidden_operands" in isa_data:
op_dict["destination"] += [ op_dict["destination"] += [
AttrDict.convert_dict( AttrDict.convert_dict(
{hop["class"]: {k: hop[k] for k in ["class", "source", "destination"]}}) {hop["class"]: {k: hop[k] for k in ["class", "source", "destination"]}}
for hop in isa_data["hidden_operands"]] )
for hop in isa_data["hidden_operands"]
]
return op_dict return op_dict
for i, op in enumerate(isa_data["operands"]): for i, op in enumerate(isa_data["operands"]):

View File

@@ -16,7 +16,12 @@ class KernelDG(nx.DiGraph):
INSTRUCTION_THRESHOLD = 50 INSTRUCTION_THRESHOLD = 50
def __init__( def __init__(
self, parsed_kernel, parser, hw_model: MachineModel, semantics: ArchSemantics, timeout=10 self,
parsed_kernel,
parser,
hw_model: MachineModel,
semantics: ArchSemantics,
timeout=10,
): ):
self.timed_out = False self.timed_out = False
self.kernel = parsed_kernel self.kernel = parsed_kernel
@@ -73,7 +78,7 @@ class KernelDG(nx.DiGraph):
else instruction_form["latency_wo_load"] else instruction_form["latency_wo_load"]
) )
if "storeload_dep" in dep_flags: if "storeload_dep" in dep_flags:
edge_weight += self.model.get('store_to_load_forward_latency', 0) edge_weight += self.model.get("store_to_load_forward_latency", 0)
dg.add_edge( dg.add_edge(
instruction_form["line_number"], instruction_form["line_number"],
dep["line_number"], dep["line_number"],
@@ -98,7 +103,7 @@ class KernelDG(nx.DiGraph):
tmp_kernel = [] + kernel tmp_kernel = [] + kernel
for orig_iform in kernel: for orig_iform in kernel:
temp_iform = copy.copy(orig_iform) temp_iform = copy.copy(orig_iform)
temp_iform['line_number'] += offset temp_iform["line_number"] += offset
tmp_kernel.append(temp_iform) tmp_kernel.append(temp_iform)
# get dependency graph # get dependency graph
dg = self.create_DG(tmp_kernel) dg = self.create_DG(tmp_kernel)
@@ -118,12 +123,15 @@ class KernelDG(nx.DiGraph):
with Manager() as manager: with Manager() as manager:
all_paths = manager.list() all_paths = manager.list()
processes = [ processes = [
Process(target=self._extend_path, args=(all_paths, instr_section, dg, offset)) Process(
target=self._extend_path,
args=(all_paths, instr_section, dg, offset),
)
for instr_section in instrs for instr_section in instrs
] ]
for p in processes: for p in processes:
p.start() p.start()
if (timeout == -1): if timeout == -1:
# no timeout # no timeout
for p in processes: for p in processes:
p.join() p.join()
@@ -162,7 +170,7 @@ class KernelDG(nx.DiGraph):
# extend path by edge bound latencies (e.g., store-to-load latency) # extend path by edge bound latencies (e.g., store-to-load latency)
lat_path = [] lat_path = []
for s, d in nx.utils.pairwise(path): for s, d in nx.utils.pairwise(path):
edge_lat = dg.edges[s, d]['latency'] edge_lat = dg.edges[s, d]["latency"]
# map source node back to original line numbers # map source node back to original line numbers
if s >= offset: if s >= offset:
s -= offset s -= offset
@@ -310,17 +318,17 @@ class KernelDG(nx.DiGraph):
if change is None or reg_state.get(reg, {}) is None: if change is None or reg_state.get(reg, {}) is None:
reg_state[reg] = None reg_state[reg] = None
else: else:
reg_state.setdefault(reg, {'name': reg, 'value': 0}) reg_state.setdefault(reg, {"name": reg, "value": 0})
if change['name'] != reg: if change["name"] != reg:
# renaming occured, ovrwrite value with up-to-now change of source register # renaming occured, ovrwrite value with up-to-now change of source register
reg_state[reg]['name'] = change['name'] reg_state[reg]["name"] = change["name"]
src_reg_state = reg_state.get(change['name'], {'value': 0}) src_reg_state = reg_state.get(change["name"], {"value": 0})
if src_reg_state is None: if src_reg_state is None:
# original register's state was changed beyond reconstruction # original register's state was changed beyond reconstruction
reg_state[reg] = None reg_state[reg] = None
continue continue
reg_state[reg]['value'] = src_reg_state['value'] reg_state[reg]["value"] = src_reg_state["value"]
reg_state[reg]['value'] += change['value'] reg_state[reg]["value"] += change["value"]
return reg_state return reg_state
def get_dependent_instruction_forms(self, instr_form=None, line_number=None): def get_dependent_instruction_forms(self, instr_form=None, line_number=None):
@@ -340,7 +348,8 @@ class KernelDG(nx.DiGraph):
if instruction_form.semantic_operands is None: if instruction_form.semantic_operands is None:
return is_read return is_read
for src in chain( for src in chain(
instruction_form.semantic_operands.source, instruction_form.semantic_operands.src_dst instruction_form.semantic_operands.source,
instruction_form.semantic_operands.src_dst,
): ):
if "register" in src: if "register" in src:
is_read = self.parser.is_reg_dependend_of(register, src.register) or is_read is_read = self.parser.is_reg_dependend_of(register, src.register) or is_read
@@ -372,7 +381,8 @@ class KernelDG(nx.DiGraph):
if instruction_form.semantic_operands is None: if instruction_form.semantic_operands is None:
return False return False
for src in chain( for src in chain(
instruction_form.semantic_operands.source, instruction_form.semantic_operands.src_dst instruction_form.semantic_operands.source,
instruction_form.semantic_operands.src_dst,
): ):
# Here we check for mem dependecies only # Here we check for mem dependecies only
if "memory" not in src: if "memory" not in src:
@@ -387,23 +397,23 @@ class KernelDG(nx.DiGraph):
addr_change -= mem.offset.value addr_change -= mem.offset.value
if mem.base and src.base: if mem.base and src.base:
base_change = register_changes.get( base_change = register_changes.get(
src.base.get('prefix', '') + src.base.name, src.base.get("prefix", "") + src.base.name,
{'name': src.base.get('prefix', '') + src.base.name, 'value': 0}, {"name": src.base.get("prefix", "") + src.base.name, "value": 0},
) )
if base_change is None: if base_change is None:
# Unknown change occurred # Unknown change occurred
continue continue
if mem.base.get('prefix', '') + mem.base['name'] != base_change['name']: if mem.base.get("prefix", "") + mem.base["name"] != base_change["name"]:
# base registers do not match # base registers do not match
continue continue
addr_change += base_change['value'] addr_change += base_change["value"]
elif mem.base or src.base: elif mem.base or src.base:
# base registers do not match # base registers do not match
continue continue
if mem.index and src.index: if mem.index and src.index:
index_change = register_changes.get( index_change = register_changes.get(
src.index.get('prefix', '') + src.index.name, src.index.get("prefix", "") + src.index.name,
{'name': src.index.get('prefix', '') + src.index.name, 'value': 0}, {"name": src.index.get("prefix", "") + src.index.name, "value": 0},
) )
if index_change is None: if index_change is None:
# Unknown change occurred # Unknown change occurred
@@ -411,10 +421,10 @@ class KernelDG(nx.DiGraph):
if mem.scale != src.scale: if mem.scale != src.scale:
# scale factors do not match # scale factors do not match
continue continue
if mem.index.get('prefix', '') + mem.index['name'] != index_change['name']: if mem.index.get("prefix", "") + mem.index["name"] != index_change["name"]:
# index registers do not match # index registers do not match
continue continue
addr_change += index_change['value'] * src.scale addr_change += index_change["value"] * src.scale
elif mem.index or src.index: elif mem.index or src.index:
# index registers do not match # index registers do not match
continue continue
@@ -443,7 +453,8 @@ class KernelDG(nx.DiGraph):
) )
# Check also for possible pre- or post-indexing in memory addresses # Check also for possible pre- or post-indexing in memory addresses
for src in chain( for src in chain(
instruction_form.semantic_operands.source, instruction_form.semantic_operands.src_dst instruction_form.semantic_operands.source,
instruction_form.semantic_operands.src_dst,
): ):
if "memory" in src: if "memory" in src:
if "pre_indexed" in src.memory or "post_indexed" in src.memory: if "pre_indexed" in src.memory or "post_indexed" in src.memory:

View File

@@ -1,7 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os.path import os.path
DATA_DIRS = [os.path.expanduser("~/.osaca/data"), os.path.join(os.path.dirname(__file__), "data")] DATA_DIRS = [
os.path.expanduser("~/.osaca/data"),
os.path.join(os.path.dirname(__file__), "data"),
]
CACHE_DIR = os.path.expanduser("~/.osaca/cache") CACHE_DIR = os.path.expanduser("~/.osaca/cache")

View File

@@ -18,7 +18,8 @@ here = os.path.abspath(os.path.dirname(__file__))
# Stolen from pip # Stolen from pip
def read(*names, **kwargs): def read(*names, **kwargs):
with io.open( with io.open(
os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8") os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8"),
) as fp: ) as fp:
return fp.read() return fp.read()
@@ -38,13 +39,20 @@ def _run_build_cache(dir):
# This is run inside the install staging directory (that had no .pyc files) # This is run inside the install staging directory (that had no .pyc files)
# We don't want to generate any. # We don't want to generate any.
# https://github.com/eliben/pycparser/pull/135 # https://github.com/eliben/pycparser/pull/135
check_call([sys.executable, "-B", "_build_cache.py"], cwd=os.path.join(dir, "osaca", "data")) check_call(
[sys.executable, "-B", "_build_cache.py"],
cwd=os.path.join(dir, "osaca", "data"),
)
class install(_install): class install(_install):
def run(self): def run(self):
_install.run(self) _install.run(self)
self.execute(_run_build_cache, (self.install_lib,), msg="Build ISA and architecture cache") self.execute(
_run_build_cache,
(self.install_lib,),
msg="Build ISA and architecture cache",
)
class sdist(_sdist): class sdist(_sdist):

View File

@@ -33,7 +33,13 @@ class TestCLI(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
osaca.check_arguments(args, parser) osaca.check_arguments(args, parser)
args = parser.parse_args( args = parser.parse_args(
["--arch", "csx", "--import", "WRONG_BENCH", self._find_file("gs", "csx", "gcc")] [
"--arch",
"csx",
"--import",
"WRONG_BENCH",
self._find_file("gs", "csx", "gcc"),
]
) )
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
osaca.check_arguments(args, parser) osaca.check_arguments(args, parser)
@@ -65,7 +71,13 @@ class TestCLI(unittest.TestCase):
def test_check_db(self): def test_check_db(self):
parser = osaca.create_parser(parser=ErrorRaisingArgumentParser()) parser = osaca.create_parser(parser=ErrorRaisingArgumentParser())
args = parser.parse_args( args = parser.parse_args(
["--arch", "tx2", "--db-check", "--verbose", self._find_test_file("triad_x86_iaca.s")] [
"--arch",
"tx2",
"--db-check",
"--verbose",
self._find_test_file("triad_x86_iaca.s"),
]
) )
output = StringIO() output = StringIO()
osaca.run(args, output_file=output) osaca.run(args, output_file=output)
@@ -134,7 +146,13 @@ class TestCLI(unittest.TestCase):
for c in comps[a]: for c in comps[a]:
with self.subTest(kernel=k, arch=a, comp=c): with self.subTest(kernel=k, arch=a, comp=c):
args = parser.parse_args( args = parser.parse_args(
["--arch", a, self._find_file(k, a, c), "--export-graph", "/dev/null"] [
"--arch",
a,
self._find_file(k, a, c),
"--export-graph",
"/dev/null",
]
) )
output = StringIO() output = StringIO()
osaca.run(args, output_file=output) osaca.run(args, output_file=output)
@@ -204,17 +222,13 @@ class TestCLI(unittest.TestCase):
) )
output = StringIO() output = StringIO()
osaca.run(args, output_file=output) osaca.run(args, output_file=output)
self.assertTrue( self.assertTrue(output.getvalue().count("WARNING: LCD analysis timed out") == 1)
output.getvalue().count("WARNING: LCD analysis timed out") == 1
)
args = parser.parse_args( args = parser.parse_args(
["--ignore-unknown", "--lcd-timeout", "-1", self._find_test_file(kernel)] ["--ignore-unknown", "--lcd-timeout", "-1", self._find_test_file(kernel)]
) )
output = StringIO() output = StringIO()
osaca.run(args, output_file=output) osaca.run(args, output_file=output)
self.assertTrue( self.assertTrue(output.getvalue().count("WARNING: LCD analysis timed out") == 0)
output.getvalue().count("WARNING: LCD analysis timed out") == 0
)
def test_lines_arg(self): def test_lines_arg(self):
# Run tests with --lines option # Run tests with --lines option
@@ -227,12 +241,24 @@ class TestCLI(unittest.TestCase):
args = [] args = []
args.append( args.append(
parser.parse_args( parser.parse_args(
["--lines", "146-154", "--arch", "csx", self._find_test_file(kernel_x86)] [
"--lines",
"146-154",
"--arch",
"csx",
self._find_test_file(kernel_x86),
]
) )
) )
args.append( args.append(
parser.parse_args( parser.parse_args(
["--lines", "146:154", "--arch", "csx", self._find_test_file(kernel_x86)] [
"--lines",
"146:154",
"--arch",
"csx",
self._find_test_file(kernel_x86),
]
) )
) )
args.append( args.append(

View File

@@ -17,7 +17,13 @@ class TestDBInterface(unittest.TestCase):
sample_entry = { sample_entry = {
"name": "DoItRightAndDoItFast", "name": "DoItRightAndDoItFast",
"operands": [ "operands": [
{"class": "memory", "offset": "imd", "base": "gpr", "index": "gpr", "scale": 8}, {
"class": "memory",
"offset": "imd",
"base": "gpr",
"index": "gpr",
"scale": 8,
},
{"class": "register", "name": "xmm"}, {"class": "register", "name": "xmm"},
], ],
"throughput": 1.25, "throughput": 1.25,
@@ -35,7 +41,12 @@ class TestDBInterface(unittest.TestCase):
del self.entry_tx2["operands"][1]["name"] del self.entry_tx2["operands"][1]["name"]
self.entry_tx2["operands"][1]["prefix"] = "x" self.entry_tx2["operands"][1]["prefix"] = "x"
# self.entry_zen1['port_pressure'] = [1, 1, 1, 1, 0, 1, 0, 0, 0, 0.5, 1, 0.5, 1] # self.entry_zen1['port_pressure'] = [1, 1, 1, 1, 0, 1, 0, 0, 0, 0.5, 1, 0.5, 1]
self.entry_zen1["port_pressure"] = [[4, "0123"], [1, "4"], [1, "89"], [2, ["8D", "9D"]]] self.entry_zen1["port_pressure"] = [
[4, "0123"],
[1, "4"],
[1, "89"],
[2, ["8D", "9D"]],
]
########### ###########
# Tests # Tests

View File

@@ -1,15 +1,15 @@
# OSACA-BEGIN # OSACA-BEGIN
.L4: .L4:
vmovsd %xmm0, 8(%rax) vmovsd %xmm0, 8(%rax) # line 3 <----------------------------------+
addq $8, %rax addq $8, %rax # |
vmovsd %xmm0, 8(%rax,%rcx,8) vmovsd %xmm0, 8(%rax,%rcx,8) # line 5 <-----------------------------------------------+
vaddsd (%rax), %xmm0, %xmm0 # depends on line 3, 8(%rax) == (%rax+8) vaddsd (%rax), %xmm0, %xmm0 # depends on line 3, 8(%rax) == (%rax+8) ---+ |
subq $-8, %rax subq $-8, %rax # | |
vaddsd -8(%rax), %xmm0, %xmm0 # depends on line 3, 8(%rax) == -8(%rax+16) vaddsd -8(%rax), %xmm0, %xmm0 # depends on line 3, 8(%rax) == -8(%rax+16) ---+ |
dec %rcx dec %rcx # |
vaddsd 8(%rax,%rcx,8), %xmm0, %xmm0 # depends on line 5, 8(%rax,%rdx,8) == 8(%rax+8,%rdx-1,8) vaddsd 8(%rax,%rcx,8), %xmm0, %xmm0 # depends on line 5, 8(%rax,%rdx,8) == 8(%rax+8,%rdx-1,8) --+
movq %rcx, %rdx movq %rcx, %rdx # |
vaddsd 8(%rax,%rdx,8), %xmm0, %xmm0 # depends on line 5, 8(%rax,%rdx,8) == 8(%rax+8,%rdx-1,8) vaddsd 8(%rax,%rdx,8), %xmm0, %xmm0 # depends on line 5, 8(%rax,%rdx,8) == 8(%rax+8,%rdx-1,8) --+
vmulsd %xmm1, %xmm0, %xmm0 vmulsd %xmm1, %xmm0, %xmm0
addq $8, %rax addq $8, %rax
cmpq %rsi, %rax cmpq %rsi, %rax

View File

@@ -34,7 +34,8 @@ class TestFrontend(unittest.TestCase):
) )
self.machine_model_tx2 = MachineModel(arch="tx2") self.machine_model_tx2 = MachineModel(arch="tx2")
self.semantics_csx = ArchSemantics( self.semantics_csx = ArchSemantics(
self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, "isa/x86.yml") self.machine_model_csx,
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, "isa/x86.yml"),
) )
self.semantics_tx2 = ArchSemantics( self.semantics_tx2 = ArchSemantics(
self.machine_model_tx2, self.machine_model_tx2,
@@ -71,7 +72,11 @@ class TestFrontend(unittest.TestCase):
def test_frontend_AArch64(self): def test_frontend_AArch64(self):
dg = KernelDG( dg = KernelDG(
self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2, self.semantics_tx2) self.kernel_AArch64,
self.parser_AArch64,
self.machine_model_tx2,
self.semantics_tx2,
)
fe = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, "tx2.yml")) fe = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, "tx2.yml"))
fe.full_analysis(self.kernel_AArch64, dg, verbose=True) fe.full_analysis(self.kernel_AArch64, dg, verbose=True)
# TODO compare output with checked string # TODO compare output with checked string

View File

@@ -109,7 +109,8 @@ class TestMarkerUtils(unittest.TestCase):
kernel_start = len( kernel_start = len(
list( list(
filter( filter(
None, (prologue + mov_start_var + bytes_var_1).split("\n") None,
(prologue + mov_start_var + bytes_var_1).split("\n"),
) )
) )
) )
@@ -142,7 +143,12 @@ class TestMarkerUtils(unittest.TestCase):
epilogue = ".LE9:\t\t#12.2\n" "call dummy\n" epilogue = ".LE9:\t\t#12.2\n" "call dummy\n"
kernel_length = len(list(filter(None, kernel.split("\n")))) kernel_length = len(list(filter(None, kernel.split("\n"))))
bytes_variations = [bytes_1_line, bytes_2_lines_1, bytes_2_lines_2, bytes_3_lines] bytes_variations = [
bytes_1_line,
bytes_2_lines_1,
bytes_2_lines_2,
bytes_3_lines,
]
mov_start_variations = [mov_start_1, mov_start_2] mov_start_variations = [mov_start_1, mov_start_2]
mov_end_variations = [mov_end_1, mov_end_2] mov_end_variations = [mov_end_1, mov_end_2]
# actual tests # actual tests
@@ -171,7 +177,8 @@ class TestMarkerUtils(unittest.TestCase):
kernel_start = len( kernel_start = len(
list( list(
filter( filter(
None, (prologue + mov_start_var + bytes_var_1).split("\n") None,
(prologue + mov_start_var + bytes_var_1).split("\n"),
) )
) )
) )

View File

@@ -24,7 +24,9 @@ class TestParserAArch64(unittest.TestCase):
def test_comment_parser(self): def test_comment_parser(self):
self.assertEqual(self._get_comment(self.parser, "// some comments"), "some comments") self.assertEqual(self._get_comment(self.parser, "// some comments"), "some comments")
self.assertEqual(self._get_comment(self.parser, "\t\t//AA BB CC \t end \t"), "AA BB CC end") self.assertEqual(
self._get_comment(self.parser, "\t\t//AA BB CC \t end \t"), "AA BB CC end"
)
self.assertEqual( self.assertEqual(
self._get_comment(self.parser, "\t//// comment //// comment"), self._get_comment(self.parser, "\t//// comment //// comment"),
"// comment //// comment", "// comment //// comment",
@@ -36,7 +38,8 @@ class TestParserAArch64(unittest.TestCase):
self.assertEqual(self._get_label(self.parser, ".2.3_2_pack.3:").name, ".2.3_2_pack.3") self.assertEqual(self._get_label(self.parser, ".2.3_2_pack.3:").name, ".2.3_2_pack.3")
self.assertEqual(self._get_label(self.parser, ".L1:\t\t\t//label1").name, ".L1") self.assertEqual(self._get_label(self.parser, ".L1:\t\t\t//label1").name, ".L1")
self.assertEqual( self.assertEqual(
" ".join(self._get_label(self.parser, ".L1:\t\t\t//label1").comment), "label1" " ".join(self._get_label(self.parser, ".L1:\t\t\t//label1").comment),
"label1",
) )
with self.assertRaises(ParseException): with self.assertRaises(ParseException):
self._get_label(self.parser, "\t.cfi_startproc") self._get_label(self.parser, "\t.cfi_startproc")
@@ -316,7 +319,8 @@ class TestParserAArch64(unittest.TestCase):
value1 = self.parser.normalize_imd(imd_decimal_1) value1 = self.parser.normalize_imd(imd_decimal_1)
self.assertEqual(value1, self.parser.normalize_imd(imd_hex_1)) self.assertEqual(value1, self.parser.normalize_imd(imd_hex_1))
self.assertEqual( self.assertEqual(
self.parser.normalize_imd(imd_decimal_2), self.parser.normalize_imd(imd_hex_2) self.parser.normalize_imd(imd_decimal_2),
self.parser.normalize_imd(imd_hex_2),
) )
self.assertEqual(self.parser.normalize_imd(imd_float_11), value1) self.assertEqual(self.parser.normalize_imd(imd_float_11), value1)
self.assertEqual(self.parser.normalize_imd(imd_float_12), value1) self.assertEqual(self.parser.normalize_imd(imd_float_12), value1)

View File

@@ -26,7 +26,8 @@ class TestParserX86ATT(unittest.TestCase):
self.assertEqual(self._get_comment(self.parser, "# some comments"), "some comments") self.assertEqual(self._get_comment(self.parser, "# some comments"), "some comments")
self.assertEqual(self._get_comment(self.parser, "\t\t#AA BB CC \t end \t"), "AA BB CC end") self.assertEqual(self._get_comment(self.parser, "\t\t#AA BB CC \t end \t"), "AA BB CC end")
self.assertEqual( self.assertEqual(
self._get_comment(self.parser, "\t## comment ## comment"), "# comment ## comment" self._get_comment(self.parser, "\t## comment ## comment"),
"# comment ## comment",
) )
def test_label_parser(self): def test_label_parser(self):
@@ -35,7 +36,8 @@ class TestParserX86ATT(unittest.TestCase):
self.assertEqual(self._get_label(self.parser, ".2.3_2_pack.3:").name, ".2.3_2_pack.3") self.assertEqual(self._get_label(self.parser, ".2.3_2_pack.3:").name, ".2.3_2_pack.3")
self.assertEqual(self._get_label(self.parser, ".L1:\t\t\t#label1").name, ".L1") self.assertEqual(self._get_label(self.parser, ".L1:\t\t\t#label1").name, ".L1")
self.assertEqual( self.assertEqual(
" ".join(self._get_label(self.parser, ".L1:\t\t\t#label1").comment), "label1" " ".join(self._get_label(self.parser, ".L1:\t\t\t#label1").comment),
"label1",
) )
with self.assertRaises(ParseException): with self.assertRaises(ParseException):
self._get_label(self.parser, "\t.cfi_startproc") self._get_label(self.parser, "\t.cfi_startproc")
@@ -47,7 +49,8 @@ class TestParserX86ATT(unittest.TestCase):
self.assertEqual(len(self._get_directive(self.parser, "\t.align\t16,0x90").parameters), 2) self.assertEqual(len(self._get_directive(self.parser, "\t.align\t16,0x90").parameters), 2)
self.assertEqual(len(self._get_directive(self.parser, ".text").parameters), 0) self.assertEqual(len(self._get_directive(self.parser, ".text").parameters), 0)
self.assertEqual( self.assertEqual(
len(self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters), 2 len(self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters),
2,
) )
self.assertEqual( self.assertEqual(
self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters[1], self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters[1],
@@ -62,7 +65,12 @@ class TestParserX86ATT(unittest.TestCase):
self.parser, self.parser,
"\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support", "\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support",
).parameters, ).parameters,
["__TEXT", "__eh_frame", "coalesced", "no_toc+strip_static_syms+live_support"], [
"__TEXT",
"__eh_frame",
"coalesced",
"no_toc+strip_static_syms+live_support",
],
) )
self.assertEqual( self.assertEqual(
self._get_directive( self._get_directive(
@@ -74,7 +82,9 @@ class TestParserX86ATT(unittest.TestCase):
self._get_directive(self.parser, "\t.align\t16,0x90").parameters[1], "0x90" self._get_directive(self.parser, "\t.align\t16,0x90").parameters[1], "0x90"
) )
self.assertEqual( self.assertEqual(
self._get_directive(self.parser, " .byte 100,103,144 #IACA START")["name"], self._get_directive(self.parser, " .byte 100,103,144 #IACA START")[
"name"
],
"byte", "byte",
) )
self.assertEqual( self.assertEqual(
@@ -242,10 +252,12 @@ class TestParserX86ATT(unittest.TestCase):
imd_decimal_2 = {"value": "8"} imd_decimal_2 = {"value": "8"}
imd_hex_2 = {"value": "8"} imd_hex_2 = {"value": "8"}
self.assertEqual( self.assertEqual(
self.parser.normalize_imd(imd_decimal_1), self.parser.normalize_imd(imd_hex_1) self.parser.normalize_imd(imd_decimal_1),
self.parser.normalize_imd(imd_hex_1),
) )
self.assertEqual( self.assertEqual(
self.parser.normalize_imd(imd_decimal_2), self.parser.normalize_imd(imd_hex_2) self.parser.normalize_imd(imd_decimal_2),
self.parser.normalize_imd(imd_hex_2),
) )
def test_reg_dependency(self): def test_reg_dependency(self):

View File

@@ -11,8 +11,14 @@ from copy import deepcopy
import networkx as nx import networkx as nx
from osaca.osaca import get_unmatched_instruction_ratio from osaca.osaca import get_unmatched_instruction_ratio
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, ArchSemantics, ISASemantics, from osaca.semantics import (
KernelDG, MachineModel, reduce_to_section) INSTR_FLAGS,
ArchSemantics,
ISASemantics,
KernelDG,
MachineModel,
reduce_to_section,
)
class TestSemanticTools(unittest.TestCase): class TestSemanticTools(unittest.TestCase):
@@ -66,7 +72,8 @@ class TestSemanticTools(unittest.TestCase):
) )
cls.semantics_x86 = ISASemantics("x86") cls.semantics_x86 = ISASemantics("x86")
cls.semantics_csx = ArchSemantics( cls.semantics_csx = ArchSemantics(
cls.machine_model_csx, path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/x86.yml") cls.machine_model_csx,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/x86.yml"),
) )
cls.semantics_aarch64 = ISASemantics("aarch64") cls.semantics_aarch64 = ISASemantics("aarch64")
cls.semantics_tx2 = ArchSemantics( cls.semantics_tx2 = ArchSemantics(
@@ -173,7 +180,12 @@ class TestSemanticTools(unittest.TestCase):
) )
self.assertEqual( self.assertEqual(
test_mm_x86.get_store_throughput( test_mm_x86.get_store_throughput(
{"base": {"prefix": "NOT_IN_DB"}, "offset": None, "index": "NOT_NONE", "scale": 1} {
"base": {"prefix": "NOT_IN_DB"},
"offset": None,
"index": "NOT_NONE",
"scale": 1,
}
), ),
[[1, "23"], [1, "4"]], [[1, "23"], [1, "4"]],
) )
@@ -185,7 +197,12 @@ class TestSemanticTools(unittest.TestCase):
) )
self.assertEqual( self.assertEqual(
test_mm_arm.get_store_throughput( test_mm_arm.get_store_throughput(
{"base": {"prefix": "NOT_IN_DB"}, "offset": None, "index": None, "scale": 1} {
"base": {"prefix": "NOT_IN_DB"},
"offset": None,
"index": None,
"scale": 1,
}
), ),
[[1, "34"], [1, "5"]], [[1, "34"], [1, "5"]],
) )
@@ -310,7 +327,10 @@ class TestSemanticTools(unittest.TestCase):
def test_memdependency_x86(self): def test_memdependency_x86(self):
dg = KernelDG( dg = KernelDG(
self.kernel_x86_memdep, self.parser_x86, self.machine_model_csx, self.semantics_csx self.kernel_x86_memdep,
self.parser_x86,
self.machine_model_csx,
self.semantics_csx,
) )
self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg)) self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {6, 8}) self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {6, 8})
@@ -322,7 +342,10 @@ class TestSemanticTools(unittest.TestCase):
def test_kernelDG_AArch64(self): def test_kernelDG_AArch64(self):
dg = KernelDG( dg = KernelDG(
self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2, self.semantics_tx2 self.kernel_AArch64,
self.parser_AArch64,
self.machine_model_tx2,
self.semantics_tx2,
) )
self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg)) self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {7, 8}) self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {7, 8})
@@ -400,7 +423,7 @@ class TestSemanticTools(unittest.TestCase):
# based on line 6 # based on line 6
self.assertEqual(lc_deps[6]["latency"], 28.0) self.assertEqual(lc_deps[6]["latency"], 28.0)
self.assertEqual( self.assertEqual(
[(iform.line_number, lat) for iform, lat in lc_deps[6]['dependencies']], [(iform.line_number, lat) for iform, lat in lc_deps[6]["dependencies"]],
[(6, 4.0), (10, 6.0), (11, 6.0), (12, 6.0), (13, 6.0), (14, 0)], [(6, 4.0), (10, 6.0), (11, 6.0), (12, 6.0), (13, 6.0), (14, 0)],
) )
@@ -423,7 +446,8 @@ class TestSemanticTools(unittest.TestCase):
# w/o flag dependencies: ID 5 w/ len=1 # w/o flag dependencies: ID 5 w/ len=1
# TODO discuss # TODO discuss
self.assertEqual( self.assertEqual(
lc_deps[lcd_id2]["root"], dg.dg.nodes(data=True)[lcd_id2]["instruction_form"] lc_deps[lcd_id2]["root"],
dg.dg.nodes(data=True)[lcd_id2]["instruction_form"],
) )
self.assertEqual(len(lc_deps[lcd_id2]["dependencies"]), 1) self.assertEqual(len(lc_deps[lcd_id2]["dependencies"]), 1)
self.assertEqual( self.assertEqual(
@@ -438,7 +462,7 @@ class TestSemanticTools(unittest.TestCase):
self.parser_x86, self.parser_x86,
self.machine_model_csx, self.machine_model_csx,
self.semantics_x86, self.semantics_x86,
timeout=10 timeout=10,
) )
end_time = time.perf_counter() end_time = time.perf_counter()
time_10 = end_time - start_time time_10 = end_time - start_time
@@ -448,7 +472,7 @@ class TestSemanticTools(unittest.TestCase):
self.parser_x86, self.parser_x86,
self.machine_model_csx, self.machine_model_csx,
self.semantics_x86, self.semantics_x86,
timeout=2 timeout=2,
) )
end_time = time.perf_counter() end_time = time.perf_counter()
time_2 = end_time - start_time time_2 = end_time - start_time

View File

@@ -1,33 +1,26 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys
import os import os
import re
from subprocess import check_call, check_output, CalledProcessError, STDOUT
from itertools import chain
import shutil
from functools import lru_cache
from glob import glob
from pathlib import Path
from pprint import pprint
import socket
import pickle import pickle
import re
import shutil
import socket
import sys
from copy import deepcopy from copy import deepcopy
from glob import glob
from itertools import chain
from pathlib import Path
from subprocess import STDOUT, CalledProcessError, check_call, check_output
import requests import requests
import numpy as np
import pandas as pd
from osaca.osaca import reduce_to_section
from kerncraft.models import benchmark
from kerncraft.incore_model import ( from kerncraft.incore_model import (
parse_asm,
asm_instrumentation, asm_instrumentation,
iaca_analyse_instrumented_binary, iaca_analyse_instrumented_binary,
llvm_mca_analyse_instrumented_assembly,
osaca_analyse_instrumented_assembly, osaca_analyse_instrumented_assembly,
llvm_mca_analyse_instrumented_assembly parse_asm,
) )
from kerncraft.models import benchmark
from osaca.osaca import reduce_to_section
# Scaling of inner dimension for 1D, 2D and 3D kernels # Scaling of inner dimension for 1D, 2D and 3D kernels
# * consider kernels to be compiled with multiple compilers and different options # * consider kernels to be compiled with multiple compilers and different options
@@ -39,37 +32,50 @@ from kerncraft.incore_model import (
# Collect inner loop body assembly for each kernel/compiler/options combination # Collect inner loop body assembly for each kernel/compiler/options combination
# * analyze with OSACA, IACA and LLVM-MCA # * analyze with OSACA, IACA and LLVM-MCA
hosts_arch_map = {r"skylakesp2": "SKX", hosts_arch_map = {
r"ivyep1": "IVB", r"skylakesp2": "SKX",
r"naples1": "ZEN", r"ivyep1": "IVB",
r"rome1": "ZEN2", r"naples1": "ZEN",
r"warmup": "TX2", r"rome1": "ZEN2",
r"qp4-node-[0-9]+": "A64FX"} r"warmup": "TX2",
r"qp4-node-[0-9]+": "A64FX",
}
arch_info = { arch_info = {
'SKX': { "SKX": {
'prepare': ['likwid-setFrequencies -f 2.4 -t 0'.split()], "prepare": ["likwid-setFrequencies -f 2.4 -t 0".split()],
'IACA': 'SKX', "IACA": "SKX",
'OSACA': 'SKX', "OSACA": "SKX",
'LLVM-MCA': '-mcpu=skylake-avx512', "LLVM-MCA": "-mcpu=skylake-avx512",
'Ithemal': 'skl', "Ithemal": "skl",
'isa': 'x86', "isa": "x86",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
'icc': { "icc": {
"Ofast": "-Ofast -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline -ffreestanding -falign-loops".split(), "Ofast": (
"O3": "-O3 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline -ffreestanding -falign-loops".split(), "-Ofast -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline "
"O2": "-O2 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline -ffreestanding -falign-loops".split(), "-ffreestanding -falign-loops"
"O1": "-O1 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline -ffreestanding -falign-loops".split(), ).split(),
"O3": (
"-O3 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline "
"-ffreestanding -falign-loops"
).split(),
"O2": (
"-O2 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline "
"-ffreestanding -falign-loops"
).split(),
"O1": (
"-O1 -fno-alias -xCORE-AVX512 -qopt-zmm-usage=high -nolib-inline "
"-ffreestanding -falign-loops"
).split(),
}, },
'clang': { "clang": {
"Ofast": "-Ofast -march=skylake-avx512 -ffreestanding".split(), "Ofast": "-Ofast -march=skylake-avx512 -ffreestanding".split(),
"O3": "-O3 -march=skylake-avx512 -ffreestanding".split(), "O3": "-O3 -march=skylake-avx512 -ffreestanding".split(),
"O2": "-O2 -march=skylake-avx512 -ffreestanding".split(), "O2": "-O2 -march=skylake-avx512 -ffreestanding".split(),
"O1": "-O1 -march=skylake-avx512 -ffreestanding".split(), "O1": "-O1 -march=skylake-avx512 -ffreestanding".split(),
}, },
'gcc': { "gcc": {
"Ofast": "-Ofast -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(), "Ofast": "-Ofast -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(),
"O3": "-O3 -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(), "O3": "-O3 -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(),
"O2": "-O2 -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(), "O2": "-O2 -march=skylake-avx512 -lm -ffreestanding -falign-loops=16".split(),
@@ -77,17 +83,19 @@ arch_info = {
}, },
}, },
}, },
'IVB': { "IVB": {
'prepare': ['likwid-setFrequencies -f 3.0 -t 0'.split()], "prepare": ["likwid-setFrequencies -f 3.0 -t 0".split()],
'IACA': 'IVB', "IACA": "IVB",
'OSACA': 'IVB', "OSACA": "IVB",
'LLVM-MCA': '-mcpu=ivybridge', "LLVM-MCA": "-mcpu=ivybridge",
'Ithemal': 'ivb', "Ithemal": "ivb",
'isa': 'x86', "isa": "x86",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
"icc": { "icc": {
"Ofast": "-Ofast -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "Ofast": (
"-Ofast -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops"
).split(),
"O3": "-O3 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O3": "-O3 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O2": "-O2 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O2": "-O2 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O1": "-O1 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O1": "-O1 -xAVX -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
@@ -106,14 +114,14 @@ arch_info = {
}, },
}, },
}, },
'ZEN': { "ZEN": {
'prepare': ['likwid-setFrequencies -f 2.3 -t 0'.split()], "prepare": ["likwid-setFrequencies -f 2.3 -t 0".split()],
'IACA': None, "IACA": None,
'OSACA': 'ZEN1', "OSACA": "ZEN1",
'LLVM-MCA': '-mcpu=znver1', "LLVM-MCA": "-mcpu=znver1",
'Ithemal': None, "Ithemal": None,
'isa': 'x86', "isa": "x86",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
"clang": { "clang": {
"Ofast": "-Ofast -march=znver1 -ffreestanding".split(), "Ofast": "-Ofast -march=znver1 -ffreestanding".split(),
@@ -128,21 +136,23 @@ arch_info = {
"O1": "-O1 -march=znver1 -ffreestanding -falign-loops=16".split(), "O1": "-O1 -march=znver1 -ffreestanding -falign-loops=16".split(),
}, },
"icc": { "icc": {
"Ofast": "-Ofast -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "Ofast": (
"-Ofast -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops"
).split(),
"O3": "-O3 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O3": "-O3 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O2": "-O2 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O2": "-O2 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O1": "-O1 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O1": "-O1 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
}, },
}, },
}, },
'ZEN2': { "ZEN2": {
'prepare': ['likwid-setFrequencies -f 2.35 -t 0'.split()], "prepare": ["likwid-setFrequencies -f 2.35 -t 0".split()],
'IACA': None, "IACA": None,
'OSACA': 'ZEN2', "OSACA": "ZEN2",
'LLVM-MCA': '-mcpu=znver2', "LLVM-MCA": "-mcpu=znver2",
'Ithemal': None, "Ithemal": None,
'isa': 'x86', "isa": "x86",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
"clang": { "clang": {
"Ofast": "-Ofast -march=znver2 -ffreestanding".split(), "Ofast": "-Ofast -march=znver2 -ffreestanding".split(),
@@ -157,22 +167,24 @@ arch_info = {
"O1": "-O1 -march=znver2 -ffreestanding -falign-loops=16".split(), "O1": "-O1 -march=znver2 -ffreestanding -falign-loops=16".split(),
}, },
"icc": { "icc": {
"Ofast": "-Ofast -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "Ofast": (
"-Ofast -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops"
).split(),
"O3": "-O3 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O3": "-O3 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O2": "-O2 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O2": "-O2 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
"O1": "-O1 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(), "O1": "-O1 -xAVX2 -fno-alias -nolib-inline -ffreestanding -falign-loops".split(),
}, },
}, },
}, },
'TX2': { "TX2": {
'Clock [MHz]': 2200, # reading out via perf. counters is not supported "Clock [MHz]": 2200, # reading out via perf. counters is not supported
'IACA': None, "IACA": None,
'OSACA': 'TX2', "OSACA": "TX2",
'assign_optimal_throughput': True, "assign_optimal_throughput": True,
'LLVM-MCA': '-mcpu=thunderx2t99 -march=aarch64', "LLVM-MCA": "-mcpu=thunderx2t99 -march=aarch64",
'Ithemal': None, "Ithemal": None,
'isa': 'aarch64', "isa": "aarch64",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
"clang": { "clang": {
"Ofast": "-Ofast -target aarch64-unknown-linux-gnu -ffreestanding".split(), "Ofast": "-Ofast -target aarch64-unknown-linux-gnu -ffreestanding".split(),
@@ -188,16 +200,16 @@ arch_info = {
}, },
}, },
}, },
'A64FX': { "A64FX": {
'Clock [MHz]': 1800, # reading out via perf. counters is not supported "Clock [MHz]": 1800, # reading out via perf. counters is not supported
'L2_volume_metric': 'L1<->L2 data volume [GBytes]', "L2_volume_metric": "L1<->L2 data volume [GBytes]",
'IACA': None, "IACA": None,
'OSACA': 'A64FX', "OSACA": "A64FX",
'assign_optimal_throughput': False, "assign_optimal_throughput": False,
'LLVM-MCA': '-mcpu=a64fx -march=aarch64', "LLVM-MCA": "-mcpu=a64fx -march=aarch64",
'Ithemal': None, "Ithemal": None,
'isa': 'aarch64', "isa": "aarch64",
'perfevents': [], "perfevents": [],
"cflags": { "cflags": {
"gcc": { "gcc": {
"Ofast": "-Ofast -msve-vector-bits=512 -march=armv8.2-a+sve -ffreestanding".split(), "Ofast": "-Ofast -msve-vector-bits=512 -march=armv8.2-a+sve -ffreestanding".split(),
@@ -211,7 +223,7 @@ arch_info = {
"O2": "-O2 -target aarch64-unknown-linux-gnu -ffreestanding".split(), "O2": "-O2 -target aarch64-unknown-linux-gnu -ffreestanding".split(),
"O1": "-O1 -target aarch64-unknown-linux-gnu -ffreestanding".split(), "O1": "-O1 -target aarch64-unknown-linux-gnu -ffreestanding".split(),
}, },
} },
}, },
} }
@@ -231,12 +243,13 @@ def get_kernels(kernels=None):
if kernels is None: if kernels is None:
kernels = [] kernels = []
for f in glob("kernels/*.c"): for f in glob("kernels/*.c"):
f = f.rsplit('.', 1)[0].split('/', 1)[1] f = f.rsplit(".", 1)[0].split("/", 1)[1]
if f == "dummy": if f == "dummy":
continue continue
kernels.append(f) kernels.append(f)
return kernels return kernels
# Columns: # Columns:
# arch # arch
# kernel # kernel
@@ -259,6 +272,7 @@ def get_kernels(kernels=None):
# allruns [list (length, repetitions, cy/it, L2 B/it)] # allruns [list (length, repetitions, cy/it, L2 B/it)]
# perfevents [dict event: counter/it] # perfevents [dict event: counter/it]
def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mca=True): def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mca=True):
arch = get_current_arch() arch = get_current_arch()
if arch is None: if arch is None:
@@ -268,90 +282,132 @@ def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mc
islocal = True islocal = True
arches = [arch] arches = [arch]
ainfo = arch_info.get(arch) ainfo = arch_info.get(arch)
if 'prepare' in ainfo: if "prepare" in ainfo:
for cmd in ainfo['prepare']: for cmd in ainfo["prepare"]:
check_call(cmd) check_call(cmd)
for arch in arches: for arch in arches:
ainfo = arch_info.get(arch) ainfo = arch_info.get(arch)
print(arch) print(arch)
data_path = Path(f"build/{arch}/data.pkl") data_path = Path(f"build/{arch}/data.pkl")
if data_path.exists(): if data_path.exists():
with data_path.open('rb') as f: with data_path.open("rb") as f:
data = pickle.load(f) data = pickle.load(f)
else: else:
data = [] data = []
data_lastsaved = deepcopy(data) data_lastsaved = deepcopy(data)
for compiler, compiler_cflags in ainfo['cflags'].items(): for compiler, compiler_cflags in ainfo["cflags"].items():
if not shutil.which(compiler) and islocal: if not shutil.which(compiler) and islocal:
print(compiler, "not found in path! Skipping...") print(compiler, "not found in path! Skipping...")
continue continue
for cflags_name, cflags in compiler_cflags.items(): for cflags_name, cflags in compiler_cflags.items():
for kernel in get_kernels(): for kernel in get_kernels():
print(f"{kernel:<15} {arch:>5} {compiler:>5} {cflags_name:>6}", print(
end=": ", flush=True) f"{kernel:<15} {arch:>5} {compiler:>5} {cflags_name:>6}",
row = list([r for r in data end=": ",
if r['arch'] == arch and r['kernel'] == kernel and flush=True,
r['compiler'] == compiler and r['cflags_name'] == cflags_name]) )
row = list(
[
r
for r in data
if r["arch"] == arch
and r["kernel"] == kernel
and r["compiler"] == compiler
and r["cflags_name"] == cflags_name
]
)
if row: if row:
row = row[0] row = row[0]
else: else:
orig_row = None
row = { row = {
'arch': arch, "arch": arch,
'kernel': kernel, "kernel": kernel,
'compiler': compiler, "compiler": compiler,
'cflags_name': cflags_name, "cflags_name": cflags_name,
'element_size': 8, "element_size": 8,
} }
data.append(row) data.append(row)
# Build # Build
print("build", end="", flush=True) print("build", end="", flush=True)
asm_path, exec_path, overwrite = build_kernel( asm_path, exec_path, overwrite = build_kernel(
kernel, arch, compiler, cflags, cflags_name, dontbuild=not islocal) kernel,
arch,
compiler,
cflags,
cflags_name,
dontbuild=not islocal,
)
if overwrite: if overwrite:
# clear all measurment information # clear all measurment information
row['best_length'] = None row["best_length"] = None
row['best_runtime'] = None row["best_runtime"] = None
row['L2_traffic'] = None row["L2_traffic"] = None
row['allruns'] = None row["allruns"] = None
row['perfevents'] = None row["perfevents"] = None
# Mark for IACA, OSACA and LLVM-MCA # Mark for IACA, OSACA and LLVM-MCA
print("mark", end="", flush=True) print("mark", end="", flush=True)
try: try:
marked_asmfile, marked_objfile, row['pointer_increment'], overwrite = mark( (
asm_path, compiler, cflags, isa=ainfo['isa'], overwrite=overwrite) marked_asmfile,
row['marking_error'] = None marked_objfile,
row["pointer_increment"],
overwrite,
) = mark(
asm_path,
compiler,
cflags,
isa=ainfo["isa"],
overwrite=overwrite,
)
row["marking_error"] = None
except ValueError as e: except ValueError as e:
row['marking_error'] = str(e) row["marking_error"] = str(e)
print(":", e) print(":", e)
continue continue
if overwrite: if overwrite:
# clear all model generated information # clear all model generated information
for model in ['IACA', 'OSACA', 'LLVM-MCA', 'Ithemal']: for model in ["IACA", "OSACA", "LLVM-MCA", "Ithemal"]:
for k in ['ports', 'prediction', 'throughput', 'cp', 'lcd', 'raw']: for k in [
row[model+'_'+k] = None "ports",
"prediction",
"throughput",
"cp",
"lcd",
"raw",
]:
row[model + "_" + k] = None
for model in ['IACA', 'OSACA', 'LLVM-MCA', 'Ithemal']: for model in ["IACA", "OSACA", "LLVM-MCA", "Ithemal"]:
for k in ['ports', 'prediction', 'throughput', 'cp', 'lcd', 'raw']: for k in [
if model+'_'+k not in row: "ports",
row[model+'_'+k] = None "prediction",
"throughput",
"cp",
"lcd",
"raw",
]:
if model + "_" + k not in row:
row[model + "_" + k] = None
# Analyze with IACA, if requested and configured # Analyze with IACA, if requested and configured
if iaca and ainfo['IACA'] is not None: if iaca and ainfo["IACA"] is not None:
print("IACA", end="", flush=True) print("IACA", end="", flush=True)
if not row.get('IACA_ports'): if not row.get("IACA_ports"):
row['IACA_raw'] = iaca_analyse_instrumented_binary( row["IACA_raw"] = iaca_analyse_instrumented_binary(
marked_objfile, micro_architecture=ainfo['IACA']) marked_objfile, micro_architecture=ainfo["IACA"]
row['IACA_ports'] = \ )
{k: v/(row['pointer_increment']/row['element_size']) row["IACA_ports"] = {
for k,v in row['IACA_raw']['port cycles'].items()} k: v / (row["pointer_increment"] / row["element_size"])
row['IACA_prediction'] = row['IACA_raw']['throughput']/( for k, v in row["IACA_raw"]["port cycles"].items()
row['pointer_increment']/row['element_size']) }
row['IACA_throughput'] = max(row['IACA_ports'].values()) row["IACA_prediction"] = row["IACA_raw"]["throughput"] / (
row["pointer_increment"] / row["element_size"]
)
row["IACA_throughput"] = max(row["IACA_ports"].values())
print(". ", end="", flush=True) print(". ", end="", flush=True)
else: else:
print("! ", end="", flush=True) print("! ", end="", flush=True)
@@ -359,56 +415,70 @@ def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mc
# Analyze with OSACA, if requested # Analyze with OSACA, if requested
if osaca: if osaca:
print("OSACA", end="", flush=True) print("OSACA", end="", flush=True)
if not row.get('OSACA_ports'): if not row.get("OSACA_ports"):
row['OSACA_raw'] = osaca_analyse_instrumented_assembly( row["OSACA_raw"] = osaca_analyse_instrumented_assembly(
marked_asmfile, micro_architecture=ainfo['OSACA'], marked_asmfile,
assign_optimal_throughput=ainfo.get('assign_optimal_throughput', micro_architecture=ainfo["OSACA"],
True)) assign_optimal_throughput=ainfo.get(
row['OSACA_ports'] = \ "assign_optimal_throughput", True
{k: v/(row['pointer_increment']/row['element_size']) ),
for k,v in row['OSACA_raw']['port cycles'].items()} )
row['OSACA_prediction'] = row['OSACA_raw']['throughput']/( row["OSACA_ports"] = {
row['pointer_increment']/row['element_size']) k: v / (row["pointer_increment"] / row["element_size"])
row['OSACA_throughput'] = max(row['OSACA_ports'].values()) for k, v in row["OSACA_raw"]["port cycles"].items()
row['OSACA_cp'] = row['OSACA_raw']['cp_latency']/( }
row['pointer_increment']/row['element_size']) row["OSACA_prediction"] = row["OSACA_raw"]["throughput"] / (
row['OSACA_lcd'] = row['OSACA_raw']['lcd']/( row["pointer_increment"] / row["element_size"]
row['pointer_increment']/row['element_size']) )
row["OSACA_throughput"] = max(row["OSACA_ports"].values())
row["OSACA_cp"] = row["OSACA_raw"]["cp_latency"] / (
row["pointer_increment"] / row["element_size"]
)
row["OSACA_lcd"] = row["OSACA_raw"]["lcd"] / (
row["pointer_increment"] / row["element_size"]
)
print(". ", end="", flush=True) print(". ", end="", flush=True)
else: else:
print("! ", end="", flush=True) print("! ", end="", flush=True)
# Analyze with LLVM-MCA, if requested and configured # Analyze with LLVM-MCA, if requested and configured
if llvm_mca and ainfo['LLVM-MCA'] is not None: if llvm_mca and ainfo["LLVM-MCA"] is not None:
print("LLVM-MCA", end="", flush=True) print("LLVM-MCA", end="", flush=True)
if not row.get('LLVM-MCA_ports'): if not row.get("LLVM-MCA_ports"):
row['LLVM-MCA_raw'] = llvm_mca_analyse_instrumented_assembly( row["LLVM-MCA_raw"] = llvm_mca_analyse_instrumented_assembly(
marked_asmfile, marked_asmfile,
micro_architecture=ainfo['LLVM-MCA'], micro_architecture=ainfo["LLVM-MCA"],
isa=ainfo['isa']) isa=ainfo["isa"],
row['LLVM-MCA_ports'] = \ )
{k: v/(row['pointer_increment']/row['element_size']) row["LLVM-MCA_ports"] = {
for k,v in row['LLVM-MCA_raw']['port cycles'].items()} k: v / (row["pointer_increment"] / row["element_size"])
row['LLVM-MCA_prediction'] =row['LLVM-MCA_raw']['throughput']/( for k, v in row["LLVM-MCA_raw"]["port cycles"].items()
row['pointer_increment']/row['element_size']) }
row['LLVM-MCA_throughput'] = max(row['LLVM-MCA_ports'].values()) row["LLVM-MCA_prediction"] = row["LLVM-MCA_raw"]["throughput"] / (
row['LLVM-MCA_cp'] = row['LLVM-MCA_raw']['cp_latency']/( row["pointer_increment"] / row["element_size"]
row['pointer_increment']/row['element_size']) )
row['LLVM-MCA_lcd'] = row['LLVM-MCA_raw']['lcd']/( row["LLVM-MCA_throughput"] = max(row["LLVM-MCA_ports"].values())
row['pointer_increment']/row['element_size']) row["LLVM-MCA_cp"] = row["LLVM-MCA_raw"]["cp_latency"] / (
row["pointer_increment"] / row["element_size"]
)
row["LLVM-MCA_lcd"] = row["LLVM-MCA_raw"]["lcd"] / (
row["pointer_increment"] / row["element_size"]
)
print(". ", end="", flush=True) print(". ", end="", flush=True)
else: else:
print("! ", end="", flush=True) print("! ", end="", flush=True)
# Analyze with Ithemal, if not running local and configured # Analyze with Ithemal, if not running local and configured
if ainfo['Ithemal'] is not None and not islocal: if ainfo["Ithemal"] is not None and not islocal:
print("Ithemal", end="", flush=True) print("Ithemal", end="", flush=True)
if not row.get('Ithemal_prediction'): if not row.get("Ithemal_prediction"):
with open(marked_asmfile) as f: with open(marked_asmfile) as f:
parsed_code = parse_asm(f.read(), ainfo['isa']) parsed_code = parse_asm(f.read(), ainfo["isa"])
kernel = reduce_to_section(parsed_code, ainfo['isa']) kernel = reduce_to_section(parsed_code, ainfo["isa"])
row['Ithemal_prediction'] = get_ithemal_prediction( row["Ithemal_prediction"] = get_ithemal_prediction(
get_intel_style_code(marked_objfile), model=ainfo['Ithemal']) get_intel_style_code(marked_objfile),
model=ainfo["Ithemal"],
)
print(". ", end="", flush=True) print(". ", end="", flush=True)
else: else:
print("! ", end="", flush=True) print("! ", end="", flush=True)
@@ -416,43 +486,45 @@ def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mc
if measurements and islocal: if measurements and islocal:
# run measurements if on same hardware # run measurements if on same hardware
print("scale", end="", flush=True) print("scale", end="", flush=True)
if not row.get('allruns'): if not row.get("allruns"):
# find best length with concurrent L2 measurement # find best length with concurrent L2 measurement
scaling_runs, best = scalingrun(exec_path) scaling_runs, best = scalingrun(exec_path)
row['best_length'] = best[0] row["best_length"] = best[0]
row['best_runtime'] = best[2] row["best_runtime"] = best[2]
row['L2_traffic'] = best[3] row["L2_traffic"] = best[3]
row['allruns'] = scaling_runs row["allruns"] = scaling_runs
print(f"({best[0]}). ", end="", flush=True) print(f"({best[0]}). ", end="", flush=True)
else: else:
print(f"({row.get('best_length', None)})! ", end="", flush=True) print(
f"({row.get('best_length', None)})! ",
end="",
flush=True,
)
print() print()
# dump to file # dump to file
if data != data_lastsaved: if data != data_lastsaved:
print('saving... ', end="", flush=True) print("saving... ", end="", flush=True)
with data_path.open('wb') as f: with data_path.open("wb") as f:
try: try:
pickle.dump(data, f) pickle.dump(data, f)
data_lastsaved = deepcopy(data) data_lastsaved = deepcopy(data)
print('saved!') print("saved!")
except KeyboardInterrupt: except KeyboardInterrupt:
f.seek(0) f.seek(0)
pickle.dump(data, f) pickle.dump(data, f)
print('saved!') print("saved!")
sys.exit() sys.exit()
def scalingrun(kernel_exec, total_iterations=25000000, lengths=range(8, 1 * 1024 + 1)):
def scalingrun(kernel_exec, total_iterations=25000000, lengths=range(8, 1*1024+1)): # print('{:>8} {:>10} {:>10}'.format("x", "cy/it", "L2 B/it"))
#print('{:>8} {:>10} {:>10}'.format("x", "cy/it", "L2 B/it")) parameters = chain(*[[total_iterations // i, i] for i in lengths])
parameters = chain(*[[total_iterations//i, i] for i in lengths])
# TODO use arch specific events and grooup # TODO use arch specific events and grooup
r, o = perfctr(chain([kernel_exec], map(str, parameters)), r, o = perfctr(chain([kernel_exec], map(str, parameters)), 1, group="L2")
1, group="L2")
global_infos = {} global_infos = {}
for m in [re.match(r"(:?([a-z_\-0-9]+):)?([a-z]+): ([a-z\_\-0-9]+)", l) for l in o]: for m in [re.match(r"(:?([a-z_\-0-9]+):)?([a-z]+): ([a-z\_\-0-9]+)", line) for line in o]:
if m is not None: if m is not None:
try: try:
v = int(m.group(4)) v = int(m.group(4))
@@ -464,37 +536,45 @@ def scalingrun(kernel_exec, total_iterations=25000000, lengths=range(8, 1*1024+1
r[m.group(2)][m.group(3)] = v r[m.group(2)][m.group(3)] = v
results = [] results = []
best = (float('inf'), None) best = (float("inf"), None)
for markername, mmetrics in r.items(): for markername, mmetrics in r.items():
kernelname, repetitions, *_, xlength = markername.split('_') kernelname, repetitions, *_, xlength = markername.split("_")
repetitions = int(repetitions) repetitions = int(repetitions)
xlength = int(xlength) xlength = int(xlength)
total_iterations = mmetrics['repetitions'] * mmetrics['iterations'] total_iterations = mmetrics["repetitions"] * mmetrics["iterations"]
if 'Clock [MHz]' in mmetrics: if "Clock [MHz]" in mmetrics:
clock_hz = mmetrics['Clock [MHz]']*1e6 clock_hz = mmetrics["Clock [MHz]"] * 1e6
else: else:
clock_hz = arch_info[get_current_arch()]['Clock [MHz]']*1e6 clock_hz = arch_info[get_current_arch()]["Clock [MHz]"] * 1e6
cyperit = mmetrics['Runtime (RDTSC) [s]'] * clock_hz / total_iterations cyperit = mmetrics["Runtime (RDTSC) [s]"] * clock_hz / total_iterations
# TODO use arch specific events and grooup # TODO use arch specific events and grooup
if 'L2D load data volume [GBytes]' in mmetrics: if "L2D load data volume [GBytes]" in mmetrics:
l2perit = (mmetrics['L2D load data volume [GBytes]'] + l2perit = (
mmetrics.get('L2D evict data volume [GBytes]', 0))*1e9 / total_iterations (
mmetrics["L2D load data volume [GBytes]"]
+ mmetrics.get("L2D evict data volume [GBytes]", 0)
)
* 1e9
/ total_iterations
)
else: else:
l2perit = \ l2perit = (
mmetrics[arch_info[get_current_arch()]['L2_volume_metric']]*1e9 / total_iterations mmetrics[arch_info[get_current_arch()]["L2_volume_metric"]]
results.append( * 1e9
(xlength, repetitions, cyperit, l2perit) / total_iterations
) )
results.append((xlength, repetitions, cyperit, l2perit))
if cyperit < best[0]: if cyperit < best[0]:
best = cyperit, results[-1] best = cyperit, results[-1]
return results, best[1] return results, best[1]
def mark(asm_path, compiler, cflags, isa, overwrite=False): def mark(asm_path, compiler, cflags, isa, overwrite=False):
# Mark assembly for IACA, OSACA and LLVM-MCA # Mark assembly for IACA, OSACA and LLVM-MCA
marked_asm_path = Path(asm_path).with_suffix(".marked.s") marked_asm_path = Path(asm_path).with_suffix(".marked.s")
if not marked_asm_path.exists() or overwrite: if not marked_asm_path.exists() or overwrite:
overwrite = True overwrite = True
with open(asm_path) as fa, open(marked_asm_path, 'w') as fm: with open(asm_path) as fa, open(marked_asm_path, "w") as fm:
try: try:
_, pointer_increment = asm_instrumentation(fa, fm, isa=isa) _, pointer_increment = asm_instrumentation(fa, fm, isa=isa)
except KeyboardInterrupt: except KeyboardInterrupt:
@@ -505,37 +585,46 @@ def mark(asm_path, compiler, cflags, isa, overwrite=False):
# use maked assembly and extract asm_block and pointer_increment # use maked assembly and extract asm_block and pointer_increment
with open(marked_asm_path) as f: with open(marked_asm_path) as f:
marked_asm = f.read() marked_asm = f.read()
m = re.search(r'pointer_increment=([0-9]+)', marked_asm) m = re.search(r"pointer_increment=([0-9]+)", marked_asm)
if m: if m:
pointer_increment = int(m.group(1)) pointer_increment = int(m.group(1))
else: else:
os.unlink(marked_asm_path) os.unlink(marked_asm_path)
raise ValueError( raise ValueError(
"Could not find `pointer_increment=<byte increment>`. Plase place into file.") "Could not find `pointer_increment=<byte increment>`. Plase place into file."
)
print("! ", end="", flush=True) print("! ", end="", flush=True)
# Compile marked assembly to object for IACA # Compile marked assembly to object for IACA
marked_obj = Path(asm_path).with_suffix(".marked.o") marked_obj = Path(asm_path).with_suffix(".marked.o")
if not marked_obj.exists(): if not marked_obj.exists():
check_call([compiler] + ['-c', str(marked_asm_path), '-o', str(marked_obj)]) check_call([compiler] + ["-c", str(marked_asm_path), "-o", str(marked_obj)])
return str(marked_asm_path), str(marked_obj), pointer_increment, overwrite return str(marked_asm_path), str(marked_obj), pointer_increment, overwrite
def build_kernel(kernel, architecture, compiler, cflags, cflags_name, overwrite=False, def build_kernel(
dontbuild=False): kernel,
architecture,
compiler,
cflags,
cflags_name,
overwrite=False,
dontbuild=False,
):
build_path = f"build/{architecture}/{compiler}/{cflags_name}" build_path = f"build/{architecture}/{compiler}/{cflags_name}"
kernel_assembly = f"{build_path}/{kernel}.s" kernel_assembly = f"{build_path}/{kernel}.s"
kernel_object= f"{build_path}/{kernel}.o" kernel_object = f"{build_path}/{kernel}.o"
executable = f"{build_path}/{kernel}" executable = f"{build_path}/{kernel}"
Path(build_path).mkdir(parents=True, exist_ok=True) Path(build_path).mkdir(parents=True, exist_ok=True)
if not overwrite: if not overwrite:
# Overwrite if any kernel specific file is missing # Overwrite if any kernel specific file is missing
overwrite = ( overwrite = (
not os.path.exists(kernel_object) or not os.path.exists(kernel_object)
not os.path.exists(kernel_assembly) or or not os.path.exists(kernel_assembly)
not os.path.exists(executable)) or not os.path.exists(executable)
)
if dontbuild and overwrite: if dontbuild and overwrite:
raise ValueError("Must build, but not allowed.") raise ValueError("Must build, but not allowed.")
@@ -545,31 +634,35 @@ def build_kernel(kernel, architecture, compiler, cflags, cflags_name, overwrite=
if not Path(f"{build_path}/compiler_version").exists(): if not Path(f"{build_path}/compiler_version").exists():
# Document compiler version # Document compiler version
with open(f"{build_path}/compiler_version", 'w') as f: with open(f"{build_path}/compiler_version", "w") as f:
f.write(check_output([compiler, "-v"], encoding='utf8', stderr=STDOUT)) f.write(check_output([compiler, "-v"], encoding="utf8", stderr=STDOUT))
if overwrite: if overwrite:
# build object + assembly # build object + assembly
check_call([compiler] + check_call([compiler] + cflags + ["-c", f"kernels/{kernel}.c", "-o", kernel_object])
cflags + check_call(
["-c", f"kernels/{kernel}.c", "-o", kernel_object]) [compiler] + cflags + ["-c", f"kernels/{kernel}.c", "-S", "-o", kernel_assembly]
check_call([compiler] + )
cflags +
["-c", f"kernels/{kernel}.c", "-S", "-o", kernel_assembly])
# build main and link executable # build main and link executable
executable_cflags = [ executable_cflags = [
os.environ["LIKWID_DEFINES"], os.environ["LIKWID_DEFINES"],
os.environ["LIKWID_INC"], os.environ["LIKWID_INC"],
os.environ["LIKWID_LIB"] os.environ["LIKWID_LIB"],
] + ['-Ofast'] ] + ["-Ofast"]
check_call([compiler] + executable_cflags + [ check_call(
f"{build_path}/dummy.o", [compiler]
kernel_object, + executable_cflags
"-DMAIN", + [
f"kernels/{kernel}.c", f"{build_path}/dummy.o",
"-llikwid", kernel_object,
"-o", executable]) "-DMAIN",
f"kernels/{kernel}.c",
"-llikwid",
"-o",
executable,
]
)
print(". ", end="", flush=True) print(". ", end="", flush=True)
else: else:
print("! ", end="", flush=True) print("! ", end="", flush=True)
@@ -577,7 +670,7 @@ def build_kernel(kernel, architecture, compiler, cflags, cflags_name, overwrite=
return kernel_assembly, executable, overwrite return kernel_assembly, executable, overwrite
def perfctr(cmd, cores, group='MEM', code_markers=True, verbose=0): def perfctr(cmd, cores, group="MEM", code_markers=True, verbose=0):
""" """
Run *cmd* with likwid-perfctr and returns result as dict. Run *cmd* with likwid-perfctr and returns result as dict.
@@ -586,30 +679,32 @@ def perfctr(cmd, cores, group='MEM', code_markers=True, verbose=0):
if CLI argument cores > 1, running with multi-core, otherwise single-core if CLI argument cores > 1, running with multi-core, otherwise single-core
""" """
# Making sure likwid-perfctr is available: # Making sure likwid-perfctr is available:
if benchmark.find_executable('likwid-perfctr') is None: if benchmark.find_executable("likwid-perfctr") is None:
print("likwid-perfctr was not found. Make sure likwid is installed and found in PATH.", print(
file=sys.stderr) "likwid-perfctr was not found. Make sure likwid is installed and found in PATH.",
file=sys.stderr,
)
sys.exit(1) sys.exit(1)
# FIXME currently only single core measurements support! # FIXME currently only single core measurements support!
perf_cmd = ['likwid-perfctr', '-f', '-O', '-g', group] perf_cmd = ["likwid-perfctr", "-f", "-O", "-g", group]
cpu = 'S0:0' cpu = "S0:0"
if cores > 1: if cores > 1:
cpu += '-'+str(cores-1) cpu += "-" + str(cores - 1)
# Pinned and measured on cpu # Pinned and measured on cpu
perf_cmd += ['-C', cpu] perf_cmd += ["-C", cpu]
# code must be marked using likwid markers # code must be marked using likwid markers
perf_cmd.append('-m') perf_cmd.append("-m")
perf_cmd += cmd perf_cmd += cmd
if verbose > 1: if verbose > 1:
print(' '.join(perf_cmd)) print(" ".join(perf_cmd))
try: try:
with benchmark.fix_env_variable('OMP_NUM_THREADS', None): with benchmark.fix_env_variable("OMP_NUM_THREADS", None):
output = check_output(perf_cmd).decode('utf-8').split('\n') output = check_output(perf_cmd).decode("utf-8").split("\n")
except CalledProcessError as e: except CalledProcessError as e:
print("Executing benchmark failed: {!s}".format(e), file=sys.stderr) print("Executing benchmark failed: {!s}".format(e), file=sys.stderr)
sys.exit(1) sys.exit(1)
@@ -626,7 +721,7 @@ def perfctr(cmd, cores, group='MEM', code_markers=True, verbose=0):
m = re.match(r"TABLE,Region ([a-z\-0-9_]+),", line) m = re.match(r"TABLE,Region ([a-z\-0-9_]+),", line)
if m: if m:
cur_region_name = m.group(1) cur_region_name = m.group(1)
line = line.split(',') line = line.split(",")
try: try:
# Metrics # Metrics
cur_region_data[line[0]] = float(line[1]) cur_region_data[line[0]] = float(line[1])
@@ -639,12 +734,13 @@ def perfctr(cmd, cores, group='MEM', code_markers=True, verbose=0):
continue continue
try: try:
# Event counters # Event counters
if line[2] == '-' or line[2] == 'nan': if line[2] == "-" or line[2] == "nan":
counter_value = 0 counter_value = 0
else: else:
counter_value = int(line[2]) counter_value = int(line[2])
if re.fullmatch(r'[A-Z0-9_]+', line[0]) and \ if re.fullmatch(r"[A-Z0-9_]+", line[0]) and re.fullmatch(
re.fullmatch(r'[A-Z0-9]+(:[A-Z0-9]+=[0-9A-Fa-fx]+)*', line[1]): r"[A-Z0-9]+(:[A-Z0-9]+=[0-9A-Fa-fx]+)*", line[1]
):
cur_region_data.setdefault(line[0], {}) cur_region_data.setdefault(line[0], {})
cur_region_data[line[0]][line[1]] = counter_value cur_region_data[line[0]][line[1]] = counter_value
continue continue
@@ -659,49 +755,52 @@ def perfctr(cmd, cores, group='MEM', code_markers=True, verbose=0):
def remove_html_tags(text):
    """Return *text* with every HTML/XML tag (``<...>``) removed.

    Uses a non-greedy match, so ``<b>x</b>`` becomes ``x``.  This is a
    lightweight scrub for the Ithemal web reply, not a full HTML parser:
    a literal ``>`` inside an attribute value would end the tag early.
    """
    # Raw string for the regex pattern (idiomatic, avoids escape surprises).
    return re.sub(r"<.*?>", "", text)
def get_intel_style_code(marked_objfile):
    """Disassemble *marked_objfile* and return the Intel-syntax assembly of the
    marked kernel region.

    The kernel region is delimited by the IACA-style byte markers that the
    instrumentation inserts: ``mov ebx, 111; nop`` (start) and
    ``mov ebx, 222; nop`` (end).  Raises ``ValueError`` (from ``str.index``)
    if either marker is missing from the disassembly.

    NOTE(review): ``--no-leading-addr``/``--x86-asm-syntax=intel`` look like
    llvm-objdump options rather than GNU objdump's (``-M intel``) — confirm
    which ``objdump`` binary is expected on PATH.
    """
    # Disassemble with Intel syntax
    cmd = (
        "objdump -d --demangle --no-leading-addr --no-leading-headers --no-show-raw-insn "
        "--x86-asm-syntax=intel"
    ).split(" ") + [marked_objfile]
    asm_raw = check_output(cmd).decode()
    # Strip per-line indentation so the marker byte sequences match exactly.
    asm_raw = "\n".join([line.strip() for line in asm_raw.split("\n")])
    # Slice out everything between the end of the start marker and the
    # beginning of the end marker.
    kernel_raw = asm_raw[
        asm_raw.index("mov\tebx, 111\nnop")
        + len("mov\tebx, 111\nnop") : asm_raw.index("mov\tebx, 222\nnop")
    ]
    kernel_lines = kernel_raw.split("\n")
    # Ignore label and jump
    return "\n".join(kernel_lines[:-2])
def get_ithemal_prediction(code, model="skl"):
    """Query the public Ithemal web service for a throughput prediction.

    Posts *code* (Intel-syntax assembly) to the service using the given
    *model* ("skl", "hsw" or "ivb") and parses the HTML reply.  Returns the
    predicted cycles per iteration as a float, or NaN when the service
    reports an error or the reply cannot be parsed.
    """
    assert model in ["skl", "hsw", "ivb"]
    url = "http://3.18.198.23/predict"
    # NOTE(review): no timeout is set, so an unresponsive service blocks forever.
    response = requests.post(url, {"code": code, "model": model})
    plain = remove_html_tags(response.text)

    # Service-side failure: report it inline and signal "no prediction".
    error_match = re.search("Could not generate a prediction: (.*)", plain)
    if error_match:
        print(" error:", error_match.group(1).strip(), end=" ")
        return float("nan")

    prediction_match = re.search("Prediction: ([0-9.]+) cycles per iteration", plain)
    return float(prediction_match.group(1)) if prediction_match else float("nan")
def main():
    """Script entry point: probe for a compatible llvm-mca, then run the
    full build/mark/analyze/measure pipeline over all kernels."""
    # Only LLVM 12.0.0's llvm-mca is treated as usable; a missing binary
    # simply disables the LLVM-MCA analysis instead of failing.
    try:
        version_text = check_output(["llvm-mca", "-version"]).decode()
        llvm_mca = "LLVM version 12.0.0" in version_text
    except FileNotFoundError:
        llvm_mca = False
    run_measurements = "--no-measurements" not in sys.argv
    build_mark_run_all_kernels(measurements=run_measurements, llvm_mca=llvm_mca)
    sys.exit()
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()