Files
OSACA/osaca/data/model_importer.py
Julian 08440ed5e1 Validation (#71)
Validation of OSACA predictions for IVB, SKX, ZEN1, ZEN2, A64FX and TX2 with different kernels.

build_and_run.py contains the configuration used at RRZE's testcluster and UR's qpace4, Analysis.ipynb contains the analysis script and results. Raw data from measurements (122MB) will be attached to next OSACA release.

For now, find the raw data here: https://hawo.net/~sijuhamm/d/UPIhBOtz/validation-data.tar.gz

The analysis report can be viewed at https://nbviewer.jupyter.org/github/RRZE-HPC/OSACA/blob/validation/validation/Analysis.ipynb

Quite a few changes to OSACA are included:

Feature: register change tracking via semantic understanding of operations
Feature: recording LCD latency along path and exposing this to frontend
Feature: support for memory reference aliases
Feature: store throughput scaling (similar to load throughput scaling)
Fix: model importer works with latest uops.info export
Fix: immediate type tracking on ARM now preserves type in internal representation
Removed unused KerncraftAPI
2021-04-15 14:42:37 +02:00

300 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import os.path
import sys
import xml.etree.ElementTree as ET
from distutils.version import StrictVersion
from osaca.parser import get_parser
from osaca.semantics import MachineModel
# Intel microarchitecture abbreviations. extract_model() consults this list when
# deciding whether to add the 2D/3D ports to instructions with memory operands.
intel_archs = [
"CON",
"WOL",
"NHM",
"WSM",
"SNB",
"IVB",
"HSW",
"BDW",
"SKL",
"SKX",
"KBL",
"CFL",
"CNL",
"ICL",
]
# AMD microarchitecture abbreviations.
# NOTE(review): not referenced anywhere else in the visible part of this file.
amd_archs = ["ZEN1", "ZEN+", "ZEN2"]
def port_pressure_from_tag_attributes(attrib):
    """
    Convert the port attributes of an XML tag into a port occupation list.

    Example: ``'1*p015+1*p1+1*p23+1*p4+3*p5'`` is turned into
    ``[[1, '015'], [1, '1'], [1, '23'], [1, '4'], [3, '5']]``.

    :param attrib: attribute mapping holding a ``ports`` entry and optionally ``div_cycles``
    :returns: list of ``[cycles, ports]`` entries; if ``div_cycles`` is present, a final
              ``[div_cycles, ["DIV"]]`` entry is appended
    """
    cycle_port_pairs = (term.split("*") for term in attrib["ports"].split("+"))
    # Leading 'p' characters are removed first, afterwards any leading 'F'/'P' characters.
    occupation = [
        [int(cycle_count), port_names.lstrip("p").lstrip("FP")]
        for cycle_count, port_names in cycle_port_pairs
    ]
    # The divider is accounted for on its own DIV pipeline
    if "div_cycles" in attrib:
        occupation.append([int(attrib["div_cycles"]), ["DIV"]])
    return occupation
def extract_paramters(instruction_tag, parser, isa):
    """
    Build operand descriptions for all non-suppressed operands of an instruction tag.

    Operands are processed in ascending order of their ``idx`` attribute. Operands whose
    ``suppressed`` attribute is non-zero are ignored.

    :param instruction_tag: XML element with ``operand`` child elements
    :param parser: object providing ``parse_register`` and ``is_vector_register``
    :param isa: ISA name; ``"x86"`` and ``"aarch64"`` get special register handling
    :returns: list of dicts, one per considered operand, each with a ``class`` entry
    :raises ValueError: if an operand type is unknown or the first register of a register
                        operand cannot be parsed
    """
    parameters = []
    parameter_tags = sorted(instruction_tag.findall("operand"), key=lambda p: int(p.attrib["idx"]))
    for parameter_tag in parameter_tags:
        # Ignore parameters with suppressed=1
        if int(parameter_tag.attrib.get("suppressed", "0")):
            continue
        p_type = parameter_tag.attrib["type"]
        if p_type == "imm":
            parameter = {"class": "immediate", "imd": "int"}
        elif p_type in ("mem", "agen"):
            # Memory and address-generation operands are both stored as wildcard
            # memory references.
            parameter = {
                "class": "memory",
                "base": "*",
                "offset": "*",
                "index": "*",
                "scale": "*",
            }
        elif p_type == "reg":
            parameter = {"class": "register"}
            possible_regs = [parser.parse_register("%" + r) for r in parameter_tag.text.split(",")]
            if possible_regs[0] is None:
                raise ValueError(
                    "Unknown register type for {} with {}.".format(
                        parameter_tag.attrib, parameter_tag.text
                    )
                )
            # Only the first listed register determines the operand description
            register = possible_regs[0]["register"]
            if isa == "x86":
                if parser.is_vector_register(register):
                    # keep only the three-letter register family, e.g. 'xmm' or 'ymm'
                    register["name"] = register["name"].lower()[:3]
                    if "mask" in register:
                        register["mask"] = True
                else:
                    register["name"] = "gpr"
            elif isa == "aarch64":
                # The original indexed the list itself with "register", which raised a
                # TypeError for every aarch64 register operand.
                register.pop("name", None)
            parameter.update(register)
        elif p_type == "relbr":
            parameter = {"class": "identifier"}
        else:
            raise ValueError("Unknown paramter type {}".format(parameter_tag.attrib))
        parameters.append(parameter)
    return parameters
def _calculated_tp_exceeds_measured(measurement_tag):
    """Return True if ``TP_ports`` exceeds the smaller of ``TP_loop``/``TP_unrolled``."""
    attrib = measurement_tag.attrib
    if "TP_ports" not in attrib:
        return False
    measured = [float(attrib[key]) for key in ("TP_loop", "TP_unrolled") if key in attrib]
    # Without any measured throughput there is nothing to compare against
    return bool(measured) and float(attrib["TP_ports"]) > min(measured)


def extract_model(tree, arch, skip_mem=True):
    """
    Create a machine model for one architecture from an instruction XML tree.

    For every ``instruction`` element, the port occupation, latency and uop count are
    collected from the ``measurement`` and ``IACA`` tags of the requested architecture.
    The stored throughput is always derived from the final port occupation via
    ``MachineModel.average_port_pressure``.

    Instructions are skipped if their operands cannot be translated, if there is no data
    for ``arch``, if no measurement carries port information, or (with ``skip_mem``) if
    they have a ``mem`` operand.

    :param tree: XML tree (or element) containing ``instruction`` elements
    :param arch: architecture abbreviation; matched upper-cased against the
                 ``architecture`` tags' ``name`` attribute
    :param skip_mem: if True, instruction forms with a ``mem`` operand are skipped
    :returns: the populated ``MachineModel``, or None if no ISA is known for ``arch``
    """
    try:
        isa = MachineModel.get_isa_for_arch(arch)
    except Exception:
        print("Skipping...", file=sys.stderr)
        return None
    mm = MachineModel(isa=isa)
    parser = get_parser(isa)
    arch_name = arch.upper()

    for instruction_tag in tree.findall(".//instruction"):
        mnemonic = instruction_tag.attrib["asm"]
        iform = instruction_tag.attrib["iform"]
        # reduce to second part if mnemonic contain space (e.g., "REX CRC32")
        if " " in mnemonic:
            mnemonic = mnemonic.split(" ", 1)[1]

        # Extract parameter components
        try:
            parameters = extract_paramters(instruction_tag, parser, isa)
        except ValueError as e:
            print(e, file=sys.stderr)
            # Without this skip the instruction would be stored with the operands of
            # the previously processed instruction (or fail with a NameError).
            continue
        if isa == "x86":
            parameters.reverse()

        # Extract port occupation and latency
        port_pressure, latency, uops = [], None, None
        arch_tag = instruction_tag.find('architecture[@name="' + arch_name + '"]')
        if arch_tag is None:
            continue
        measurement_tags = arch_tag.findall("measurement")
        # skip any instructions without port utilization
        if not any("ports" in x.attrib for x in measurement_tags):
            print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
            continue
        # warn (the instruction is still imported) if the throughput calculated from the
        # ports of the first measurement is larger than the measured throughput
        if _calculated_tp_exceeds_measured(measurement_tags[0]):
            print(
                "Calculated TP is greater than measured TP.",
                iform,
                file=sys.stderr,
            )
        # skip if instruction contains memory operand
        if skip_mem and any(
            x.attrib["type"] == "mem" for x in instruction_tag.findall("operand")
        ):
            print("Contains memory operand, skip: ", iform, file=sys.stderr)
            continue

        # We collect all measurement and IACA information and compare them later.
        # No throughput is read here: it is computed from the port pressure below.
        for measurement_tag in arch_tag.iter("measurement"):
            uops = int(measurement_tag.attrib["uops"]) if "uops" in measurement_tag.attrib else None
            if "ports" in measurement_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
            latencies = [
                int(l_tag.attrib["cycles"])
                for l_tag in measurement_tag.iter("latency")
                if "cycles" in l_tag.attrib
            ]
            if not latencies:
                # fall back to upper bounds if no exact cycle counts are given
                latencies = [
                    int(l_tag.attrib["max_cycles"])
                    for l_tag in measurement_tag.iter("latency")
                    if "max_cycles" in l_tag.attrib
                ]
            if latencies[1:] != latencies[:-1]:
                print(
                    "Contradicting latencies found, using smallest:",
                    iform,
                    latencies,
                    file=sys.stderr,
                )
            if latencies:
                latency = min(latencies)

        # Ordered by IACA version (newest last)
        for iaca_tag in sorted(
            arch_tag.iter("IACA"), key=lambda i: StrictVersion(i.attrib["version"])
        ):
            if "ports" in iaca_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))

        # Check if all are equal
        if not port_pressure:
            continue
        if port_pressure[1:] != port_pressure[:-1]:
            print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr)
        # the last collected entry wins
        port_pressure = port_pressure[-1]

        # Adding Intel's 2D and 3D pipelines on Intel µarchs, without Ice Lake:
        if arch_name in intel_archs and arch_name != "ICL":
            if any(p["class"] == "memory" for p in parameters):
                # We have a memory parameter, if ports 2 & 3 are present, also add 2D & 3D
                # TODO remove port7 on 'hsw' onward and split entries depending on addressing mode
                port_23 = any("2" in pp[1] and "3" in pp[1] for pp in port_pressure)
                port_4 = any("4" in pp[1] for pp in port_pressure)
                # Add (x, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
                if port_23 and not port_4:
                    if (
                        arch_name in ["SNB", "IVB"]
                        and any(p.get("name", "") == "ymm" for p in parameters)
                        and "128" not in mnemonic
                    ):
                        # x = 2 if SNB or IVB and ymm register in any operand and not '128' in
                        # instruction name
                        port2D3D_pressure = 2
                    else:
                        # otherwise x = 1
                        port2D3D_pressure = 1
                    port_pressure.append((port2D3D_pressure, ["2D", "3D"]))

        # Add missing ports:
        for ports in [pp[1] for pp in port_pressure]:
            for p in ports:
                mm.add_port(p)

        throughput = max(mm.average_port_pressure(port_pressure))
        mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)
    # TODO eliminate entries which could be covered by automatic load / store expansion
    return mm
def rhs_comment(uncommented_string, comment):
    """
    Append ``# <comment>`` to the right-hand side of every line of a string.

    All lines are padded to the length of the longest line, so the comments line up.

    :param uncommented_string: text to annotate; it is split at ``"\\n"``
    :param comment: comment text placed after ``" # "`` on every line
    :returns: annotated text in which every line (including an empty last one) ends
              with a newline
    """
    lines = uncommented_string.split("\n")
    width = max(map(len, lines))
    suffix = " # {}\n".format(comment)
    return "".join(line.ljust(width) + suffix for line in lines)
def architectures(tree):
    """
    Collect the names of all architectures found in the tree.

    :param tree: XML tree (or element) that is searched for ``architecture`` elements
    :returns: set of the ``name`` attributes of all ``architecture`` elements
    """
    return {arch_tag.attrib["name"] for arch_tag in tree.findall(".//architecture")}
def main():
    """
    Command line entry point of the importer.

    Parses the given XML file and prints the architectures found in it. With an ``arch``
    argument, the model of that architecture is printed to stdout. Without it, a model is
    extracted for every architecture in the file and written to ``<arch>.yml`` (lower
    case) in the current working directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("xml", help="path of instructions.xml from http://uops.info")
    cli.add_argument(
        "arch",
        nargs="?",
        help="architecture to extract, use IACA abbreviations (e.g., SNB). "
        "if not given, all will be extracted and saved to file in CWD.",
    )
    cli.add_argument(
        "--mem",
        dest="skip_mem",
        action="store_false",
        help="add instruction forms including memory addressing operands, which are "
        "skipped by default",
    )
    args = cli.parse_args()

    basename = os.path.basename(__file__)
    tree = ET.parse(args.xml)
    print("# Available architectures:", ", ".join(architectures(tree)))

    # Single architecture requested: dump its model to stdout and stop
    if args.arch:
        print("# Chosen architecture: {}".format(args.arch))
        single_model = extract_model(tree, args.arch, args.skip_mem)
        if single_model is not None:
            print(rhs_comment(single_model.dump(), "uops.info import"))
        return

    # No architecture given: export one file per architecture found in the XML
    for arch_name in architectures(tree):
        print(arch_name, end="")
        arch_model = extract_model(tree, arch_name.lower(), args.skip_mem)
        if arch_model:
            annotated_dump = rhs_comment(arch_model.dump(), basename + " " + arch_name)
            with open("{}.yml".format(arch_name.lower()), "w") as out_file:
                out_file.write(annotated_dump)
        print(".")


if __name__ == "__main__":
    main()