mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2026-01-06 19:20:07 +01:00
Validating of OSACA predictions for IVB, SKX, ZEN1, ZEN2, A64FX and TX2 with different kernels. build_and_run.py contains the configuration used at RRZE's testcluster and UR's qpace4, Analysis.ipynb contains the analysis script and results. Raw data from measurements (122MB) will be attached to next OSACA release. For now, find the raw data here: https://hawo.net/~sijuhamm/d/UPIhBOtz/validation-data.tar.gz The analysis report can be viewed at https://nbviewer.jupyter.org/github/RRZE-HPC/OSACA/blob/validation/validation/Analysis.ipynb Quite a few changes on OSACA included: Feature: register change tracking via semantic understanding of operations Feature: recording LCD latency along path and exposing this to frontend Feature: support for memory reference aliases Feature: store throughput scaling (similar to load throughput scaling) Fix: model importer works with latest uops.info export Fix: immediate type tracking on ARM now preserves type in internal representaion Removed unused KerncraftAPI
429 lines
21 KiB
Python
Executable File
429 lines
21 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Semantics opbject responsible for architecture specific semantic operations"""
|
|
|
|
import warnings
|
|
from itertools import chain
|
|
from operator import itemgetter
|
|
|
|
from .hw_model import MachineModel
|
|
from .isa_semantics import INSTR_FLAGS, ISASemantics
|
|
|
|
|
|
class ArchSemantics(ISASemantics):
|
|
GAS_SUFFIXES = "bswlqt"
|
|
|
|
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
|
|
super().__init__(machine_model.get_ISA().lower(), path_to_yaml=path_to_yaml)
|
|
self._machine_model = machine_model
|
|
self._isa = machine_model.get_ISA().lower()
|
|
|
|
# SUMMARY FUNCTION
|
|
def add_semantics(self, kernel):
|
|
"""
|
|
Applies performance data (throughput, latency, port pressure) and source/destination
|
|
distribution to each instruction of a given kernel.
|
|
|
|
:param list kernel: kernel to apply semantics
|
|
"""
|
|
for instruction_form in kernel:
|
|
self.assign_src_dst(instruction_form)
|
|
self.assign_tp_lt(instruction_form)
|
|
if self._machine_model.has_hidden_loads():
|
|
self.set_hidden_loads(kernel)
|
|
|
|
def assign_optimal_throughput(self, kernel):
|
|
"""
|
|
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
|
|
|
|
:param list kernel: kernel to apply optimal port utilization
|
|
"""
|
|
INC = 0.01
|
|
kernel.reverse()
|
|
port_list = self._machine_model.get_ports()
|
|
for instruction_form in kernel:
|
|
for uop in instruction_form["port_uops"]:
|
|
cycles = uop[0]
|
|
ports = list(uop[1])
|
|
indices = [port_list.index(p) for p in ports]
|
|
# check if port sum of used ports for uop are unbalanced
|
|
port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
|
|
instr_ports = self._to_list(itemgetter(*indices)(instruction_form["port_pressure"]))
|
|
if len(set(port_sums)) > 1:
|
|
# balance ports
|
|
# init list for keeping track of the current change
|
|
differences = [cycles / len(ports) for p in ports]
|
|
for _ in range(int(cycles * (1 / INC))):
|
|
if len(instr_ports) == 1:
|
|
# no balancing possible anymore
|
|
break
|
|
max_port_idx = port_sums.index(max(port_sums))
|
|
min_port_idx = port_sums.index(min(port_sums))
|
|
instr_ports[max_port_idx] -= INC
|
|
instr_ports[min_port_idx] += INC
|
|
differences[max_port_idx] -= INC
|
|
differences[min_port_idx] += INC
|
|
# instr_ports = [round(p, 2) for p in instr_ports]
|
|
self._itemsetter(*indices)(instruction_form["port_pressure"], *instr_ports)
|
|
# check if min port is zero
|
|
if round(min(instr_ports), 2) <= 0:
|
|
# if port_pressure is not exactly 0.00, add the residual to
|
|
# the former port
|
|
if min(instr_ports) != 0.0:
|
|
min_port_idx = port_sums.index(min(port_sums))
|
|
instr_ports[min_port_idx] += min(instr_ports)
|
|
differences[min_port_idx] += min(instr_ports)
|
|
# we don't need to decrease difference for other port, just
|
|
# delete it
|
|
del differences[instr_ports.index(min(instr_ports))]
|
|
self._itemsetter(*indices)(
|
|
instruction_form["port_pressure"], *instr_ports
|
|
)
|
|
zero_index = [
|
|
p
|
|
for p in indices
|
|
if round(instruction_form["port_pressure"][p], 2) == 0
|
|
][0]
|
|
instruction_form["port_pressure"][zero_index] = 0.0
|
|
# Remove from further balancing
|
|
indices = [
|
|
p for p in indices if instruction_form["port_pressure"][p] > 0
|
|
]
|
|
instr_ports = self._to_list(
|
|
itemgetter(*indices)(instruction_form["port_pressure"])
|
|
)
|
|
# never remove more than the fixed utilization per uop and port, i.e.,
|
|
# cycles/len(ports)
|
|
if round(min(differences), 2) <= 0:
|
|
# don't worry if port_pressure isn't exactly 0 and just
|
|
# remove from further balancing by deleting index since
|
|
# pressure is not 0
|
|
del indices[differences.index(min(differences))]
|
|
instr_ports = self._to_list(
|
|
itemgetter(*indices)(instruction_form["port_pressure"])
|
|
)
|
|
del differences[differences.index(min(differences))]
|
|
port_sums = self._to_list(
|
|
itemgetter(*indices)(self.get_throughput_sum(kernel))
|
|
)
|
|
kernel.reverse()
|
|
|
|
def set_hidden_loads(self, kernel):
|
|
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
|
|
loads = [instr for instr in kernel if INSTR_FLAGS.HAS_LD in instr["flags"]]
|
|
stores = [instr for instr in kernel if INSTR_FLAGS.HAS_ST in instr["flags"]]
|
|
# Filter instructions including load and store
|
|
load_ids = [instr["line_number"] for instr in loads]
|
|
store_ids = [instr["line_number"] for instr in stores]
|
|
shared_ldst = list(set(load_ids).intersection(set(store_ids)))
|
|
loads = [instr for instr in loads if instr["line_number"] not in shared_ldst]
|
|
stores = [instr for instr in stores if instr["line_number"] not in shared_ldst]
|
|
|
|
if len(stores) == 0 or len(loads) == 0:
|
|
# nothing to do
|
|
return
|
|
if len(loads) <= len(stores):
|
|
# Hide all loads
|
|
for load in loads:
|
|
load["flags"] += [INSTR_FLAGS.HIDDEN_LD]
|
|
load["port_pressure"] = self._nullify_data_ports(load["port_pressure"])
|
|
else:
|
|
for store in stores:
|
|
# Get 'closest' load instruction
|
|
min_distance_load = min(
|
|
[
|
|
(
|
|
abs(load_instr["line_number"] - store["line_number"]),
|
|
load_instr["line_number"],
|
|
)
|
|
for load_instr in loads
|
|
if INSTR_FLAGS.HIDDEN_LD not in load_instr["flags"]
|
|
]
|
|
)
|
|
load = [instr for instr in kernel if instr["line_number"] == min_distance_load[1]][
|
|
0
|
|
]
|
|
# Hide load
|
|
load["flags"] += [INSTR_FLAGS.HIDDEN_LD]
|
|
load["port_pressure"] = self._nullify_data_ports(load["port_pressure"])
|
|
|
|
# get parser result and assign throughput and latency value to instruction form
|
|
# mark instruction form with semantic flags
|
|
def assign_tp_lt(self, instruction_form):
|
|
"""Assign throughput and latency to an instruction form."""
|
|
flags = []
|
|
port_number = len(self._machine_model["ports"])
|
|
if instruction_form["instruction"] is None:
|
|
# No instruction (label, comment, ...) --> ignore
|
|
throughput = 0.0
|
|
latency = 0.0
|
|
latency_wo_load = latency
|
|
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
|
|
instruction_form["port_uops"] = []
|
|
else:
|
|
instruction_data = self._machine_model.get_instruction(
|
|
instruction_form["instruction"], instruction_form["operands"]
|
|
)
|
|
if (
|
|
not instruction_data
|
|
and self._isa == "x86"
|
|
and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
|
|
):
|
|
# check for instruction without GAS suffix
|
|
instruction_data = self._machine_model.get_instruction(
|
|
instruction_form["instruction"][:-1], instruction_form["operands"]
|
|
)
|
|
if instruction_data:
|
|
# instruction form in DB
|
|
(
|
|
throughput,
|
|
port_pressure,
|
|
latency,
|
|
latency_wo_load,
|
|
) = self._handle_instruction_found(
|
|
instruction_data, port_number, instruction_form, flags
|
|
)
|
|
else:
|
|
# instruction could not be found in DB
|
|
assign_unknown = True
|
|
# check for equivalent register-operands DB entry if LD
|
|
if (
|
|
INSTR_FLAGS.HAS_LD in instruction_form["flags"]
|
|
or INSTR_FLAGS.HAS_ST in instruction_form["flags"]
|
|
):
|
|
# dynamically combine LD/ST and reg form of instruction form
|
|
# substitute mem and look for reg-only variant
|
|
operands = self.substitute_mem_address(instruction_form["operands"])
|
|
instruction_data_reg = self._machine_model.get_instruction(
|
|
instruction_form["instruction"], operands
|
|
)
|
|
if (
|
|
not instruction_data_reg
|
|
and self._isa == "x86"
|
|
and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
|
|
):
|
|
# check for instruction without GAS suffix
|
|
instruction_data_reg = self._machine_model.get_instruction(
|
|
instruction_form["instruction"][:-1], operands
|
|
)
|
|
if instruction_data_reg:
|
|
assign_unknown = False
|
|
reg_type = self._parser.get_reg_type(
|
|
instruction_data_reg["operands"][
|
|
operands.index(self._create_reg_wildcard())
|
|
]
|
|
)
|
|
data_port_pressure = [0.0 for _ in range(port_number)]
|
|
data_port_uops = []
|
|
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
|
|
# LOAD performance data
|
|
data_port_uops = self._machine_model.get_load_throughput(
|
|
[
|
|
x["memory"]
|
|
for x in instruction_form["semantic_operands"]["source"]
|
|
+ instruction_form["semantic_operands"]["src_dst"]
|
|
if "memory" in x
|
|
][0]
|
|
)
|
|
data_port_pressure = self._machine_model.average_port_pressure(
|
|
data_port_uops
|
|
)
|
|
if "load_throughput_multiplier" in self._machine_model:
|
|
multiplier = self._machine_model["load_throughput_multiplier"][
|
|
reg_type
|
|
]
|
|
data_port_pressure = [pp * multiplier for pp in data_port_pressure]
|
|
if INSTR_FLAGS.HAS_ST in instruction_form["flags"]:
|
|
# STORE performance data
|
|
destinations = (
|
|
instruction_form["semantic_operands"]["destination"]
|
|
+ instruction_form["semantic_operands"]["src_dst"]
|
|
)
|
|
st_data_port_uops = self._machine_model.get_store_throughput(
|
|
[x["memory"] for x in destinations if "memory" in x][0]
|
|
)
|
|
# zero data port pressure and remove HAS_ST flag if
|
|
# - no mem operand in dst &&
|
|
# - all mem operands in src_dst are pre-/post-indexed
|
|
# since it is no mem store
|
|
if (
|
|
self._isa == "aarch64"
|
|
and "memory"
|
|
not in instruction_form["semantic_operands"]["destination"]
|
|
and all(
|
|
[
|
|
"post_indexed" in op["memory"]
|
|
or "pre_indexed" in op["memory"]
|
|
for op in instruction_form["semantic_operands"]["src_dst"]
|
|
if "memory" in op
|
|
]
|
|
)
|
|
):
|
|
st_data_port_uops = []
|
|
instruction_form["flags"].remove(INSTR_FLAGS.HAS_ST)
|
|
|
|
# sum up all data ports in case for LOAD and STORE
|
|
st_data_port_pressure = self._machine_model.average_port_pressure(
|
|
st_data_port_uops
|
|
)
|
|
if "store_throughput_multiplier" in self._machine_model:
|
|
multiplier = self._machine_model["store_throughput_multiplier"][
|
|
reg_type
|
|
]
|
|
st_data_port_pressure = [
|
|
pp * multiplier for pp in st_data_port_pressure]
|
|
data_port_pressure = [
|
|
sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
|
|
]
|
|
data_port_uops += st_data_port_uops
|
|
throughput = max(
|
|
max(data_port_pressure), instruction_data_reg["throughput"]
|
|
)
|
|
latency = instruction_data_reg["latency"]
|
|
# Add LD and ST latency
|
|
latency += (
|
|
self._machine_model.get_load_latency(reg_type)
|
|
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]
|
|
else 0
|
|
)
|
|
latency += (
|
|
self._machine_model.get_store_latency(reg_type)
|
|
if INSTR_FLAGS.HAS_ST in instruction_form["flags"]
|
|
else 0
|
|
)
|
|
latency_wo_load = instruction_data_reg["latency"]
|
|
# add latency of ADD if post- or pre-indexed load
|
|
# TODO more investigation: check dot-graph, wrong latency distribution!
|
|
# if (
|
|
# latency_wo_load == 0
|
|
# and self._isa == 'aarch64'
|
|
# and any(
|
|
# [
|
|
# 'post_indexed' in op['memory'] or
|
|
# 'pre_indexed' in op['memory']
|
|
# for op in instruction_form['operands']
|
|
# if 'memory' in op
|
|
# ]
|
|
# )
|
|
# ):
|
|
# latency_wo_load = 1.0
|
|
instruction_form["port_pressure"] = [
|
|
sum(x)
|
|
for x in zip(
|
|
data_port_pressure,
|
|
self._machine_model.average_port_pressure(
|
|
instruction_data_reg["port_pressure"]
|
|
),
|
|
)
|
|
]
|
|
instruction_form["port_uops"] = list(
|
|
chain(instruction_data_reg["port_pressure"], data_port_uops)
|
|
)
|
|
|
|
if assign_unknown:
|
|
# --> mark as unknown and assume 0 cy for latency/throughput
|
|
throughput = 0.0
|
|
latency = 0.0
|
|
latency_wo_load = latency
|
|
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
|
|
instruction_form["port_uops"] = []
|
|
flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
|
|
# flatten flag list
|
|
flags = list(set(flags))
|
|
if "flags" not in instruction_form:
|
|
instruction_form["flags"] = flags
|
|
else:
|
|
instruction_form["flags"] += flags
|
|
instruction_form["throughput"] = throughput
|
|
instruction_form["latency"] = latency
|
|
instruction_form["latency_wo_load"] = latency_wo_load
|
|
# for later CP and loop-carried dependency analysis
|
|
instruction_form["latency_cp"] = 0
|
|
instruction_form["latency_lcd"] = 0
|
|
|
|
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
|
|
"""Apply performance data to instruction if it was found in the archDB"""
|
|
throughput = instruction_data["throughput"]
|
|
port_pressure = self._machine_model.average_port_pressure(instruction_data["port_pressure"])
|
|
instruction_form["port_uops"] = instruction_data["port_pressure"]
|
|
try:
|
|
assert isinstance(port_pressure, list)
|
|
assert len(port_pressure) == port_number
|
|
instruction_form["port_pressure"] = port_pressure
|
|
if sum(port_pressure) == 0 and throughput is not None:
|
|
# port pressure on all ports 0 --> not bound to a port
|
|
flags.append(INSTR_FLAGS.NOT_BOUND)
|
|
except AssertionError:
|
|
warnings.warn(
|
|
"Port pressure could not be imported correctly from database. "
|
|
+ "Please check entry for:\n {}".format(instruction_form)
|
|
)
|
|
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
|
|
instruction_form["port_uops"] = []
|
|
flags.append(INSTR_FLAGS.TP_UNKWN)
|
|
if throughput is None:
|
|
# assume 0 cy and mark as unknown
|
|
throughput = 0.0
|
|
flags.append(INSTR_FLAGS.TP_UNKWN)
|
|
latency = instruction_data["latency"]
|
|
latency_wo_load = latency
|
|
if latency is None:
|
|
# assume 0 cy and mark as unknown
|
|
latency = 0.0
|
|
latency_wo_load = latency
|
|
flags.append(INSTR_FLAGS.LT_UNKWN)
|
|
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
|
|
flags.append(INSTR_FLAGS.LD)
|
|
return throughput, port_pressure, latency, latency_wo_load
|
|
|
|
def convert_op_to_reg(self, reg_type, reg_id="0"):
|
|
"""Create register operand for a memory addressing operand"""
|
|
if self._isa == "x86":
|
|
if reg_type == "gpr":
|
|
register = {"register": {"name": "r" + str(int(reg_id) + 9)}}
|
|
else:
|
|
register = {"register": {"name": reg_type + reg_id}}
|
|
elif self._isa == "aarch64":
|
|
register = {"register": {"prefix": reg_type, "name": reg_id}}
|
|
return register
|
|
|
|
def _nullify_data_ports(self, port_pressure):
|
|
"""Set all ports to 0.0 for the ports of a machine model"""
|
|
data_ports = self._machine_model.get_data_ports()
|
|
for port in data_ports:
|
|
index = self._machine_model.get_ports().index(port)
|
|
port_pressure[index] = 0.0
|
|
return port_pressure
|
|
|
|
def _itemsetter(self, *items):
|
|
if len(items) == 1:
|
|
item = items[0]
|
|
|
|
def g(obj, value):
|
|
obj[item] = value
|
|
|
|
else:
|
|
|
|
def g(obj, *values):
|
|
for item, value in zip(items, values):
|
|
obj[item] = value
|
|
|
|
return g
|
|
|
|
def _to_list(self, obj):
|
|
if isinstance(obj, tuple):
|
|
return list(obj)
|
|
else:
|
|
return [obj]
|
|
|
|
@staticmethod
|
|
def get_throughput_sum(kernel):
|
|
"""Get the overall throughput sum separated by port of all instructions of a kernel."""
|
|
# ignoring all lines with throughput == 0.0, because there won't be anything to sum up
|
|
# typically comment, label and non-instruction lines
|
|
port_pressures = [instr["port_pressure"] for instr in kernel if instr["throughput"] != 0.0]
|
|
# Essentially summing up each columns of port_pressures, where each column is one port
|
|
# and each row is one line of the kernel
|
|
# round is necessary to ensure termination of ArchsSemantics.assign_optimal_throughput
|
|
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
|
|
return tp_sum
|