Files
OSACA/osaca/semantics/arch_semantics.py
Julian 08440ed5e1 Validation (#71)
Validating of OSACA predictions for IVB, SKX, ZEN1, ZEN2, A64FX and TX2 with different kernels.

build_and_run.py contains the configuration used at RRZE's testcluster and UR's qpace4, Analysis.ipynb contains the analysis script and results. Raw data from measurements (122MB) will be attached to next OSACA release.

For now, find the raw data here: https://hawo.net/~sijuhamm/d/UPIhBOtz/validation-data.tar.gz

The analysis report can be viewed at https://nbviewer.jupyter.org/github/RRZE-HPC/OSACA/blob/validation/validation/Analysis.ipynb

Quite a few changes on OSACA included:

Feature: register change tracking via semantic understanding of operations
Feature: recording LCD latency along path and exposing this to frontend
Feature: support for memory reference aliases
Feature: store throughput scaling (similar to load throughput scaling)
Fix: model importer works with latest uops.info export
Fix: immediate type tracking on ARM now preserves type in internal representaion
Removed unused KerncraftAPI
2021-04-15 14:42:37 +02:00

429 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
"""Semantics opbject responsible for architecture specific semantic operations"""
import warnings
from itertools import chain
from operator import itemgetter
from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics
class ArchSemantics(ISASemantics):
GAS_SUFFIXES = "bswlqt"
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
super().__init__(machine_model.get_ISA().lower(), path_to_yaml=path_to_yaml)
self._machine_model = machine_model
self._isa = machine_model.get_ISA().lower()
# SUMMARY FUNCTION
def add_semantics(self, kernel):
"""
Applies performance data (throughput, latency, port pressure) and source/destination
distribution to each instruction of a given kernel.
:param list kernel: kernel to apply semantics
"""
for instruction_form in kernel:
self.assign_src_dst(instruction_form)
self.assign_tp_lt(instruction_form)
if self._machine_model.has_hidden_loads():
self.set_hidden_loads(kernel)
def assign_optimal_throughput(self, kernel):
"""
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
:param list kernel: kernel to apply optimal port utilization
"""
INC = 0.01
kernel.reverse()
port_list = self._machine_model.get_ports()
for instruction_form in kernel:
for uop in instruction_form["port_uops"]:
cycles = uop[0]
ports = list(uop[1])
indices = [port_list.index(p) for p in ports]
# check if port sum of used ports for uop are unbalanced
port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
instr_ports = self._to_list(itemgetter(*indices)(instruction_form["port_pressure"]))
if len(set(port_sums)) > 1:
# balance ports
# init list for keeping track of the current change
differences = [cycles / len(ports) for p in ports]
for _ in range(int(cycles * (1 / INC))):
if len(instr_ports) == 1:
# no balancing possible anymore
break
max_port_idx = port_sums.index(max(port_sums))
min_port_idx = port_sums.index(min(port_sums))
instr_ports[max_port_idx] -= INC
instr_ports[min_port_idx] += INC
differences[max_port_idx] -= INC
differences[min_port_idx] += INC
# instr_ports = [round(p, 2) for p in instr_ports]
self._itemsetter(*indices)(instruction_form["port_pressure"], *instr_ports)
# check if min port is zero
if round(min(instr_ports), 2) <= 0:
# if port_pressure is not exactly 0.00, add the residual to
# the former port
if min(instr_ports) != 0.0:
min_port_idx = port_sums.index(min(port_sums))
instr_ports[min_port_idx] += min(instr_ports)
differences[min_port_idx] += min(instr_ports)
# we don't need to decrease difference for other port, just
# delete it
del differences[instr_ports.index(min(instr_ports))]
self._itemsetter(*indices)(
instruction_form["port_pressure"], *instr_ports
)
zero_index = [
p
for p in indices
if round(instruction_form["port_pressure"][p], 2) == 0
][0]
instruction_form["port_pressure"][zero_index] = 0.0
# Remove from further balancing
indices = [
p for p in indices if instruction_form["port_pressure"][p] > 0
]
instr_ports = self._to_list(
itemgetter(*indices)(instruction_form["port_pressure"])
)
# never remove more than the fixed utilization per uop and port, i.e.,
# cycles/len(ports)
if round(min(differences), 2) <= 0:
# don't worry if port_pressure isn't exactly 0 and just
# remove from further balancing by deleting index since
# pressure is not 0
del indices[differences.index(min(differences))]
instr_ports = self._to_list(
itemgetter(*indices)(instruction_form["port_pressure"])
)
del differences[differences.index(min(differences))]
port_sums = self._to_list(
itemgetter(*indices)(self.get_throughput_sum(kernel))
)
kernel.reverse()
def set_hidden_loads(self, kernel):
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
loads = [instr for instr in kernel if INSTR_FLAGS.HAS_LD in instr["flags"]]
stores = [instr for instr in kernel if INSTR_FLAGS.HAS_ST in instr["flags"]]
# Filter instructions including load and store
load_ids = [instr["line_number"] for instr in loads]
store_ids = [instr["line_number"] for instr in stores]
shared_ldst = list(set(load_ids).intersection(set(store_ids)))
loads = [instr for instr in loads if instr["line_number"] not in shared_ldst]
stores = [instr for instr in stores if instr["line_number"] not in shared_ldst]
if len(stores) == 0 or len(loads) == 0:
# nothing to do
return
if len(loads) <= len(stores):
# Hide all loads
for load in loads:
load["flags"] += [INSTR_FLAGS.HIDDEN_LD]
load["port_pressure"] = self._nullify_data_ports(load["port_pressure"])
else:
for store in stores:
# Get 'closest' load instruction
min_distance_load = min(
[
(
abs(load_instr["line_number"] - store["line_number"]),
load_instr["line_number"],
)
for load_instr in loads
if INSTR_FLAGS.HIDDEN_LD not in load_instr["flags"]
]
)
load = [instr for instr in kernel if instr["line_number"] == min_distance_load[1]][
0
]
# Hide load
load["flags"] += [INSTR_FLAGS.HIDDEN_LD]
load["port_pressure"] = self._nullify_data_ports(load["port_pressure"])
# get parser result and assign throughput and latency value to instruction form
# mark instruction form with semantic flags
def assign_tp_lt(self, instruction_form):
"""Assign throughput and latency to an instruction form."""
flags = []
port_number = len(self._machine_model["ports"])
if instruction_form["instruction"] is None:
# No instruction (label, comment, ...) --> ignore
throughput = 0.0
latency = 0.0
latency_wo_load = latency
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
instruction_form["port_uops"] = []
else:
instruction_data = self._machine_model.get_instruction(
instruction_form["instruction"], instruction_form["operands"]
)
if (
not instruction_data
and self._isa == "x86"
and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
):
# check for instruction without GAS suffix
instruction_data = self._machine_model.get_instruction(
instruction_form["instruction"][:-1], instruction_form["operands"]
)
if instruction_data:
# instruction form in DB
(
throughput,
port_pressure,
latency,
latency_wo_load,
) = self._handle_instruction_found(
instruction_data, port_number, instruction_form, flags
)
else:
# instruction could not be found in DB
assign_unknown = True
# check for equivalent register-operands DB entry if LD
if (
INSTR_FLAGS.HAS_LD in instruction_form["flags"]
or INSTR_FLAGS.HAS_ST in instruction_form["flags"]
):
# dynamically combine LD/ST and reg form of instruction form
# substitute mem and look for reg-only variant
operands = self.substitute_mem_address(instruction_form["operands"])
instruction_data_reg = self._machine_model.get_instruction(
instruction_form["instruction"], operands
)
if (
not instruction_data_reg
and self._isa == "x86"
and instruction_form["instruction"][-1] in self.GAS_SUFFIXES
):
# check for instruction without GAS suffix
instruction_data_reg = self._machine_model.get_instruction(
instruction_form["instruction"][:-1], operands
)
if instruction_data_reg:
assign_unknown = False
reg_type = self._parser.get_reg_type(
instruction_data_reg["operands"][
operands.index(self._create_reg_wildcard())
]
)
data_port_pressure = [0.0 for _ in range(port_number)]
data_port_uops = []
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
# LOAD performance data
data_port_uops = self._machine_model.get_load_throughput(
[
x["memory"]
for x in instruction_form["semantic_operands"]["source"]
+ instruction_form["semantic_operands"]["src_dst"]
if "memory" in x
][0]
)
data_port_pressure = self._machine_model.average_port_pressure(
data_port_uops
)
if "load_throughput_multiplier" in self._machine_model:
multiplier = self._machine_model["load_throughput_multiplier"][
reg_type
]
data_port_pressure = [pp * multiplier for pp in data_port_pressure]
if INSTR_FLAGS.HAS_ST in instruction_form["flags"]:
# STORE performance data
destinations = (
instruction_form["semantic_operands"]["destination"]
+ instruction_form["semantic_operands"]["src_dst"]
)
st_data_port_uops = self._machine_model.get_store_throughput(
[x["memory"] for x in destinations if "memory" in x][0]
)
# zero data port pressure and remove HAS_ST flag if
# - no mem operand in dst &&
# - all mem operands in src_dst are pre-/post-indexed
# since it is no mem store
if (
self._isa == "aarch64"
and "memory"
not in instruction_form["semantic_operands"]["destination"]
and all(
[
"post_indexed" in op["memory"]
or "pre_indexed" in op["memory"]
for op in instruction_form["semantic_operands"]["src_dst"]
if "memory" in op
]
)
):
st_data_port_uops = []
instruction_form["flags"].remove(INSTR_FLAGS.HAS_ST)
# sum up all data ports in case for LOAD and STORE
st_data_port_pressure = self._machine_model.average_port_pressure(
st_data_port_uops
)
if "store_throughput_multiplier" in self._machine_model:
multiplier = self._machine_model["store_throughput_multiplier"][
reg_type
]
st_data_port_pressure = [
pp * multiplier for pp in st_data_port_pressure]
data_port_pressure = [
sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
]
data_port_uops += st_data_port_uops
throughput = max(
max(data_port_pressure), instruction_data_reg["throughput"]
)
latency = instruction_data_reg["latency"]
# Add LD and ST latency
latency += (
self._machine_model.get_load_latency(reg_type)
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]
else 0
)
latency += (
self._machine_model.get_store_latency(reg_type)
if INSTR_FLAGS.HAS_ST in instruction_form["flags"]
else 0
)
latency_wo_load = instruction_data_reg["latency"]
# add latency of ADD if post- or pre-indexed load
# TODO more investigation: check dot-graph, wrong latency distribution!
# if (
# latency_wo_load == 0
# and self._isa == 'aarch64'
# and any(
# [
# 'post_indexed' in op['memory'] or
# 'pre_indexed' in op['memory']
# for op in instruction_form['operands']
# if 'memory' in op
# ]
# )
# ):
# latency_wo_load = 1.0
instruction_form["port_pressure"] = [
sum(x)
for x in zip(
data_port_pressure,
self._machine_model.average_port_pressure(
instruction_data_reg["port_pressure"]
),
)
]
instruction_form["port_uops"] = list(
chain(instruction_data_reg["port_pressure"], data_port_uops)
)
if assign_unknown:
# --> mark as unknown and assume 0 cy for latency/throughput
throughput = 0.0
latency = 0.0
latency_wo_load = latency
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
instruction_form["port_uops"] = []
flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
# flatten flag list
flags = list(set(flags))
if "flags" not in instruction_form:
instruction_form["flags"] = flags
else:
instruction_form["flags"] += flags
instruction_form["throughput"] = throughput
instruction_form["latency"] = latency
instruction_form["latency_wo_load"] = latency_wo_load
# for later CP and loop-carried dependency analysis
instruction_form["latency_cp"] = 0
instruction_form["latency_lcd"] = 0
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
"""Apply performance data to instruction if it was found in the archDB"""
throughput = instruction_data["throughput"]
port_pressure = self._machine_model.average_port_pressure(instruction_data["port_pressure"])
instruction_form["port_uops"] = instruction_data["port_pressure"]
try:
assert isinstance(port_pressure, list)
assert len(port_pressure) == port_number
instruction_form["port_pressure"] = port_pressure
if sum(port_pressure) == 0 and throughput is not None:
# port pressure on all ports 0 --> not bound to a port
flags.append(INSTR_FLAGS.NOT_BOUND)
except AssertionError:
warnings.warn(
"Port pressure could not be imported correctly from database. "
+ "Please check entry for:\n {}".format(instruction_form)
)
instruction_form["port_pressure"] = [0.0 for i in range(port_number)]
instruction_form["port_uops"] = []
flags.append(INSTR_FLAGS.TP_UNKWN)
if throughput is None:
# assume 0 cy and mark as unknown
throughput = 0.0
flags.append(INSTR_FLAGS.TP_UNKWN)
latency = instruction_data["latency"]
latency_wo_load = latency
if latency is None:
# assume 0 cy and mark as unknown
latency = 0.0
latency_wo_load = latency
flags.append(INSTR_FLAGS.LT_UNKWN)
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
flags.append(INSTR_FLAGS.LD)
return throughput, port_pressure, latency, latency_wo_load
def convert_op_to_reg(self, reg_type, reg_id="0"):
"""Create register operand for a memory addressing operand"""
if self._isa == "x86":
if reg_type == "gpr":
register = {"register": {"name": "r" + str(int(reg_id) + 9)}}
else:
register = {"register": {"name": reg_type + reg_id}}
elif self._isa == "aarch64":
register = {"register": {"prefix": reg_type, "name": reg_id}}
return register
def _nullify_data_ports(self, port_pressure):
"""Set all ports to 0.0 for the ports of a machine model"""
data_ports = self._machine_model.get_data_ports()
for port in data_ports:
index = self._machine_model.get_ports().index(port)
port_pressure[index] = 0.0
return port_pressure
def _itemsetter(self, *items):
if len(items) == 1:
item = items[0]
def g(obj, value):
obj[item] = value
else:
def g(obj, *values):
for item, value in zip(items, values):
obj[item] = value
return g
def _to_list(self, obj):
if isinstance(obj, tuple):
return list(obj)
else:
return [obj]
@staticmethod
def get_throughput_sum(kernel):
"""Get the overall throughput sum separated by port of all instructions of a kernel."""
# ignoring all lines with throughput == 0.0, because there won't be anything to sum up
# typically comment, label and non-instruction lines
port_pressures = [instr["port_pressure"] for instr in kernel if instr["throughput"] != 0.0]
# Essentially summing up each columns of port_pressures, where each column is one port
# and each row is one line of the kernel
# round is necessary to ensure termination of ArchsSemantics.assign_optimal_throughput
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
return tp_sum