added Zen3 support

This commit is contained in:
JanLJL
2022-09-27 18:39:14 +02:00
parent 4f8e37d9fd
commit 7724ce27c7
6 changed files with 5328 additions and 16 deletions

5217
osaca/data/zen3.yml Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -30,6 +30,7 @@ SUPPORTED_ARCHS = [
"ICX", "ICX",
"ZEN1", "ZEN1",
"ZEN2", "ZEN2",
"ZEN3",
"TX2", "TX2",
"N1", "N1",
"A64FX", "A64FX",
@@ -97,7 +98,7 @@ def create_parser(parser=None):
parser.add_argument( parser.add_argument(
"--arch", "--arch",
type=str, type=str,
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, TX2, N1, " help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, ZEN3, TX2, N1, "
"A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.", "A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
) )
parser.add_argument( parser.add_argument(
@@ -329,6 +330,7 @@ def inspect(args, output_file=sys.stdout):
# Do optimal schedule for kernel throughput if wished # Do optimal schedule for kernel throughput if wished
if not args.fixed: if not args.fixed:
semantics.assign_optimal_throughput(kernel) semantics.assign_optimal_throughput(kernel)
semantics.assign_optimal_throughput(kernel)
# Create DiGraphs # Create DiGraphs
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout) kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)

View File

@@ -1,9 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Semantics object responsible for architecture specific semantic operations""" """Semantics object responsible for architecture specific semantic operations"""
import sys
import warnings import warnings
from itertools import chain from itertools import chain
from operator import itemgetter from operator import itemgetter
from copy import deepcopy
from .hw_model import MachineModel from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics from .isa_semantics import INSTR_FLAGS, ISASemantics
@@ -31,7 +33,7 @@ class ArchSemantics(ISASemantics):
if self._machine_model.has_hidden_loads(): if self._machine_model.has_hidden_loads():
self.set_hidden_loads(kernel) self.set_hidden_loads(kernel)
def assign_optimal_throughput(self, kernel): def assign_optimal_throughput(self, kernel, start=0):
""" """
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``. Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
@@ -40,7 +42,26 @@ class ArchSemantics(ISASemantics):
INC = 0.01 INC = 0.01
kernel.reverse() kernel.reverse()
port_list = self._machine_model.get_ports() port_list = self._machine_model.get_ports()
for instruction_form in kernel: for idx, instruction_form in enumerate(kernel[start:], start):
multiple_assignments = False
# if iform has multiple possible port assignments, check all in a DFS manner and take the best
if isinstance(instruction_form["port_uops"], dict):
best_kernel = None
best_kernel_tp = sys.maxsize
for port_util_alt in list(instruction_form["port_uops"].values())[1:]:
k_tmp = deepcopy(kernel)
k_tmp[idx]["port_uops"] = deepcopy(port_util_alt)
k_tmp[idx]["port_pressure"] = self._machine_model.average_port_pressure(
k_tmp[idx]["port_uops"]
)
k_tmp.reverse()
self.assign_optimal_throughput(k_tmp, idx)
if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
best_kernel = k_tmp
best_kernel_tp = max(self.get_throughput_sum(best_kernel))
# check the first option in the main branch and compare against the best option later
multiple_assignments = True
kernel[idx]["port_uops"] = list(instruction_form["port_uops"].values())[0]
for uop in instruction_form["port_uops"]: for uop in instruction_form["port_uops"]:
cycles = uop[0] cycles = uop[0]
ports = list(uop[1]) ports = list(uop[1])
@@ -84,6 +105,7 @@ class ArchSemantics(ISASemantics):
p p
for p in indices for p in indices
if round(instruction_form["port_pressure"][p], 2) == 0 if round(instruction_form["port_pressure"][p], 2) == 0
or instruction_form["port_pressure"][p] < 0.00
][0] ][0]
instruction_form["port_pressure"][zero_index] = 0.0 instruction_form["port_pressure"][zero_index] = 0.0
# Remove from further balancing # Remove from further balancing
@@ -108,6 +130,11 @@ class ArchSemantics(ISASemantics):
itemgetter(*indices)(self.get_throughput_sum(kernel)) itemgetter(*indices)(self.get_throughput_sum(kernel))
) )
kernel.reverse() kernel.reverse()
if multiple_assignments:
if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
for i, instr in enumerate(best_kernel):
kernel[i]["port_uops"] = best_kernel[i]["port_uops"]
kernel[i]["port_pressure"] = best_kernel[i]["port_pressure"]
def set_hidden_loads(self, kernel): def set_hidden_loads(self, kernel):
"""Hide loads behind stores if architecture supports hidden loads (deprecated)""" """Hide loads behind stores if architecture supports hidden loads (deprecated)"""
@@ -209,11 +236,12 @@ class ArchSemantics(ISASemantics):
operands.index(self._create_reg_wildcard()) operands.index(self._create_reg_wildcard())
] ]
) )
dummy_reg = {"class": "register", "name": reg_type}
data_port_pressure = [0.0 for _ in range(port_number)] data_port_pressure = [0.0 for _ in range(port_number)]
data_port_uops = [] data_port_uops = []
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]: if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
# LOAD performance data # LOAD performance data
data_port_uops = self._machine_model.get_load_throughput( load_perf_data = self._machine_model.get_load_throughput(
[ [
x["memory"] x["memory"]
for x in instruction_form["semantic_operands"]["source"] for x in instruction_form["semantic_operands"]["source"]
@@ -221,6 +249,19 @@ class ArchSemantics(ISASemantics):
if "memory" in x if "memory" in x
][0] ][0]
) )
# if multiple options, choose based on reg type
data_port_uops = [
ldp["port_pressure"]
for ldp in load_perf_data
if "dst" in ldp
and self._machine_model._check_operands(
dummy_reg, {"register": {"name": ldp["dst"]}}
)
]
if len(data_port_uops) < 1:
data_port_uops = load_perf_data[0]["port_pressure"]
else:
data_port_uops = data_port_uops[0]
data_port_pressure = self._machine_model.average_port_pressure( data_port_pressure = self._machine_model.average_port_pressure(
data_port_uops data_port_uops
) )
@@ -235,9 +276,22 @@ class ArchSemantics(ISASemantics):
instruction_form["semantic_operands"]["destination"] instruction_form["semantic_operands"]["destination"]
+ instruction_form["semantic_operands"]["src_dst"] + instruction_form["semantic_operands"]["src_dst"]
) )
st_data_port_uops = self._machine_model.get_store_throughput( store_perf_data = self._machine_model.get_store_throughput(
[x["memory"] for x in destinations if "memory" in x][0] [x["memory"] for x in destinations if "memory" in x][0]
) )
# If the store table offers several entries, keep those whose "src" register
# type matches the register type of this instruction (dummy_reg).
st_data_port_uops = [
    stp["port_pressure"]
    for stp in store_perf_data
    if "src" in stp
    and self._machine_model._check_operands(
        dummy_reg, {"register": {"name": stp["src"]}}
    )
]
# Inspect the *store* candidates here, not the load list ``data_port_uops``:
# take the first register-specific match, otherwise fall back to the first
# store entry returned by the machine model.
if st_data_port_uops:
    st_data_port_uops = st_data_port_uops[0]
else:
    st_data_port_uops = store_perf_data[0]["port_pressure"]
# zero data port pressure and remove HAS_ST flag if # zero data port pressure and remove HAS_ST flag if
# - no mem operand in dst && # - no mem operand in dst &&
# - all mem operands in src_dst are pre-/post-indexed # - all mem operands in src_dst are pre-/post-indexed

View File

@@ -143,11 +143,16 @@ class MachineModel(object):
print("\nname: {}\noperands: {}".format(name, operands)) print("\nname: {}\noperands: {}".format(name, operands))
raise TypeError from e raise TypeError from e
def average_port_pressure(self, port_pressure): def average_port_pressure(self, port_pressure, option=0):
"""Construct average port pressure list from instruction data.""" """Construct average port pressure list from instruction data."""
port_list = self._data["ports"] port_list = self._data["ports"]
average_pressure = [0.0] * len(port_list) average_pressure = [0.0] * len(port_list)
for cycles, ports in port_pressure: # if there are multiple port utilization options and none is selected, choose first one
if isinstance(port_pressure, dict):
used_pp = port_pressure[option]
else:
used_pp = port_pressure
for cycles, ports in used_pp:
for p in ports: for p in ports:
try: try:
average_pressure[port_list.index(p)] += cycles / len(ports) average_pressure[port_list.index(p)] += cycles / len(ports)
@@ -221,8 +226,8 @@ class MachineModel(object):
"""Return load throughput for given register type.""" """Return load throughput for given register type."""
ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)] ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
if len(ld_tp) > 0: if len(ld_tp) > 0:
return ld_tp[0]["port_pressure"].copy() return ld_tp.copy()
return self._data["load_throughput_default"].copy() return [{"port_pressure": self._data["load_throughput_default"].copy()}]
def get_store_latency(self, reg_type): def get_store_latency(self, reg_type):
"""Return store latency for given register type.""" """Return store latency for given register type."""
@@ -233,8 +238,8 @@ class MachineModel(object):
"""Return store throughput for given register type.""" """Return store throughput for given register type."""
st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)] st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
if len(st_tp) > 0: if len(st_tp) > 0:
return st_tp[0]["port_pressure"].copy() return st_tp.copy()
return self._data["store_throughput_default"].copy() return [{"port_pressure": self._data["store_throughput_default"].copy()}]
def _match_mem_entries(self, mem, i_mem): def _match_mem_entries(self, mem, i_mem):
"""Check if memory addressing ``mem`` and ``i_mem`` are of the same type.""" """Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
@@ -273,6 +278,7 @@ class MachineModel(object):
"zen1": "x86", "zen1": "x86",
"zen+": "x86", "zen+": "x86",
"zen2": "x86", "zen2": "x86",
"zen3": "x86",
"con": "x86", # Intel Conroe "con": "x86", # Intel Conroe
"wol": "x86", # Intel Wolfdale "wol": "x86", # Intel Wolfdale
"snb": "x86", "snb": "x86",

View File

@@ -61,6 +61,24 @@ port_model_scheme: |
+-------+ | VNNI | +-------+ | VNNI |
+-------+ +-------+
instruction_forms: instruction_forms:
- name: fantasyinstr1
operands:
- class: register
name: gpr
- class: register
name: gpr
port_pressure: {0: [[1, '015']], 1: [[1, '56']]}
throughput: 0.333333
latency: 1.0
- name: fantasyinstr2
operands:
- class: register
name: gpr
- class: register
name: gpr
port_pressure: [[1, '0'], [1, '1'], [1, '5']]
throughput: 0.5
latency: 1.0
- name: LEA - name: LEA
operands: operands:
- class: memory - class: memory

View File

@@ -175,7 +175,7 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual( self.assertEqual(
test_mm_x86.get_store_throughput( test_mm_x86.get_store_throughput(
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1} {"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
), )[0]["port_pressure"],
[[2, "237"], [2, "4"]], [[2, "237"], [2, "4"]],
) )
self.assertEqual( self.assertEqual(
@@ -186,13 +186,13 @@ class TestSemanticTools(unittest.TestCase):
"index": "NOT_NONE", "index": "NOT_NONE",
"scale": 1, "scale": 1,
} }
), )[0]["port_pressure"],
[[1, "23"], [1, "4"]], [[1, "23"], [1, "4"]],
) )
self.assertEqual( self.assertEqual(
test_mm_arm.get_store_throughput( test_mm_arm.get_store_throughput(
{"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1} {"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
), )[0]["port_pressure"],
[[2, "34"], [2, "5"]], [[2, "34"], [2, "5"]],
) )
self.assertEqual( self.assertEqual(
@@ -203,7 +203,7 @@ class TestSemanticTools(unittest.TestCase):
"index": None, "index": None,
"scale": 1, "scale": 1,
} }
), )[0]["port_pressure"],
[[1, "34"], [1, "5"]], [[1, "34"], [1, "5"]],
) )
@@ -228,7 +228,7 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual( self.assertEqual(
test_mm_x86.get_load_throughput( test_mm_x86.get_load_throughput(
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1} {"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
), )[0]["port_pressure"],
[[1, "23"], [1, ["2D", "3D"]]], [[1, "23"], [1, ["2D", "3D"]]],
) )
@@ -288,6 +288,21 @@ class TestSemanticTools(unittest.TestCase):
tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal) tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
self.assertNotEqual(tp_fixed, tp_optimal) self.assertNotEqual(tp_fixed, tp_optimal)
self.assertTrue(max(tp_optimal) <= max(tp_fixed)) self.assertTrue(max(tp_optimal) <= max(tp_fixed))
# test multiple port assignment options
test_mm_x86 = MachineModel(path_to_yaml=self._find_file("test_db_x86.yml"))
tmp_semantics = ArchSemantics(test_mm_x86)
tmp_code_1 = "fantasyinstr1 %rax, %rax\n"
tmp_code_2 = "fantasyinstr1 %rax, %rax\nfantasyinstr2 %rbx, %rbx\n"
tmp_kernel_1 = self.parser_x86.parse_file(tmp_code_1)
tmp_kernel_2 = self.parser_x86.parse_file(tmp_code_2)
tmp_semantics.add_semantics(tmp_kernel_1)
tmp_semantics.add_semantics(tmp_kernel_2)
tmp_semantics.assign_optimal_throughput(tmp_kernel_1)
tmp_semantics.assign_optimal_throughput(tmp_kernel_2)
k1i1_pp = [round(x, 2) for x in tmp_kernel_1[0]["port_pressure"]]
k2i1_pp = [round(x, 2) for x in tmp_kernel_2[0]["port_pressure"]]
self.assertEqual(k1i1_pp, [0.33, 0.0, 0.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 0.0, 0.0])
self.assertEqual(k2i1_pp, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
# arm # arm
kernel_fixed = deepcopy(self.kernel_AArch64) kernel_fixed = deepcopy(self.kernel_AArch64)