mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2026-01-05 10:40:06 +01:00
added Zen3 support
This commit is contained in:
5217
osaca/data/zen3.yml
Normal file
5217
osaca/data/zen3.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -30,6 +30,7 @@ SUPPORTED_ARCHS = [
|
|||||||
"ICX",
|
"ICX",
|
||||||
"ZEN1",
|
"ZEN1",
|
||||||
"ZEN2",
|
"ZEN2",
|
||||||
|
"ZEN3",
|
||||||
"TX2",
|
"TX2",
|
||||||
"N1",
|
"N1",
|
||||||
"A64FX",
|
"A64FX",
|
||||||
@@ -97,7 +98,7 @@ def create_parser(parser=None):
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--arch",
|
"--arch",
|
||||||
type=str,
|
type=str,
|
||||||
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, TX2, N1, "
|
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, ZEN3, TX2, N1, "
|
||||||
"A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
|
"A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -329,6 +330,7 @@ def inspect(args, output_file=sys.stdout):
|
|||||||
# Do optimal schedule for kernel throughput if wished
|
# Do optimal schedule for kernel throughput if wished
|
||||||
if not args.fixed:
|
if not args.fixed:
|
||||||
semantics.assign_optimal_throughput(kernel)
|
semantics.assign_optimal_throughput(kernel)
|
||||||
|
semantics.assign_optimal_throughput(kernel)
|
||||||
|
|
||||||
# Create DiGraphs
|
# Create DiGraphs
|
||||||
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
|
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Semantics opbject responsible for architecture specific semantic operations"""
|
"""Semantics opbject responsible for architecture specific semantic operations"""
|
||||||
|
|
||||||
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
from .hw_model import MachineModel
|
from .hw_model import MachineModel
|
||||||
from .isa_semantics import INSTR_FLAGS, ISASemantics
|
from .isa_semantics import INSTR_FLAGS, ISASemantics
|
||||||
@@ -31,7 +33,7 @@ class ArchSemantics(ISASemantics):
|
|||||||
if self._machine_model.has_hidden_loads():
|
if self._machine_model.has_hidden_loads():
|
||||||
self.set_hidden_loads(kernel)
|
self.set_hidden_loads(kernel)
|
||||||
|
|
||||||
def assign_optimal_throughput(self, kernel):
|
def assign_optimal_throughput(self, kernel, start=0):
|
||||||
"""
|
"""
|
||||||
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
|
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
|
||||||
|
|
||||||
@@ -40,7 +42,26 @@ class ArchSemantics(ISASemantics):
|
|||||||
INC = 0.01
|
INC = 0.01
|
||||||
kernel.reverse()
|
kernel.reverse()
|
||||||
port_list = self._machine_model.get_ports()
|
port_list = self._machine_model.get_ports()
|
||||||
for instruction_form in kernel:
|
for idx, instruction_form in enumerate(kernel[start:], start):
|
||||||
|
multiple_assignments = False
|
||||||
|
# if iform has multiple possible port assignments, check all in a DFS manner and take the best
|
||||||
|
if isinstance(instruction_form["port_uops"], dict):
|
||||||
|
best_kernel = None
|
||||||
|
best_kernel_tp = sys.maxsize
|
||||||
|
for port_util_alt in list(instruction_form["port_uops"].values())[1:]:
|
||||||
|
k_tmp = deepcopy(kernel)
|
||||||
|
k_tmp[idx]["port_uops"] = deepcopy(port_util_alt)
|
||||||
|
k_tmp[idx]["port_pressure"] = self._machine_model.average_port_pressure(
|
||||||
|
k_tmp[idx]["port_uops"]
|
||||||
|
)
|
||||||
|
k_tmp.reverse()
|
||||||
|
self.assign_optimal_throughput(k_tmp, idx)
|
||||||
|
if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
|
||||||
|
best_kernel = k_tmp
|
||||||
|
best_kernel_tp = max(self.get_throughput_sum(best_kernel))
|
||||||
|
# check the first option in the main branch and compare against the best option later
|
||||||
|
multiple_assignments = True
|
||||||
|
kernel[idx]["port_uops"] = list(instruction_form["port_uops"].values())[0]
|
||||||
for uop in instruction_form["port_uops"]:
|
for uop in instruction_form["port_uops"]:
|
||||||
cycles = uop[0]
|
cycles = uop[0]
|
||||||
ports = list(uop[1])
|
ports = list(uop[1])
|
||||||
@@ -84,6 +105,7 @@ class ArchSemantics(ISASemantics):
|
|||||||
p
|
p
|
||||||
for p in indices
|
for p in indices
|
||||||
if round(instruction_form["port_pressure"][p], 2) == 0
|
if round(instruction_form["port_pressure"][p], 2) == 0
|
||||||
|
or instruction_form["port_pressure"][p] < 0.00
|
||||||
][0]
|
][0]
|
||||||
instruction_form["port_pressure"][zero_index] = 0.0
|
instruction_form["port_pressure"][zero_index] = 0.0
|
||||||
# Remove from further balancing
|
# Remove from further balancing
|
||||||
@@ -108,6 +130,11 @@ class ArchSemantics(ISASemantics):
|
|||||||
itemgetter(*indices)(self.get_throughput_sum(kernel))
|
itemgetter(*indices)(self.get_throughput_sum(kernel))
|
||||||
)
|
)
|
||||||
kernel.reverse()
|
kernel.reverse()
|
||||||
|
if multiple_assignments:
|
||||||
|
if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
|
||||||
|
for i, instr in enumerate(best_kernel):
|
||||||
|
kernel[i]["port_uops"] = best_kernel[i]["port_uops"]
|
||||||
|
kernel[i]["port_pressure"] = best_kernel[i]["port_pressure"]
|
||||||
|
|
||||||
def set_hidden_loads(self, kernel):
|
def set_hidden_loads(self, kernel):
|
||||||
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
|
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
|
||||||
@@ -209,11 +236,12 @@ class ArchSemantics(ISASemantics):
|
|||||||
operands.index(self._create_reg_wildcard())
|
operands.index(self._create_reg_wildcard())
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
dummy_reg = {"class": "register", "name": reg_type}
|
||||||
data_port_pressure = [0.0 for _ in range(port_number)]
|
data_port_pressure = [0.0 for _ in range(port_number)]
|
||||||
data_port_uops = []
|
data_port_uops = []
|
||||||
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
|
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
|
||||||
# LOAD performance data
|
# LOAD performance data
|
||||||
data_port_uops = self._machine_model.get_load_throughput(
|
load_perf_data = self._machine_model.get_load_throughput(
|
||||||
[
|
[
|
||||||
x["memory"]
|
x["memory"]
|
||||||
for x in instruction_form["semantic_operands"]["source"]
|
for x in instruction_form["semantic_operands"]["source"]
|
||||||
@@ -221,6 +249,19 @@ class ArchSemantics(ISASemantics):
|
|||||||
if "memory" in x
|
if "memory" in x
|
||||||
][0]
|
][0]
|
||||||
)
|
)
|
||||||
|
# if multiple options, choose based on reg type
|
||||||
|
data_port_uops = [
|
||||||
|
ldp["port_pressure"]
|
||||||
|
for ldp in load_perf_data
|
||||||
|
if "dst" in ldp
|
||||||
|
and self._machine_model._check_operands(
|
||||||
|
dummy_reg, {"register": {"name": ldp["dst"]}}
|
||||||
|
)
|
||||||
|
]
|
||||||
|
if len(data_port_uops) < 1:
|
||||||
|
data_port_uops = load_perf_data[0]["port_pressure"]
|
||||||
|
else:
|
||||||
|
data_port_uops = data_port_uops[0]
|
||||||
data_port_pressure = self._machine_model.average_port_pressure(
|
data_port_pressure = self._machine_model.average_port_pressure(
|
||||||
data_port_uops
|
data_port_uops
|
||||||
)
|
)
|
||||||
@@ -235,9 +276,22 @@ class ArchSemantics(ISASemantics):
|
|||||||
instruction_form["semantic_operands"]["destination"]
|
instruction_form["semantic_operands"]["destination"]
|
||||||
+ instruction_form["semantic_operands"]["src_dst"]
|
+ instruction_form["semantic_operands"]["src_dst"]
|
||||||
)
|
)
|
||||||
st_data_port_uops = self._machine_model.get_store_throughput(
|
store_perf_data = self._machine_model.get_store_throughput(
|
||||||
[x["memory"] for x in destinations if "memory" in x][0]
|
[x["memory"] for x in destinations if "memory" in x][0]
|
||||||
)
|
)
|
||||||
|
# if multiple options, choose based on reg type
|
||||||
|
st_data_port_uops = [
|
||||||
|
stp["port_pressure"]
|
||||||
|
for stp in store_perf_data
|
||||||
|
if "src" in stp
|
||||||
|
and self._machine_model._check_operands(
|
||||||
|
dummy_reg, {"register": {"name": stp["src"]}}
|
||||||
|
)
|
||||||
|
]
|
||||||
|
if len(data_port_uops) < 1:
|
||||||
|
st_data_port_uops = store_perf_data[0]["port_pressure"]
|
||||||
|
else:
|
||||||
|
st_data_port_uops = st_data_port_uops[0]
|
||||||
# zero data port pressure and remove HAS_ST flag if
|
# zero data port pressure and remove HAS_ST flag if
|
||||||
# - no mem operand in dst &&
|
# - no mem operand in dst &&
|
||||||
# - all mem operands in src_dst are pre-/post-indexed
|
# - all mem operands in src_dst are pre-/post-indexed
|
||||||
|
|||||||
@@ -143,11 +143,16 @@ class MachineModel(object):
|
|||||||
print("\nname: {}\noperands: {}".format(name, operands))
|
print("\nname: {}\noperands: {}".format(name, operands))
|
||||||
raise TypeError from e
|
raise TypeError from e
|
||||||
|
|
||||||
def average_port_pressure(self, port_pressure):
|
def average_port_pressure(self, port_pressure, option=0):
|
||||||
"""Construct average port pressure list from instruction data."""
|
"""Construct average port pressure list from instruction data."""
|
||||||
port_list = self._data["ports"]
|
port_list = self._data["ports"]
|
||||||
average_pressure = [0.0] * len(port_list)
|
average_pressure = [0.0] * len(port_list)
|
||||||
for cycles, ports in port_pressure:
|
# if there are multiple port utilization options and none is selected, choose first one
|
||||||
|
if isinstance(port_pressure, dict):
|
||||||
|
used_pp = port_pressure[option]
|
||||||
|
else:
|
||||||
|
used_pp = port_pressure
|
||||||
|
for cycles, ports in used_pp:
|
||||||
for p in ports:
|
for p in ports:
|
||||||
try:
|
try:
|
||||||
average_pressure[port_list.index(p)] += cycles / len(ports)
|
average_pressure[port_list.index(p)] += cycles / len(ports)
|
||||||
@@ -221,8 +226,8 @@ class MachineModel(object):
|
|||||||
"""Return load thorughput for given register type."""
|
"""Return load thorughput for given register type."""
|
||||||
ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
|
ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
|
||||||
if len(ld_tp) > 0:
|
if len(ld_tp) > 0:
|
||||||
return ld_tp[0]["port_pressure"].copy()
|
return ld_tp.copy()
|
||||||
return self._data["load_throughput_default"].copy()
|
return [{"port_pressure": self._data["load_throughput_default"].copy()}]
|
||||||
|
|
||||||
def get_store_latency(self, reg_type):
|
def get_store_latency(self, reg_type):
|
||||||
"""Return store latency for given register type."""
|
"""Return store latency for given register type."""
|
||||||
@@ -233,8 +238,8 @@ class MachineModel(object):
|
|||||||
"""Return store throughput for given register type."""
|
"""Return store throughput for given register type."""
|
||||||
st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
|
st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
|
||||||
if len(st_tp) > 0:
|
if len(st_tp) > 0:
|
||||||
return st_tp[0]["port_pressure"].copy()
|
return st_tp.copy()
|
||||||
return self._data["store_throughput_default"].copy()
|
return [{"port_pressure": self._data["store_throughput_default"].copy()}]
|
||||||
|
|
||||||
def _match_mem_entries(self, mem, i_mem):
|
def _match_mem_entries(self, mem, i_mem):
|
||||||
"""Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
|
"""Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
|
||||||
@@ -273,6 +278,7 @@ class MachineModel(object):
|
|||||||
"zen1": "x86",
|
"zen1": "x86",
|
||||||
"zen+": "x86",
|
"zen+": "x86",
|
||||||
"zen2": "x86",
|
"zen2": "x86",
|
||||||
|
"zen3": "x86",
|
||||||
"con": "x86", # Intel Conroe
|
"con": "x86", # Intel Conroe
|
||||||
"wol": "x86", # Intel Wolfdale
|
"wol": "x86", # Intel Wolfdale
|
||||||
"snb": "x86",
|
"snb": "x86",
|
||||||
|
|||||||
@@ -61,6 +61,24 @@ port_model_scheme: |
|
|||||||
+-------+ | VNNI |
|
+-------+ | VNNI |
|
||||||
+-------+
|
+-------+
|
||||||
instruction_forms:
|
instruction_forms:
|
||||||
|
- name: fantasyinstr1
|
||||||
|
operands:
|
||||||
|
- class: register
|
||||||
|
name: gpr
|
||||||
|
- class: register
|
||||||
|
name: gpr
|
||||||
|
port_pressure: {0: [[1, '015']], 1: [[1, '56']]}
|
||||||
|
throughput: 0.333333
|
||||||
|
latency: 1.0
|
||||||
|
- name: fantasyinstr2
|
||||||
|
operands:
|
||||||
|
- class: register
|
||||||
|
name: gpr
|
||||||
|
- class: register
|
||||||
|
name: gpr
|
||||||
|
port_pressure: [[1, '0'], [1, '1'], [1, '5']]
|
||||||
|
throughput: 0.5
|
||||||
|
latency: 1.0
|
||||||
- name: LEA
|
- name: LEA
|
||||||
operands:
|
operands:
|
||||||
- class: memory
|
- class: memory
|
||||||
|
|||||||
@@ -175,7 +175,7 @@ class TestSemanticTools(unittest.TestCase):
|
|||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
test_mm_x86.get_store_throughput(
|
test_mm_x86.get_store_throughput(
|
||||||
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
||||||
),
|
)[0]["port_pressure"],
|
||||||
[[2, "237"], [2, "4"]],
|
[[2, "237"], [2, "4"]],
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -186,13 +186,13 @@ class TestSemanticTools(unittest.TestCase):
|
|||||||
"index": "NOT_NONE",
|
"index": "NOT_NONE",
|
||||||
"scale": 1,
|
"scale": 1,
|
||||||
}
|
}
|
||||||
),
|
)[0]["port_pressure"],
|
||||||
[[1, "23"], [1, "4"]],
|
[[1, "23"], [1, "4"]],
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
test_mm_arm.get_store_throughput(
|
test_mm_arm.get_store_throughput(
|
||||||
{"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
|
{"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
|
||||||
),
|
)[0]["port_pressure"],
|
||||||
[[2, "34"], [2, "5"]],
|
[[2, "34"], [2, "5"]],
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -203,7 +203,7 @@ class TestSemanticTools(unittest.TestCase):
|
|||||||
"index": None,
|
"index": None,
|
||||||
"scale": 1,
|
"scale": 1,
|
||||||
}
|
}
|
||||||
),
|
)[0]["port_pressure"],
|
||||||
[[1, "34"], [1, "5"]],
|
[[1, "34"], [1, "5"]],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -228,7 +228,7 @@ class TestSemanticTools(unittest.TestCase):
|
|||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
test_mm_x86.get_load_throughput(
|
test_mm_x86.get_load_throughput(
|
||||||
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
||||||
),
|
)[0]["port_pressure"],
|
||||||
[[1, "23"], [1, ["2D", "3D"]]],
|
[[1, "23"], [1, ["2D", "3D"]]],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -288,6 +288,21 @@ class TestSemanticTools(unittest.TestCase):
|
|||||||
tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
|
tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
|
||||||
self.assertNotEqual(tp_fixed, tp_optimal)
|
self.assertNotEqual(tp_fixed, tp_optimal)
|
||||||
self.assertTrue(max(tp_optimal) <= max(tp_fixed))
|
self.assertTrue(max(tp_optimal) <= max(tp_fixed))
|
||||||
|
# test multiple port assignment options
|
||||||
|
test_mm_x86 = MachineModel(path_to_yaml=self._find_file("test_db_x86.yml"))
|
||||||
|
tmp_semantics = ArchSemantics(test_mm_x86)
|
||||||
|
tmp_code_1 = "fantasyinstr1 %rax, %rax\n"
|
||||||
|
tmp_code_2 = "fantasyinstr1 %rax, %rax\nfantasyinstr2 %rbx, %rbx\n"
|
||||||
|
tmp_kernel_1 = self.parser_x86.parse_file(tmp_code_1)
|
||||||
|
tmp_kernel_2 = self.parser_x86.parse_file(tmp_code_2)
|
||||||
|
tmp_semantics.add_semantics(tmp_kernel_1)
|
||||||
|
tmp_semantics.add_semantics(tmp_kernel_2)
|
||||||
|
tmp_semantics.assign_optimal_throughput(tmp_kernel_1)
|
||||||
|
tmp_semantics.assign_optimal_throughput(tmp_kernel_2)
|
||||||
|
k1i1_pp = [round(x, 2) for x in tmp_kernel_1[0]["port_pressure"]]
|
||||||
|
k2i1_pp = [round(x, 2) for x in tmp_kernel_2[0]["port_pressure"]]
|
||||||
|
self.assertEqual(k1i1_pp, [0.33, 0.0, 0.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 0.0, 0.0])
|
||||||
|
self.assertEqual(k2i1_pp, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
|
||||||
|
|
||||||
# arm
|
# arm
|
||||||
kernel_fixed = deepcopy(self.kernel_AArch64)
|
kernel_fixed = deepcopy(self.kernel_AArch64)
|
||||||
|
|||||||
Reference in New Issue
Block a user