added Zen3 support

2026-01-05 10:40:06 +01:00 · 2022-09-27 18:39:14 +02:00
parent 4f8e37d9fd
commit 7724ce27c7
6 changed files with 5328 additions and 16 deletions
--- a/osaca/data/zen3.yml
+++ b/osaca/data/zen3.yml
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -30,6 +30,7 @@ SUPPORTED_ARCHS = [
    "ICX",
    "ZEN1",
    "ZEN2",
+    "ZEN3",
    "TX2",
    "N1",
    "A64FX",
@@ -97,7 +98,7 @@ def create_parser(parser=None):
    parser.add_argument(
        "--arch",
        type=str,
-        help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, TX2, N1, "
+        help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, ZEN3, TX2, N1, "
        "A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
    )
    parser.add_argument(
@@ -329,6 +330,7 @@ def inspect(args, output_file=sys.stdout):
    # Do optimal schedule for kernel throughput if wished
    if not args.fixed:
        semantics.assign_optimal_throughput(kernel)
+        semantics.assign_optimal_throughput(kernel)

    # Create DiGrahps
    kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
--- a/osaca/semantics/arch_semantics.py
+++ b/osaca/semantics/arch_semantics.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 """Semantics opbject responsible for architecture specific semantic operations"""

+import sys
 import warnings
 from itertools import chain
 from operator import itemgetter
+from copy import deepcopy

 from .hw_model import MachineModel
 from .isa_semantics import INSTR_FLAGS, ISASemantics
@@ -31,7 +33,7 @@ class ArchSemantics(ISASemantics):
        if self._machine_model.has_hidden_loads():
            self.set_hidden_loads(kernel)

-    def assign_optimal_throughput(self, kernel):
+    def assign_optimal_throughput(self, kernel, start=0):
        """
        Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.

@@ -40,7 +42,26 @@ class ArchSemantics(ISASemantics):
        INC = 0.01
        kernel.reverse()
        port_list = self._machine_model.get_ports()
-        for instruction_form in kernel:
+        for idx, instruction_form in enumerate(kernel[start:], start):
+            multiple_assignments = False
+            # if the instruction form has several port assignment options, try them (DFS) and keep the best
+            if isinstance(instruction_form["port_uops"], dict):
+                best_kernel = None
+                best_kernel_tp = sys.maxsize
+                for port_util_alt in list(instruction_form["port_uops"].values())[1:]:
+                    k_tmp = deepcopy(kernel)
+                    k_tmp[idx]["port_uops"] = deepcopy(port_util_alt)
+                    k_tmp[idx]["port_pressure"] = self._machine_model.average_port_pressure(
+                        k_tmp[idx]["port_uops"]
+                    )
+                    k_tmp.reverse()
+                    self.assign_optimal_throughput(k_tmp, idx)
+                    if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
+                        best_kernel = k_tmp
+                        best_kernel_tp = max(self.get_throughput_sum(best_kernel))
+                # check the first option in the main branch and compare against the best option later
+                multiple_assignments = True
+                kernel[idx]["port_uops"] = list(instruction_form["port_uops"].values())[0]
            for uop in instruction_form["port_uops"]:
                cycles = uop[0]
                ports = list(uop[1])
@@ -84,6 +105,7 @@ class ArchSemantics(ISASemantics):
                                    p
                                    for p in indices
                                    if round(instruction_form["port_pressure"][p], 2) == 0
+                                    or instruction_form["port_pressure"][p] < 0.00
                                ][0]
                                instruction_form["port_pressure"][zero_index] = 0.0
                            # Remove from further balancing
@@ -108,6 +130,11 @@ class ArchSemantics(ISASemantics):
                            itemgetter(*indices)(self.get_throughput_sum(kernel))
                        )
        kernel.reverse()
+        if multiple_assignments:
+            if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
+                for i, instr in enumerate(best_kernel):
+                    kernel[i]["port_uops"] = best_kernel[i]["port_uops"]
+                    kernel[i]["port_pressure"] = best_kernel[i]["port_pressure"]

    def set_hidden_loads(self, kernel):
        """Hide loads behind stores if architecture supports hidden loads (depricated)"""
@@ -209,11 +236,12 @@ class ArchSemantics(ISASemantics):
                                operands.index(self._create_reg_wildcard())
                            ]
                        )
+                        dummy_reg = {"class": "register", "name": reg_type}
                        data_port_pressure = [0.0 for _ in range(port_number)]
                        data_port_uops = []
                        if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
                            # LOAD performance data
-                            data_port_uops = self._machine_model.get_load_throughput(
+                            load_perf_data = self._machine_model.get_load_throughput(
                                [
                                    x["memory"]
                                    for x in instruction_form["semantic_operands"]["source"]
@@ -221,6 +249,19 @@ class ArchSemantics(ISASemantics):
                                    if "memory" in x
                                ][0]
                            )
+                            # if multiple options, choose based on reg type
+                            data_port_uops = [
+                                ldp["port_pressure"]
+                                for ldp in load_perf_data
+                                if "dst" in ldp
+                                and self._machine_model._check_operands(
+                                    dummy_reg, {"register": {"name": ldp["dst"]}}
+                                )
+                            ]
+                            if len(data_port_uops) < 1:
+                                data_port_uops = load_perf_data[0]["port_pressure"]
+                            else:
+                                data_port_uops = data_port_uops[0]
                            data_port_pressure = self._machine_model.average_port_pressure(
                                data_port_uops
                            )
@@ -235,9 +276,22 @@ class ArchSemantics(ISASemantics):
                                instruction_form["semantic_operands"]["destination"]
                                + instruction_form["semantic_operands"]["src_dst"]
                            )
-                            st_data_port_uops = self._machine_model.get_store_throughput(
+                            store_perf_data = self._machine_model.get_store_throughput(
                                [x["memory"] for x in destinations if "memory" in x][0]
                            )
+                            # choose entry whose "src" fits the reg type, else the first entry
+                            st_data_port_uops = [
+                                stp["port_pressure"]
+                                for stp in store_perf_data
+                                if "src" in stp
+                                and self._machine_model._check_operands(
+                                    dummy_reg, {"register": {"name": stp["src"]}}
+                                )
+                            ]
+                            if len(st_data_port_uops) < 1:
+                                st_data_port_uops = store_perf_data[0]["port_pressure"]
+                            else:
+                                st_data_port_uops = st_data_port_uops[0]
                            # zero data port pressure and remove HAS_ST flag if
                            #   - no mem operand in dst &&
                            #   - all mem operands in src_dst are pre-/post-indexed
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -143,11 +143,16 @@ class MachineModel(object):
            print("\nname: {}\noperands: {}".format(name, operands))
            raise TypeError from e

-    def average_port_pressure(self, port_pressure):
+    def average_port_pressure(self, port_pressure, option=0):
        """Construct average port pressure list from instruction data."""
        port_list = self._data["ports"]
        average_pressure = [0.0] * len(port_list)
-        for cycles, ports in port_pressure:
+        # with multiple port utilization options, use the one keyed by `option` (default: 0)
+        if isinstance(port_pressure, dict):
+            used_pp = port_pressure[option]
+        else:
+            used_pp = port_pressure
+        for cycles, ports in used_pp:
            for p in ports:
                try:
                    average_pressure[port_list.index(p)] += cycles / len(ports)
@@ -221,8 +226,8 @@ class MachineModel(object):
        """Return load thorughput for given register type."""
        ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
        if len(ld_tp) > 0:
-            return ld_tp[0]["port_pressure"].copy()
-        return self._data["load_throughput_default"].copy()
+            return ld_tp.copy()
+        return [{"port_pressure": self._data["load_throughput_default"].copy()}]

    def get_store_latency(self, reg_type):
        """Return store latency for given register type."""
@@ -233,8 +238,8 @@ class MachineModel(object):
        """Return store throughput for given register type."""
        st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
        if len(st_tp) > 0:
-            return st_tp[0]["port_pressure"].copy()
-        return self._data["store_throughput_default"].copy()
+            return st_tp.copy()
+        return [{"port_pressure": self._data["store_throughput_default"].copy()}]

    def _match_mem_entries(self, mem, i_mem):
        """Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
@@ -273,6 +278,7 @@ class MachineModel(object):
            "zen1": "x86",
            "zen+": "x86",
            "zen2": "x86",
+            "zen3": "x86",
            "con": "x86",  # Intel Conroe
            "wol": "x86",  # Intel Wolfdale
            "snb": "x86",
--- a/tests/test_files/test_db_x86.yml
+++ b/tests/test_files/test_db_x86.yml
@@ -61,6 +61,24 @@ port_model_scheme: |
   +-------+ |  VNNI |                         
             +-------+                         
 instruction_forms:
+- name: fantasyinstr1
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  port_pressure: {0: [[1, '015']], 1: [[1, '56']]}
+  throughput: 0.333333
+  latency: 1.0
+- name: fantasyinstr2
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  port_pressure: [[1, '0'], [1, '1'], [1, '5']]
+  throughput: 0.5
+  latency: 1.0
 - name: LEA
  operands:
  - class: memory
--- a/tests/test_semantics.py
+++ b/tests/test_semantics.py
@@ -175,7 +175,7 @@ class TestSemanticTools(unittest.TestCase):
        self.assertEqual(
            test_mm_x86.get_store_throughput(
                {"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
-            ),
+            )[0]["port_pressure"],
            [[2, "237"], [2, "4"]],
        )
        self.assertEqual(
@@ -186,13 +186,13 @@ class TestSemanticTools(unittest.TestCase):
                    "index": "NOT_NONE",
                    "scale": 1,
                }
-            ),
+            )[0]["port_pressure"],
            [[1, "23"], [1, "4"]],
        )
        self.assertEqual(
            test_mm_arm.get_store_throughput(
                {"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
-            ),
+            )[0]["port_pressure"],
            [[2, "34"], [2, "5"]],
        )
        self.assertEqual(
@@ -203,7 +203,7 @@ class TestSemanticTools(unittest.TestCase):
                    "index": None,
                    "scale": 1,
                }
-            ),
+            )[0]["port_pressure"],
            [[1, "34"], [1, "5"]],
        )

@@ -228,7 +228,7 @@ class TestSemanticTools(unittest.TestCase):
        self.assertEqual(
            test_mm_x86.get_load_throughput(
                {"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
-            ),
+            )[0]["port_pressure"],
            [[1, "23"], [1, ["2D", "3D"]]],
        )

@@ -288,6 +288,21 @@ class TestSemanticTools(unittest.TestCase):
        tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
        self.assertNotEqual(tp_fixed, tp_optimal)
        self.assertTrue(max(tp_optimal) <= max(tp_fixed))
+        # test multiple port assignment options
+        test_mm_x86 = MachineModel(path_to_yaml=self._find_file("test_db_x86.yml"))
+        tmp_semantics = ArchSemantics(test_mm_x86)
+        tmp_code_1 = "fantasyinstr1 %rax, %rax\n"
+        tmp_code_2 = "fantasyinstr1 %rax, %rax\nfantasyinstr2 %rbx, %rbx\n"
+        tmp_kernel_1 = self.parser_x86.parse_file(tmp_code_1)
+        tmp_kernel_2 = self.parser_x86.parse_file(tmp_code_2)
+        tmp_semantics.add_semantics(tmp_kernel_1)
+        tmp_semantics.add_semantics(tmp_kernel_2)
+        tmp_semantics.assign_optimal_throughput(tmp_kernel_1)
+        tmp_semantics.assign_optimal_throughput(tmp_kernel_2)
+        k1i1_pp = [round(x, 2) for x in tmp_kernel_1[0]["port_pressure"]]
+        k2i1_pp = [round(x, 2) for x in tmp_kernel_2[0]["port_pressure"]]
+        self.assertEqual(k1i1_pp, [0.33, 0.0, 0.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 0.0, 0.0])
+        self.assertEqual(k2i1_pp, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])

        # arm
        kernel_fixed = deepcopy(self.kernel_AArch64)