Merge pull request #117 from pleroy/Load2

Properly track the dependencies of the LOAD phase of instructions
This commit is contained in:
Jan
2025-08-12 16:14:34 +02:00
committed by GitHub
3 changed files with 136 additions and 74 deletions

View File

@@ -80,6 +80,8 @@ class Frontend(object):
s += lineno_filler + self._get_port_number_line(port_len) + "\n"
s += separator + "\n"
for instruction_form in kernel:
if KernelDG.is_load_line_number(instruction_form.line_number):
continue
line = "{:4d} {} {} {}".format(
instruction_form.line_number,
self._get_port_pressure(
@@ -112,6 +114,8 @@ class Frontend(object):
"""
s = "\n\nLatency Analysis Report\n-----------------------\n"
for instruction_form in cp_kernel:
if KernelDG.is_load_line_number(instruction_form.line_number):
continue
s += (
"{:4d} {} {:4.1f} {}{}{} {}".format(
instruction_form.line_number,
@@ -147,8 +151,11 @@ class Frontend(object):
)
# TODO find a way to overcome padding for different tab-lengths
for dep in sorted(dep_dict.keys()):
s += "{:4d} {} {:4.1f} {} {:36}{} {}\n".format(
int(dep.split("-")[0]),
dep0 = float(dep.split("-")[0])
if KernelDG.is_load_line_number(dep0):
continue
s += "{:4.0f} {} {:4.1f} {} {:36}{} {}\n".format(
dep0,
separator,
dep_dict[dep]["latency"],
separator,
@@ -356,6 +363,8 @@ class Frontend(object):
if show_cmnts is False and self._is_comment(instruction_form):
continue
line_number = instruction_form.line_number
if KernelDG.is_load_line_number(line_number):
continue
used_ports = [list(uops[1]) for uops in instruction_form.port_uops]
used_ports = list(set([p for uops_ports in used_ports for p in uops_ports]))
s += "{:4d} {}{} {} {}\n".format(

View File

@@ -1,12 +1,14 @@
#!/usr/bin/env python3
import copy
from enum import Enum
import time
from itertools import chain, groupby
from multiprocessing import Manager, Process, cpu_count
import networkx as nx
from osaca.semantics import INSTR_FLAGS, ArchSemantics, MachineModel
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand
from osaca.parser.immediate import ImmediateOperand
@@ -17,6 +19,11 @@ class KernelDG(nx.DiGraph):
# threshold for checking dependency graph sequential or in parallel
INSTRUCTION_THRESHOLD = 50
class ReadKind(Enum):
NOT_A_READ = 0
READ_FOR_LOAD = 1
OTHER_READ = 2
def __init__(
self,
parsed_kernel,
@@ -46,6 +53,25 @@ class KernelDG(nx.DiGraph):
dst_list.extend(tmp_list)
# print('Thread [{}-{}] done'.format(kernel[0]['line_number'], kernel[-1]['line_number']))
@staticmethod
def get_load_line_number(line_number):
# The line number of the load must be less than the line number of the instruction. The
# offset is irrelevant, but it must be a machine number with trailing zeroes to avoid silly
# rounding issues.
return line_number - 0.125
@staticmethod
def is_load_line_number(line_number):
return line_number != int(line_number)
@staticmethod
def get_real_line_number(line_number):
return (
int(line_number + 0.125)
if KernelDG.is_load_line_number(line_number)
else line_number
)
def create_DG(self, kernel, flag_dependencies=False):
"""
Create directed graph from given kernel
@@ -57,10 +83,10 @@ class KernelDG(nx.DiGraph):
:type flag_dependencies: boolean, optional
:returns: :class:`~nx.DiGraph` -- directed graph object
"""
# 1. go through kernel instruction forms and add them as node attribute
# 2. find edges (to dependend further instruction)
# 3. get LT value and set as edge weight
# Go through kernel instruction forms and add them as nodes of the graph. Create a LOAD
# node for instructions that include a memory reference.
dg = nx.DiGraph()
loads = {}
for i, instruction_form in enumerate(kernel):
dg.add_node(instruction_form.line_number)
dg.nodes[instruction_form.line_number]["instruction_form"] = instruction_form
@@ -70,14 +96,24 @@ class KernelDG(nx.DiGraph):
and INSTR_FLAGS.LD not in instruction_form.flags
):
# add new node
dg.add_node(instruction_form.line_number + 0.1)
dg.nodes[instruction_form.line_number + 0.1]["instruction_form"] = instruction_form
load_line_number = KernelDG.get_load_line_number(instruction_form.line_number)
loads[instruction_form.line_number] = load_line_number
dg.add_node(load_line_number)
dg.nodes[load_line_number]["instruction_form"] = InstructionForm(
mnemonic="_LOAD_",
line=instruction_form.line,
line_number=load_line_number
)
# and set LD latency as edge weight
dg.add_edge(
instruction_form.line_number + 0.1,
load_line_number,
instruction_form.line_number,
latency=instruction_form.latency - instruction_form.latency_wo_load,
)
# 1. find edges (to dependend further instruction)
# 2. get LT value and set as edge weight
for i, instruction_form in enumerate(kernel):
for dep, dep_flags in self.find_depending(
instruction_form, kernel[i + 1 :], flag_dependencies
):
@@ -91,11 +127,18 @@ class KernelDG(nx.DiGraph):
edge_weight += self.model.get("store_to_load_forward_latency", 0)
if "p_indexed" in dep_flags and self.model is not None:
edge_weight = self.model.get("p_index_latency", 1)
dg.add_edge(
instruction_form.line_number,
dep.line_number,
latency=edge_weight,
)
if "for_load" in dep_flags and self.model is not None and dep.line_number in loads:
dg.add_edge(
instruction_form.line_number,
loads[dep.line_number],
latency=edge_weight,
)
else:
dg.add_edge(
instruction_form.line_number,
dep.line_number,
latency=edge_weight,
)
dg.nodes[dep.line_number]["instruction_form"] = dep
return dg
@@ -204,23 +247,17 @@ class KernelDG(nx.DiGraph):
for lat_sum, involved_lines in loopcarried_deps:
dict_key = "-".join([str(il[0]) for il in involved_lines])
loopcarried_deps_dict[dict_key] = {
"root": self._get_node_by_lineno(involved_lines[0][0]),
"root": self._get_node_by_lineno(dg, involved_lines[0][0]),
"dependencies": [
(self._get_node_by_lineno(ln), lat) for ln, lat in involved_lines
(self._get_node_by_lineno(dg, ln), lat) for ln, lat in involved_lines
],
"latency": lat_sum,
}
return loopcarried_deps_dict
def _get_node_by_lineno(self, lineno, kernel=None, all=False):
"""Return instruction form with line number ``lineno`` from kernel"""
if kernel is None:
kernel = self.kernel
result = [instr for instr in kernel if instr.line_number == lineno]
if not all:
return result[0]
else:
return result
def _get_node_by_lineno(self, dg, lineno):
"""Return instruction form with line number ``lineno`` from dg"""
return dg.nodes[lineno]["instruction_form"]
def get_critical_path(self):
"""Find and return critical path after the creation of a directed graph."""
@@ -229,21 +266,21 @@ class KernelDG(nx.DiGraph):
longest_path = nx.algorithms.dag.dag_longest_path(self.dg, weight="latency")
# TODO verify that we can remove the next two lince due to earlier initialization
for line_number in longest_path:
self._get_node_by_lineno(int(line_number)).latency_cp = 0
self._get_node_by_lineno(self.dg, line_number).latency_cp = 0
# set cp latency to instruction
path_latency = 0.0
for s, d in nx.utils.pairwise(longest_path):
node = self._get_node_by_lineno(int(s))
node = self._get_node_by_lineno(self.dg, s)
node.latency_cp = self.dg.edges[(s, d)]["latency"]
path_latency += node.latency_cp
# add latency for last instruction
node = self._get_node_by_lineno(int(longest_path[-1]))
node = self._get_node_by_lineno(self.dg, longest_path[-1])
node.latency_cp = node.latency
if max_latency_instr.latency > path_latency:
max_latency_instr.latency_cp = float(max_latency_instr.latency)
return [max_latency_instr]
else:
return [x for x in self.kernel if x.line_number in longest_path]
return [self._get_node_by_lineno(self.dg, x) for x in longest_path]
else:
# split to DAG
raise NotImplementedError("Kernel is cyclic.")
@@ -284,15 +321,18 @@ class KernelDG(nx.DiGraph):
# print(" TO", instr_form.line, register_changes)
if isinstance(dst, RegisterOperand):
# read of register
if self.is_read(dst, instr_form):
read_kind = self._read_kind(dst, instr_form)
if read_kind != KernelDG.ReadKind.NOT_A_READ:
dep_flags = []
if (
dst.pre_indexed
or dst.post_indexed
or (isinstance(dst.post_indexed, dict))
):
yield instr_form, ["p_indexed"]
else:
yield instr_form, []
dep_flags = ["p_indexed"]
if read_kind == KernelDG.ReadKind.READ_FOR_LOAD:
dep_flags += ["for_load"]
yield instr_form, dep_flags
# write to register -> abort
if self.is_written(dst, instr_form):
break
@@ -363,11 +403,12 @@ class KernelDG(nx.DiGraph):
return self.dg.successors(line_number)
return iter([])
def is_read(self, register, instruction_form):
"""Check if instruction form reads from given register"""
def _read_kind(self, register, instruction_form):
"""Check if instruction form reads from given register. Returns a ReadKind."""
is_read = False
for_load = False
if instruction_form.semantic_operands is None:
return is_read
return KernelDG.ReadKind.NOT_A_READ
for src in chain(
instruction_form.semantic_operands["source"],
instruction_form.semantic_operands["src_dst"],
@@ -377,10 +418,16 @@ class KernelDG(nx.DiGraph):
if isinstance(src, FlagOperand):
is_read = self.parser.is_flag_dependend_of(register, src) or is_read
if isinstance(src, MemoryOperand):
is_memory_read = False
if src.base is not None:
is_read = self.parser.is_reg_dependend_of(register, src.base) or is_read
is_memory_read = self.parser.is_reg_dependend_of(register, src.base)
if src.index is not None and isinstance(src.index, RegisterOperand):
is_read = self.parser.is_reg_dependend_of(register, src.index) or is_read
is_memory_read = (
self.parser.is_reg_dependend_of(register, src.index)
or is_memory_read
)
for_load = is_memory_read
is_read = is_read or is_memory_read
# Check also if read in destination memory address
for dst in chain(
instruction_form.semantic_operands["destination"],
@@ -391,7 +438,16 @@ class KernelDG(nx.DiGraph):
is_read = self.parser.is_reg_dependend_of(register, dst.base) or is_read
if dst.index is not None:
is_read = self.parser.is_reg_dependend_of(register, dst.index) or is_read
return is_read
if is_read:
if for_load:
return KernelDG.ReadKind.READ_FOR_LOAD
else:
return KernelDG.ReadKind.OTHER_READ
else:
return KernelDG.ReadKind.NOT_A_READ
def is_read(self, register, instruction_form):
return self._read_kind(register, instruction_form) != KernelDG.ReadKind.NOT_A_READ
def is_memload(self, mem, instruction_form, register_changes={}):
"""Check if instruction form loads from given location, assuming register_changes"""
@@ -546,19 +602,9 @@ class KernelDG(nx.DiGraph):
graph.nodes[n]["penwidth"] = 4
# Make critical path edges bold.
for e in graph.edges:
if (
graph.nodes[e[0]]["instruction_form"].line_number in cp_line_numbers
and graph.nodes[e[1]]["instruction_form"].line_number in cp_line_numbers
and e[0] < e[1]
):
bold_edge = True
for i in range(e[0] + 1, e[1]):
if i in cp_line_numbers:
bold_edge = False
if bold_edge:
graph.edges[e]["style"] = "bold"
graph.edges[e]["penwidth"] = 3
for u, v in zip(cp_line_numbers[:-1], cp_line_numbers[1:]):
graph.edges[u, v]["style"] = "bold"
graph.edges[u, v]["penwidth"] = 3
# Color the cycles created by loop-carried dependencies, longest first, never recoloring
# any node or edge, so that the longest LCD and most long chains that are involved in the
@@ -600,8 +646,8 @@ class KernelDG(nx.DiGraph):
graph.nodes[n]["style"] += ",filled"
graph.nodes[n]["fillcolor"] = color
if (
(max_color >= 4 and color in (1, max_color)) or
(max_color >= 10 and color in (1, 2, max_color - 1 , max_color))
(max_color >= 4 and color in (1, max_color))
or (max_color >= 10 and color in (1, 2, max_color - 1 , max_color))
):
graph.nodes[n]["fontcolor"] = "white"
for (u, v), color in edge_colors.items():
@@ -613,21 +659,17 @@ class KernelDG(nx.DiGraph):
# rename node from [idx] to [idx mnemonic] and add shape
mapping = {}
for n in graph.nodes:
if int(n) != n:
mapping[n] = "{}: LOAD".format(int(n))
node = graph.nodes[n]["instruction_form"]
if node.mnemonic is not None:
mapping[n] = "{}: {}".format(KernelDG.get_real_line_number(n), node.mnemonic)
else:
label = "label" if node.label is not None else None
label = "directive" if node.directive is not None else label
label = "comment" if node.comment is not None and label is None else label
mapping[n] = "{}: {}".format(n, label)
graph.nodes[n]["fontname"] = "italic"
graph.nodes[n]["fontsize"] = 11.0
else:
node = graph.nodes[n]["instruction_form"]
if node.mnemonic is not None:
mapping[n] = "{}: {}".format(n, node.mnemonic)
else:
label = "label" if node.label is not None else None
label = "directive" if node.directive is not None else label
label = "comment" if node.comment is not None and label is None else label
mapping[n] = "{}: {}".format(n, label)
graph.nodes[n]["fontname"] = "italic"
graph.nodes[n]["fontsize"] = 11.0
if not KernelDG.is_load_line_number(n):
graph.nodes[n]["shape"] = "rectangle"
nx.relabel.relabel_nodes(graph, mapping, copy=False)

View File

@@ -89,6 +89,9 @@ class TestSemanticTools(unittest.TestCase):
cls.machine_model_csx = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "csx.yml")
)
cls.machine_model_skx = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "skx.yml")
)
cls.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "tx2.yml")
)
@@ -107,6 +110,11 @@ class TestSemanticTools(unittest.TestCase):
cls.machine_model_csx,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/x86.yml"),
)
cls.semantics_skx_intel = ArchSemantics(
cls.parser_x86_intel,
cls.machine_model_skx,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/x86.yml"),
)
cls.semantics_aarch64 = ISASemantics(cls.parser_AArch64)
cls.semantics_tx2 = ArchSemantics(
cls.parser_AArch64,
@@ -136,10 +144,10 @@ class TestSemanticTools(unittest.TestCase):
for i in range(len(cls.kernel_x86_intel)):
cls.semantics_csx_intel.assign_src_dst(cls.kernel_x86_intel[i])
cls.semantics_csx_intel.assign_tp_lt(cls.kernel_x86_intel[i])
cls.semantics_csx_intel.normalize_instruction_forms(cls.kernel_x86_intel_memdep)
cls.semantics_skx_intel.normalize_instruction_forms(cls.kernel_x86_intel_memdep)
for i in range(len(cls.kernel_x86_intel_memdep)):
cls.semantics_csx_intel.assign_src_dst(cls.kernel_x86_intel_memdep[i])
cls.semantics_csx_intel.assign_tp_lt(cls.kernel_x86_intel_memdep[i])
cls.semantics_skx_intel.assign_src_dst(cls.kernel_x86_intel_memdep[i])
cls.semantics_skx_intel.assign_tp_lt(cls.kernel_x86_intel_memdep[i])
cls.semantics_tx2.normalize_instruction_forms(cls.kernel_AArch64)
for i in range(len(cls.kernel_AArch64)):
cls.semantics_tx2.assign_src_dst(cls.kernel_AArch64[i])
@@ -458,7 +466,7 @@ class TestSemanticTools(unittest.TestCase):
# / /
# 4 /
# /
# 5.1
# 4.875
#
dg = KernelDG(
self.kernel_x86_intel,
@@ -473,8 +481,8 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual(next(dg.get_dependent_instruction_forms(line_number=4)), 5)
self.assertEqual(len(list(dg.get_dependent_instruction_forms(line_number=5))), 1)
self.assertEqual(next(dg.get_dependent_instruction_forms(line_number=5)), 6)
self.assertEqual(len(list(dg.get_dependent_instruction_forms(line_number=5.1))), 1)
self.assertEqual(next(dg.get_dependent_instruction_forms(line_number=5.1)), 5)
self.assertEqual(len(list(dg.get_dependent_instruction_forms(line_number=4.875))), 1)
self.assertEqual(next(dg.get_dependent_instruction_forms(line_number=4.875)), 5)
self.assertEqual(list(dg.get_dependent_instruction_forms(line_number=6)), [])
self.assertEqual(list(dg.get_dependent_instruction_forms(line_number=7)), [])
self.assertEqual(list(dg.get_dependent_instruction_forms(line_number=8)), [])
@@ -502,12 +510,15 @@ class TestSemanticTools(unittest.TestCase):
dg = KernelDG(
self.kernel_x86_intel_memdep,
self.parser_x86_intel,
self.machine_model_csx,
self.semantics_csx_intel,
self.machine_model_skx,
self.semantics_skx_intel,
)
self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {6, 8})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=5)), {10, 12})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=18)), {18.875})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=18.875)), {19})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=19)), set())
with self.assertRaises(ValueError):
dg.get_dependent_instruction_forms()
# test dot creation