Model load latency as separate dependency-graph nodes.

Adds a per-register-class `load_latency` table to the CSX and ZEN1 machine
models and lowers the latency values of the affected instruction forms.
Parsers gain `get_reg_type()`, and KernelDG.create_DG() inserts an extra
node at `line_number + 0.1` for every instruction form flagged
`performs_load`, connected to the instruction with the load latency as
edge weight. get_critical_path() adds that edge weight back onto the
instruction's latency when both nodes lie on the longest path.

NOTE(review): indentation of hunk lines was re-derived from a
whitespace-collapsed copy of this patch, and the `index` blob ids predate
the review fixes -- verify against the target tree before applying.

diff --git a/osaca/data/csx.yml b/osaca/data/csx.yml
index 1c5ee73..0a916c3 100644
--- a/osaca/data/csx.yml
+++ b/osaca/data/csx.yml
@@ -6,6 +6,7 @@ ROB_size: 224
 retired_uOps_per_cycle: 4
 scheduler_size: 97
 hidden_loads: false
+load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0, zmm: 4.0}
 ports: ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"]
 port_model_scheme: |
   ┌------------------------------------------------------------------------┐
@@ -212,7 +213,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vaddpd
     operands:
@@ -226,7 +227,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vaddpd
     operands:
@@ -240,7 +241,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vaddsd
     operands:
@@ -254,7 +255,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vaddsd
     operands:
@@ -429,7 +430,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmulpd
     operands:
@@ -443,7 +444,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 8.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.5, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovapd
     operands:
@@ -455,7 +456,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 3.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovapd
     operands:
@@ -479,7 +480,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovapd
     operands:
@@ -491,7 +492,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovapd
     operands:
@@ -551,7 +552,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovupd
     operands:
@@ -563,7 +564,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 0.5
-    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovsd
     operands:
@@ -575,7 +576,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 4.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
+    latency: 0.0  # 0    0DV  1    2    2D   3    3D   4    5    6    7
     port_pressure: [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]
   - name: vmovsd
     operands:
diff --git a/osaca/data/zen1.yml b/osaca/data/zen1.yml
index 6d2d1b5..199620c 100644
--- a/osaca/data/zen1.yml
+++ b/osaca/data/zen1.yml
@@ -2,6 +2,7 @@ osaca_version: 0.3.0
 micro_architecture: "AMD Zen (family 17h)"
 arch_code: "ZEN1"
 isa: "x86"
+load_latency: {gpr: 4.0, xmm: 4.0, ymm: 5.0}
 hidden_loads: false
 ports: ["0", "1", "2", "3", "3DV", "4", "5", "6", "7", "8", "9", "8D", "9D", "ST"]
 port_model_scheme: |
@@ -138,7 +139,7 @@ instruction_forms:
       - class: "register"
         name: "gpr"
     throughput: 0.5
-    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8LD  9LD  ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8LD  9LD  ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: mulsd
     operands:
@@ -265,7 +266,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 7.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vaddsd
     operands:
@@ -279,7 +280,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 7.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vaddsd
     operands:
@@ -293,7 +294,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 7.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vaddsd
     operands:
@@ -307,7 +308,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 7.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vaddss
     operands:
@@ -460,7 +461,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 8.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vmulpd
     operands:
@@ -474,7 +475,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 8.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vmulpd
     operands:
@@ -499,7 +500,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 1.0
-    latency: 9.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
   - name: vmulpd
     operands:
@@ -513,7 +514,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 1.0
-    latency: 9.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
   - name: vmovapd
     operands:
@@ -525,7 +526,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 3.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vmovapd
     operands:
@@ -549,7 +550,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 1.0
-    latency: 5.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
   - name: vmovapd
     operands:
@@ -561,7 +562,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 1.0
-    latency: 5.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
   - name: vmovapd
     operands:
@@ -645,7 +646,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 2.0
-    latency: 5.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0]
   - name: vmovupd
     operands:
@@ -657,7 +658,7 @@ instruction_forms:
       - class: "register"
         name: "ymm"
     throughput: 2.0
-    latency: 5.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]
   - name: vmovsd
     operands:
@@ -669,7 +670,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vmovsd
     operands:
@@ -681,7 +682,7 @@ instruction_forms:
       - class: "register"
         name: "xmm"
     throughput: 0.5
-    latency: 4.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
+    latency: 0.0  # 0    1    2    3    3DV  4    5    6    7    8    9    8D   9D   ST
     port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0]
   - name: vmovsd
     operands:
diff --git a/osaca/parser/base_parser.py b/osaca/parser/base_parser.py
index 9624737..4aa423e 100755
--- a/osaca/parser/base_parser.py
+++ b/osaca/parser/base_parser.py
@@ -50,6 +50,9 @@ class BaseParser(object):
     def is_vector_register(self, register):
         raise NotImplementedError
 
+    def get_reg_type(self, register):
+        raise NotImplementedError
+
     def construct_parser(self):
         return
         # raise NotImplementedError
diff --git a/osaca/parser/parser_AArch64v81.py b/osaca/parser/parser_AArch64v81.py
index 36b1c15..6db3b4b 100755
--- a/osaca/parser/parser_AArch64v81.py
+++ b/osaca/parser/parser_AArch64v81.py
@@ -408,3 +408,6 @@ class ParserAArch64v81(BaseParser):
             if reg_a['prefix'].lower() in prefixes_vec and reg_b['prefix'].lower() in prefixes_vec:
                 return True
         return False
+
+    def get_reg_type(self, register):
+        return register['prefix']
diff --git a/osaca/parser/parser_x86att.py b/osaca/parser/parser_x86att.py
index c9274f8..20f6d7d 100755
--- a/osaca/parser/parser_x86att.py
+++ b/osaca/parser/parser_x86att.py
@@ -331,3 +331,10 @@ class ParserX86ATT(BaseParser):
         if len(register['name']) > 2 and register['name'][1:3].lower() == 'mm':
             return True
         return False
+
+    def get_reg_type(self, register):
+        if self.is_gpr(register):
+            return 'gpr'
+        elif self.is_vector_register(register):
+            return register['name'][:3].lower()
+        raise ValueError
diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index 6cbe52c..677d5c5 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -25,7 +25,30 @@ class KernelDG(nx.DiGraph):
         for i, instruction_form in enumerate(kernel):
             dg.add_node(instruction_form['line_number'])
             dg.nodes[instruction_form['line_number']]['instruction_form'] = instruction_form
-            for dep in self.find_depending(instruction_form, kernel[i + 1:]):
+            # add load as separate node if existent
+            if 'performs_load' in instruction_form['flags']:
+                regs = [
+                    op for op in instruction_form['operands']['destination'] if 'register' in op
+                ]
+                reg_types = {self.parser.get_reg_type(op['register']) for op in regs}
+                if len(reg_types) == 1:
+                    load_lat = self.model['load_latency'][reg_types.pop()]
+                else:
+                    # no register destination or mixed register types:
+                    # conservatively assume the slowest load
+                    load_lat = max(self.model['load_latency'].values())
+                # add new node
+                dg.add_node(instruction_form['line_number'] + 0.1)
+                dg.nodes[instruction_form['line_number'] + 0.1][
+                    'instruction_form'
+                ] = instruction_form
+                # and set LD latency as edge weight
+                dg.add_edge(
+                    instruction_form['line_number'] + 0.1,
+                    instruction_form['line_number'],
+                    latency=load_lat,
+                )
+            for dep in self.find_depending(instruction_form, kernel[i + 1 :]):
                 dg.add_edge(
                     instruction_form['line_number'],
                     dep['line_number'],
@@ -50,7 +73,7 @@ class KernelDG(nx.DiGraph):
         loopcarried_deps = [
             (node, list(nx.algorithms.simple_paths.all_simple_paths(dg, node, node * multiplier)))
             for node in dg.nodes
-            if node < first_line_no * multiplier
+            if node < first_line_no * multiplier and node == int(node)
         ]
         # filter others and create graph
         loopcarried_deps = list(
@@ -71,7 +94,7 @@ class KernelDG(nx.DiGraph):
                 if set(dep[1]).issubset(set(other_dep[1])) and dep[0] in other_dep[1]:
                     is_subset = True
             if not is_subset:
-                    tmp_list.append(dep)
+                tmp_list.append(dep)
         loopcarried_deps = tmp_list
         for dep in loopcarried_deps:
             nodes = [self._get_node_by_lineno(n) for n in dep[1]]
@@ -88,6 +111,12 @@ class KernelDG(nx.DiGraph):
     def get_critical_path(self):
         if nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
             longest_path = nx.algorithms.dag.dag_longest_path(self.dg, weight='latency')
+            # add LD latency to instruction
+            for line_number in longest_path:
+                if line_number != int(line_number) and int(line_number) in longest_path:
+                    self._get_node_by_lineno(int(line_number))['latency'] += self.dg.edges[
+                        (line_number, int(line_number))
+                    ]['latency']
             return [x for x in self.kernel if x['line_number'] in longest_path]
         else:
             # split to DAG