#!/usr/bin/env python3
"""Semantics object responsible for architecture specific semantic operations"""

from dis import Instruction
import sys
import warnings
from itertools import chain
from operator import itemgetter
from copy import deepcopy

from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand


class ArchSemantics(ISASemantics):
    """
    Architecture-specific semantics on top of :class:`ISASemantics`.

    Uses a :class:`MachineModel` to attach throughput, latency and port pressure to the
    instruction forms of a kernel.
    """

    def __init__(self, parser, machine_model: MachineModel, path_to_yaml=None):
        """
        Create the semantics object.

        :param parser: parser object, forwarded to the :class:`ISASemantics` constructor
        :param machine_model: machine model queried for all performance data
        :param path_to_yaml: forwarded unchanged to the :class:`ISASemantics` constructor
        """
        super().__init__(parser, path_to_yaml=path_to_yaml)
        self._machine_model = machine_model

    def normalize_instruction_form(self, instruction_form):
        """Let the parser normalize ``instruction_form`` using the ISA and machine model."""
        self.parser.normalize_instruction_form(
            instruction_form, self.isa_model, self._machine_model
        )

    def normalize_instruction_forms(self, instruction_forms):
        """Normalize every instruction form of the given iterable."""
        for instruction_form in instruction_forms:
            self.normalize_instruction_form(instruction_form)

    def _check_normalized(self, instruction_forms):
        """Call ``check_normalized()`` on every instruction form of the given iterable."""
        for instruction_form in instruction_forms:
            instruction_form.check_normalized()

    # SUMMARY FUNCTION
    def add_semantics(self, kernel):
        """
        Applies performance data (throughput, latency, port pressure) and source/destination
        distribution to each instruction of a given kernel.

        :param list kernel: kernel to apply semantics
        """
        self._check_normalized(kernel)
        for instruction_form in kernel:
            self.assign_src_dst(instruction_form)
            self.assign_tp_lt(instruction_form)
        if self._machine_model.has_hidden_loads():
            self.set_hidden_loads(kernel)

    def assign_optimal_throughput(self, kernel, start=0):
        """
        Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.

        The kernel is reversed in place while it is processed and reversed back before
        returning. ``port_pressure`` (and, for instruction forms with alternative port
        assignments, ``port_uops``) of the contained instruction forms are modified in place.

        :param list kernel: kernel to apply optimal port utilization
        :param int start: index into the *reversed* kernel at which balancing starts; used by
            the recursive calls that explore alternative port assignments
        """
        self._check_normalized(kernel)
        INC = 0.01
        kernel.reverse()
        port_list = self._machine_model.get_ports()
        multiple_assignments = False
        for idx, instruction_form in enumerate(kernel[start:], start):
            multiple_assignments = False
            # if iform has multiple possible port assignments, check all in a DFS manner and take
            # the best
            if isinstance(instruction_form.port_uops, dict):
                best_kernel = None
                best_kernel_tp = sys.maxsize
                for port_util_alt in list(instruction_form.port_uops.values())[1:]:
                    # evaluate each alternative on a deep copy so the main kernel stays untouched
                    k_tmp = deepcopy(kernel)
                    k_tmp[idx].port_uops = deepcopy(port_util_alt)
                    k_tmp[idx].port_pressure = self._machine_model.average_port_pressure(
                        k_tmp[idx].port_uops
                    )
                    # the recursive call expects (and restores) the non-reversed order
                    k_tmp.reverse()
                    self.assign_optimal_throughput(k_tmp, idx)
                    if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
                        best_kernel = k_tmp
                        best_kernel_tp = max(self.get_throughput_sum(best_kernel))
                # check the first option in the main branch and compare against the best option
                # later
                multiple_assignments = True
                kernel[idx].port_uops = list(instruction_form.port_uops.values())[0]
            for uop in instruction_form.port_uops:
                cycles = uop[0]
                ports = list(uop[1])
                indices = [port_list.index(p) for p in ports]
                # check if port sum of used ports for uop are unbalanced
                port_sums = self._to_list(itemgetter(*indices)(self.get_throughput_sum(kernel)))
                instr_ports = self._to_list(itemgetter(*indices)(instruction_form.port_pressure))
                if len(set(port_sums)) > 1:
                    # balance ports
                    # init list for keeping track of the current change
                    differences = [cycles / len(ports) for p in ports]
                    for _ in range(int(cycles * (1 / INC))):
                        if len(instr_ports) == 1:
                            # no balancing possible anymore
                            break
                        # move INC cycles from the most to the least loaded port
                        max_port_idx = port_sums.index(max(port_sums))
                        min_port_idx = port_sums.index(min(port_sums))
                        instr_ports[max_port_idx] -= INC
                        instr_ports[min_port_idx] += INC
                        differences[max_port_idx] -= INC
                        differences[min_port_idx] += INC
                        # instr_ports = [round(p, 2) for p in instr_ports]
                        self._itemsetter(*indices)(instruction_form.port_pressure, *instr_ports)
                        # check if min port is zero
                        if round(min(instr_ports), 2) <= 0:
                            # if port_pressure is not exactly 0.00, add the residual to
                            # the former port
                            if min(instr_ports) != 0.0:
                                min_port_idx = port_sums.index(min(port_sums))
                                instr_ports[min_port_idx] += min(instr_ports)
                                differences[min_port_idx] += min(instr_ports)
                                # we don't need to decrease difference for other port, just
                                # delete it
                                del differences[instr_ports.index(min(instr_ports))]
                                self._itemsetter(*indices)(
                                    instruction_form.port_pressure, *instr_ports
                                )
                            zero_index = [
                                p
                                for p in indices
                                if round(instruction_form.port_pressure[p], 2) == 0
                                or instruction_form.port_pressure[p] < 0.00
                            ][0]
                            instruction_form.port_pressure[zero_index] = 0.0
                            # Remove from further balancing
                            indices = [p for p in indices if instruction_form.port_pressure[p] > 0]
                            instr_ports = self._to_list(
                                itemgetter(*indices)(instruction_form.port_pressure)
                            )
                        # never remove more than the fixed utilization per uop and port, i.e.,
                        # cycles/len(ports)
                        if round(min(differences), 2) <= 0:
                            # don't worry if port_pressure isn't exactly 0 and just
                            # remove from further balancing by deleting index since
                            # pressure is not 0
                            del indices[differences.index(min(differences))]
                            instr_ports = self._to_list(
                                itemgetter(*indices)(instruction_form.port_pressure)
                            )
                            del differences[differences.index(min(differences))]
                        port_sums = self._to_list(
                            itemgetter(*indices)(self.get_throughput_sum(kernel))
                        )
        kernel.reverse()
        # NOTE(review): ``multiple_assignments`` is reset at the top of every loop iteration, so
        # at this point it only reflects the last instruction form that was processed.
        if multiple_assignments:
            if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
                for i, instr in enumerate(best_kernel):
                    kernel[i].port_uops = best_kernel[i].port_uops
                    kernel[i].port_pressure = best_kernel[i].port_pressure

    def set_hidden_loads(self, kernel):
        """
        Hide loads behind stores if architecture supports hidden loads (deprecated).

        Instruction forms that both load and store are ignored. If there are at most as many
        loads as stores, every load is hidden; otherwise, for each store, the not yet hidden
        load with the smallest line-number distance is hidden. A hidden load gets the
        ``HIDDEN_LD`` flag and its data-port pressure is set to zero.

        :param list kernel: kernel whose instruction forms are modified in place
        """
        self._check_normalized(kernel)
        loads = [instr for instr in kernel if INSTR_FLAGS.HAS_LD in instr.flags]
        stores = [instr for instr in kernel if INSTR_FLAGS.HAS_ST in instr.flags]
        # Filter instructions including load and store
        load_ids = [instr.line_number for instr in loads]
        store_ids = [instr.line_number for instr in stores]
        shared_ldst = list(set(load_ids).intersection(set(store_ids)))
        loads = [instr for instr in loads if instr.line_number not in shared_ldst]
        stores = [instr for instr in stores if instr.line_number not in shared_ldst]
        if len(stores) == 0 or len(loads) == 0:
            # nothing to do
            return
        if len(loads) <= len(stores):
            # Hide all loads
            for load in loads:
                load.flags += [INSTR_FLAGS.HIDDEN_LD]
                load.port_pressure = self._nullify_data_ports(load.port_pressure)
        else:
            for store in stores:
                # Get 'closest' load instruction
                min_distance_load = min(
                    [
                        (
                            abs(load_instr.line_number - store.line_number),
                            load_instr.line_number,
                        )
                        for load_instr in loads
                        if INSTR_FLAGS.HIDDEN_LD not in load_instr.flags
                    ]
                )
                load = [instr for instr in kernel if instr.line_number == min_distance_load[1]][0]
                # Hide load
                load.flags += [INSTR_FLAGS.HIDDEN_LD]
                load.port_pressure = self._nullify_data_ports(load.port_pressure)

    # get parser result and assign throughput and latency value to instruction form
    # mark instruction form with semantic flags
    def assign_tp_lt(self, instruction_form):
        """
        Assign throughput and latency to an instruction form.

        Sets ``port_pressure``, ``throughput``, ``latency``, ``latency_wo_load``,
        ``latency_cp`` and ``latency_lcd`` on ``instruction_form``, sets ``port_uops`` whenever
        performance data is available, and extends its ``flags``. If the instruction form is
        not in the machine model but has a load and/or store, the register-only variant is
        looked up and combined with the model's load/store data. If nothing is found, zero
        values are assigned and the form is flagged ``TP_UNKWN``/``LT_UNKWN``.

        :param instruction_form: normalized instruction form, modified in place
        """
        instruction_form.check_normalized()
        flags = []
        port_number = len(self._machine_model["ports"])
        if instruction_form.mnemonic is None:
            # No instruction (label, comment, ...) --> ignore
            throughput = 0.0
            latency = 0.0
            latency_wo_load = latency
            instruction_form.port_pressure = [0.0 for i in range(port_number)]
            instruction_form.port_uops = []
        else:
            instruction_data = self._machine_model.get_instruction(
                instruction_form.mnemonic, instruction_form.operands
            )
            if instruction_data:
                # instruction form in DB
                (
                    throughput,
                    port_pressure,
                    latency,
                    latency_wo_load,
                ) = self._handle_instruction_found(
                    instruction_data, port_number, instruction_form, flags
                )
            else:
                # instruction could not be found in DB
                assign_unknown = True
                # check for equivalent register-operands DB entry if LD
                if (
                    INSTR_FLAGS.HAS_LD in instruction_form.flags
                    or INSTR_FLAGS.HAS_ST in instruction_form.flags
                ):
                    # dynamically combine LD/ST and reg form of instruction form
                    # substitute mem and look for reg-only variant
                    operands = self.substitute_mem_address(instruction_form.operands)
                    instruction_data_reg = self._machine_model.get_instruction(
                        instruction_form.mnemonic, operands
                    )
                    if instruction_data_reg:
                        assign_unknown = False
                        reg_type = self._parser.get_reg_type(
                            instruction_data_reg.operands[
                                operands.index(self._create_reg_wildcard())
                            ]
                        )
                        # dummy_reg = {"class": "register", "name": reg_type}
                        dummy_reg = RegisterOperand(name=reg_type)
                        data_port_pressure = [0.0 for _ in range(port_number)]
                        data_port_uops = []
                        if INSTR_FLAGS.HAS_LD in instruction_form.flags:
                            # LOAD performance data
                            load_perf_data = self._machine_model.get_load_throughput(
                                [
                                    x
                                    for x in instruction_form.semantic_operands["source"]
                                    + instruction_form.semantic_operands["src_dst"]
                                    if isinstance(x, MemoryOperand)
                                ][0]
                            )
                            # if multiple options, choose based on reg type
                            data_port_uops = [
                                ldp[1]
                                for ldp in load_perf_data
                                if ldp[0].dst is not None
                                and self._machine_model._check_operands(
                                    dummy_reg, RegisterOperand(name=ldp[0].dst)
                                )
                            ]
                            if len(data_port_uops) < 1:
                                data_port_uops = load_perf_data[0][1]
                            else:
                                data_port_uops = data_port_uops[0]
                            data_port_pressure = self._machine_model.average_port_pressure(
                                data_port_uops
                            )
                            if "load_throughput_multiplier" in self._machine_model:
                                multiplier = self._machine_model["load_throughput_multiplier"][
                                    reg_type
                                ]
                                data_port_pressure = [pp * multiplier for pp in data_port_pressure]
                        if INSTR_FLAGS.HAS_ST in instruction_form.flags:
                            # STORE performance data
                            destinations = (
                                instruction_form.semantic_operands["destination"]
                                + instruction_form.semantic_operands["src_dst"]
                            )
                            store_perf_data = self._machine_model.get_store_throughput(
                                [x for x in destinations if isinstance(x, MemoryOperand)][0],
                                dummy_reg,
                            )
                            st_data_port_uops = store_perf_data[0][1]
                            # zero data port pressure and remove HAS_ST flag if
                            #  - no mem operand in dst &&
                            #  - all mem operands in src_dst are pre-/post_indexed
                            # since it is no mem store
                            # The destination entry is a sequence of operands, so each element
                            # has to be inspected; an isinstance() check on the sequence itself
                            # can never match a MemoryOperand.
                            if (
                                self._parser.isa() == "aarch64"
                                and not any(
                                    isinstance(op, MemoryOperand)
                                    for op in instruction_form.semantic_operands["destination"]
                                )
                                and all(
                                    [
                                        op.post_indexed or op.pre_indexed
                                        for op in instruction_form.semantic_operands["src_dst"]
                                        if isinstance(op, MemoryOperand)
                                    ]
                                )
                            ):
                                st_data_port_uops = []
                                instruction_form.flags.remove(INSTR_FLAGS.HAS_ST)
                            # sum up all data ports in case for LOAD and STORE
                            st_data_port_pressure = self._machine_model.average_port_pressure(
                                st_data_port_uops
                            )
                            if "store_throughput_multiplier" in self._machine_model:
                                multiplier = self._machine_model["store_throughput_multiplier"][
                                    reg_type
                                ]
                                st_data_port_pressure = [
                                    pp * multiplier for pp in st_data_port_pressure
                                ]
                            data_port_pressure = [
                                sum(x) for x in zip(data_port_pressure, st_data_port_pressure)
                            ]
                            # Build a new list instead of extending in place: data_port_uops may
                            # be the very list object handed out by get_load_throughput().
                            data_port_uops = list(chain(data_port_uops, st_data_port_uops))
                        throughput = max(max(data_port_pressure), instruction_data_reg.throughput)
                        latency = instruction_data_reg.latency
                        # Add LD and ST latency
                        latency += (
                            self._machine_model.get_load_latency(reg_type)
                            if INSTR_FLAGS.HAS_LD in instruction_form.flags
                            else 0
                        )
                        latency += (
                            self._machine_model.get_store_latency(reg_type)
                            if INSTR_FLAGS.HAS_ST in instruction_form.flags
                            else 0
                        )
                        latency_wo_load = instruction_data_reg.latency
                        # add latency of ADD if post- or pre_indexed load
                        # TODO more investigation: check dot-graph, wrong latency distribution!
                        # if (
                        #     latency_wo_load == 0
                        #     and self._isa == 'aarch64'
                        #     and any(
                        #         [
                        #             'post_indexed' in op['memory'] or
                        #             'pre_indexed' in op['memory']
                        #             for op in instruction_form.operands
                        #             if 'memory' in op
                        #         ]
                        #     )
                        # ):
                        #     latency_wo_load = 1.0
                        instruction_form.port_pressure = [
                            sum(x)
                            for x in zip(
                                data_port_pressure,
                                self._machine_model.average_port_pressure(
                                    instruction_data_reg.port_pressure
                                ),
                            )
                        ]
                        instruction_form.port_uops = list(
                            chain(instruction_data_reg.port_pressure, data_port_uops)
                        )
                if assign_unknown:
                    # --> mark as unknown and assume 0 cy for latency/throughput
                    throughput = 0.0
                    latency = 0.0
                    latency_wo_load = latency
                    instruction_form.port_pressure = [0.0 for i in range(port_number)]
                    # instruction_formport_uops = []
                    flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
        # flatten flag list
        flags = list(set(flags))
        if instruction_form.flags == []:
            instruction_form.flags = flags
        else:
            instruction_form.flags += flags
        instruction_form.throughput = throughput
        instruction_form.latency = latency
        instruction_form.latency_wo_load = latency_wo_load
        # for later CP and loop-carried dependency analysis
        instruction_form.latency_cp = 0
        instruction_form.latency_lcd = 0

    def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
        """
        Apply performance data to instruction if it was found in the archDB.

        :param instruction_data: machine model entry providing ``throughput``, ``latency`` and
            ``port_pressure``
        :param int port_number: expected length of the averaged port pressure list
        :param instruction_form: instruction form to annotate (modified in place)
        :param list flags: flag list that is extended in place
        :returns: tuple ``(throughput, port_pressure, latency, latency_wo_load)``; note that
            ``port_pressure`` is the averaged value from the machine model even if it failed
            validation and the instruction form was assigned zero pressure instead
        """
        instruction_form.check_normalized()
        throughput = instruction_data.throughput
        port_pressure = self._machine_model.average_port_pressure(instruction_data.port_pressure)
        instruction_form.port_uops = instruction_data.port_pressure
        # Explicit validation instead of ``assert``: assertions are removed when Python runs
        # with -O, which would silently skip this sanity check.
        if isinstance(port_pressure, list) and len(port_pressure) == port_number:
            instruction_form.port_pressure = port_pressure
            if sum(port_pressure) == 0 and throughput is not None:
                # port pressure on all ports 0 --> not bound to a port
                flags.append(INSTR_FLAGS.NOT_BOUND)
        else:
            warnings.warn(
                "Port pressure could not be imported correctly from database. "
                + "Please check entry for:\n {}".format(instruction_form)
            )
            instruction_form.port_pressure = [0.0 for i in range(port_number)]
            instruction_form.port_uops = []
            flags.append(INSTR_FLAGS.TP_UNKWN)
        if throughput is None:
            # assume 0 cy and mark as unknown
            throughput = 0.0
            flags.append(INSTR_FLAGS.TP_UNKWN)
        latency = instruction_data.latency
        latency_wo_load = latency
        if latency is None:
            # assume 0 cy and mark as unknown
            latency = 0.0
            latency_wo_load = latency
            flags.append(INSTR_FLAGS.LT_UNKWN)
        if INSTR_FLAGS.HAS_LD in instruction_form.flags:
            flags.append(INSTR_FLAGS.LD)
        return throughput, port_pressure, latency, latency_wo_load

    def convert_op_to_reg(self, reg_type, regtype="0"):
        """
        Create register operand for a memory addressing operand.

        :param str reg_type: register type; used as name prefix on x86 (``"gpr"`` is mapped to
            an ``r<N>`` register) and as ``prefix`` on aarch64
        :param str regtype: register number as string
        :returns: the created :class:`RegisterOperand`
        :raises ValueError: if the parser's ISA is neither ``"x86"`` nor ``"aarch64"``
        """
        isa = self._parser.isa()
        if isa == "x86":
            if reg_type == "gpr":
                register = RegisterOperand(name="r" + str(int(regtype) + 9))
            else:
                register = RegisterOperand(name=reg_type + regtype)
        elif isa == "aarch64":
            register = RegisterOperand(name=regtype, prefix=reg_type)
        else:
            # previously this fell through to an UnboundLocalError on the return statement
            raise ValueError("Unsupported ISA for register conversion: {!r}".format(isa))
        return register

    def _nullify_data_ports(self, port_pressure):
        """
        Set all ports to 0.0 for the data ports of a machine model.

        :param list port_pressure: per-port pressure list, modified in place
        :returns: the same (modified) list
        """
        data_ports = self._machine_model.get_data_ports()
        for port in data_ports:
            index = self._machine_model.get_ports().index(port)
            port_pressure[index] = 0.0
        return port_pressure

    def _itemsetter(self, *items):
        """
        Return a callable that assigns values to the given indices/keys of an object.

        Counterpart to :func:`operator.itemgetter`: ``_itemsetter(i, j)(obj, a, b)`` performs
        ``obj[i] = a; obj[j] = b``. With a single item the callable takes exactly one value.
        """
        if len(items) == 1:
            item = items[0]

            def g(obj, value):
                obj[item] = value

        else:

            def g(obj, *values):
                for item, value in zip(items, values):
                    obj[item] = value

        return g

    def _to_list(self, obj):
        """Return a tuple as list and wrap any other object into a one-element list."""
        if isinstance(obj, tuple):
            return list(obj)
        else:
            return [obj]

    @staticmethod
    def get_throughput_sum(kernel):
        """
        Get the overall throughput sum separated by port of all instructions of a kernel.

        :param list kernel: instruction forms providing ``throughput`` and ``port_pressure``
        :returns: list with one summed pressure per port, rounded to two decimals
        """
        # ignoring all lines with throughput == 0.0, because there won't be anything to sum up
        # typically comment, label and non-instruction lines
        port_pressures = [instr.port_pressure for instr in kernel if instr.throughput != 0.0]
        # Essentially summing up each columns of port_pressures, where each column is one port
        # and each row is one line of the kernel
        # round is necessary to ensure termination of ArchSemantics.assign_optimal_throughput
        tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
        return tp_sum