From 884f82742ef0e5fe9283717bcf980af51fe37ed3 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Sat, 15 Nov 2025 23:40:47 +0100 Subject: [PATCH] Initial support for Arrow Lake --- kernelNanoBench.py | 4 +- tools/cpuBench/cpuBench.py | 228 +++++++++++++++++++++++------- tools/cpuBench/simpleHTMLTable.py | 11 +- 3 files changed, 187 insertions(+), 56 deletions(-) diff --git a/kernelNanoBench.py b/kernelNanoBench.py index 52aad75..e0b1e27 100644 --- a/kernelNanoBench.py +++ b/kernelNanoBench.py @@ -7,8 +7,6 @@ import sys from collections import OrderedDict from shutil import copyfile -from x64_lib import * - PFC_START_ASM = '.quad 0xE0B513B1C2813F04' PFC_STOP_ASM = '.quad 0xF0B513B1C2813F04' @@ -49,7 +47,7 @@ def assemble(code, objFile, asmFile='/tmp/ramdisk/asm.s'): if ('same type of prefix used twice' in e.output.decode()) and ('REX64' in code): return assemble(code.replace('REX64 ', ''), objFile, asmFile) elif "register type mismatch for `lsl'" in e.output.decode(): - code, n = re.subn(r'(LSL \S*, )(\S*?);', lambda m: f'{m.group(1)}{regToSize(m.group(2),16)};', code) + code, n = re.subn(r'(LSL \S*, )E?(\S*?)(D?);', lambda m: f'{m.group(1)}{m.group(2)}{m.group(3).replace("D", "W")};', code) if n > 0: return assemble(code, objFile, asmFile) print(f"Error (assemble): {str(e)}", file=sys.stderr) diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index ae50289..c061f86 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -224,7 +224,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, elif arch in ['NHM', 'WSM', 'BNL', 'GLM', 'GLP']: evt = 'UOPS_RETIRED.ANY' elif arch in ['SNB', 'SLM', 'AMT', 'ADL-E', 'MTL-E']: evt = 'UOPS_RETIRED.ALL' elif arch in ['HSW']: evt = 'UOPS_EXECUTED.CORE' - elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: evt = 'UOPS_EXECUTED.THREAD' + elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 
'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P', 'ARL-P']: evt = 'UOPS_EXECUTED.THREAD' elif arch in ['TRM']: evt = 'TOPDOWN_RETIRING.ALL' localHtmlReports.append('
  • ' + evt + ': ' + str(value) + '
  • \n') localHtmlReports.append('\n') @@ -279,17 +279,18 @@ def getEventConfig(event): if arch in ['BNL', 'SLM', 'AMT']: return 'C2.10' # UOPS_RETIRED.ANY if arch in ['HSW']: return 'B1.02' # UOPS_EXECUTED.CORE; note: may undercount due to erratum HSD30 if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: return 'B1.01' # UOPS_EXECUTED.THREAD + if arch in ['ARL-P']: return 'B1.01.CTR=3' # UOPS_EXECUTED.THREAD if arch in ['ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5']: return '0C1.00' if event == 'RETIRE_SLOTS': - if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: return 'C2.02' + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return 'C2.02' if event == 'UOPS_MITE': - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: return '79.04' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return '79.04' if event == 'UOPS_MITE>=1': - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: return '79.04.CMSK=1' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return '79.04.CMSK=1' if event == 'UOPS_MS': if arch in ['NHM', 'WSM']: return 'D1.02' if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '79.30' - if arch in ['ADL-P', 'EMR', 'MTL-P']: return '79.20' + if arch in ['ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return '79.20' if arch in ['SLM', 'AMT', 'GLM', 'GLP', 'TRM', 'ADL-E', 'MTL-E']: return 'C2.01' if arch in ['BNL']: return 'A9.01' # 
undocumented, but seems to work if event == 'UOPS_PORT_0': @@ -341,13 +342,37 @@ def getEventConfig(event): if arch in ['ADL-P', 'EMR', 'MTL-P']: return 'B2.20.CMSK=2' if event == 'UOPS_PORT_23A': if arch in ['ADL-P', 'EMR', 'MTL-P']: return 'B2.04' + if event == 'UOPS_DISPATCHED.INT_EU_ALL': + if arch in ['ARL-P']: return 'B2.01.CTR=2' + if event == 'UOPS_DISPATCHED.ALU': + if arch in ['ARL-P']: return 'B2.02.CTR=2' + if event == 'UOPS_DISPATCHED.LD': + if arch in ['ARL-P']: return 'B2.04' + if event == 'UOPS_DISPATCHED.SLOW': + if arch in ['ARL-P']: return 'B2.08' + if event == 'UOPS_DISPATCHED.STD': + if arch in ['ARL-P']: return 'B2.10' + if event == 'UOPS_DISPATCHED.SHIFT': + if arch in ['ARL-P']: return 'B2.20' + if event == 'UOPS_DISPATCHED.JMP': + if arch in ['ARL-P']: return 'B2.40' + if event == 'UOPS_DISPATCHED.STA': + if arch in ['ARL-P']: return 'B2.80' + if event == 'UOPS_DISPATCHED.V0': + if arch in ['ARL-P']: return 'B3.01' + if event == 'UOPS_DISPATCHED.V1': + if arch in ['ARL-P']: return 'B3.02' + if event == 'UOPS_DISPATCHED.V2': + if arch in ['ARL-P']: return 'B3.04' + if event == 'UOPS_DISPATCHED.V3': + if arch in ['ARL-P']: return 'B3.08' if event == 'DIV_CYCLES': if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return '14.01' # undocumented on HSW, but seems to work if arch in ['ICL', 'TGL', 'RKL']: return '14.09' if arch in ['ZEN+', 'ZEN2', 'ZEN3', 'ZEN4', 'ZEN5']: return '0D3.00' - if arch in ['ADL-P', 'EMR', 'MTL-P']: return 'B0.09.CMSK=1' + if arch in ['ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return 'B0.09.CMSK=1' if event == 'ILD_STALL.LCP': - if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P']: return '87.01' + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P', 'EMR', 'MTL-P', 'ARL-P']: return '87.01' if event == 'INST_DECODED.DEC0': 
if arch in ['NHM', 'WSM']: return '18.01' if event == 'FpuPipeAssignment.Total0': @@ -407,7 +432,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, commonReg = None if not useDistinctRegs: commonRegs = findCommonRegisters(instrNode) - commonRegs -= set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs|(memRegs if hasMemOperand else set()) + commonRegs -= set(map(getCanonicalReg, set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs|(memRegs if hasMemOperand else set()))) if commonRegs: commonReg = sortRegs(commonRegs)[0] @@ -543,7 +568,7 @@ def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr): writeFile(fileName, asm) -def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstrRep, blockedPorts, config, htmlReports): +def getUopsOnBlockedPorts(instrNode, blockInstrNode, blockInstrRep, blockedPorts, config, htmlReports): instrInstance = config.independentInstrs[0] instr = instrInstance.asm readRegs = instrInstance.readRegs @@ -600,6 +625,8 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr if arch in ['NHM', 'WSM']: # Needed for workaround for broken port 5 counter events = ['UOPS_PORT_'+str(p) for p in range(0,6)] + ['UOPS'] + elif arch in ['ARL-P']: + events = ['UOPS_DISPATCHED.V0', 'UOPS_DISPATCHED.V1', 'UOPS_DISPATCHED.V2', 'UOPS_DISPATCHED.V3'] else: events = ['UOPS_PORT_'+str(p) for p in blockedPorts] @@ -637,11 +664,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr measurementResult['UOPS_PORT_5'] = measurementResult['UOPS_PORT_5B'] del measurementResult['UOPS_PORT_5B'] - if isIntelCPU(): - ports_dict = {p[10:]: i for p, i in measurementResult.items() if p.startswith('UOPS_PORT')} - else: - ports_dict = {p[23:]: i for p, i in measurementResult.items() if 'FpuPipeAssignment.Total' in p} - + ports_dict = {getPortNameFromEventName(p): i for p, i in measurementResult.items() if getPortNameFromEventName(p) is not None} 
if sum(ports_dict.values()) < blockInstrRep-.5: # something went wrong; fewer uops on ports than blockInstrRep # happens, e.g., on SKX for ports {0, 1} if AVX-512 is active @@ -650,6 +673,48 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr return int(.2+sum([uops for p, uops in ports_dict.items() if p in blockedPorts])) - blockInstrRep +# Example return value: {'ALU': 2, 'LD': 1, 'INT_OTHER': 2} +def getUopTypes(instrNode, config, lfenceUopTypeDict, htmlReports): + htmlReports.append('

    With lfence (to avoid incorrect counts due to replays)

    ') + + if arch in ['ARL-P']: + events = ['UOPS_DISPATCHED.INT_EU_ALL', 'UOPS_DISPATCHED.ALU', 'UOPS_DISPATCHED.LD', 'UOPS_DISPATCHED.SLOW', 'UOPS_DISPATCHED.STD', + 'UOPS_DISPATCHED.SHIFT', 'UOPS_DISPATCHED.JMP', 'UOPS_DISPATCHED.STA'] + else: + raise RuntimeError(f"getUopTypes() does not support {arch}") + configurePFCs(events) + + instrInstance = config.independentInstrs[0] + init = instrInstance.regMemInit + config.init + + htmlReports.append('\n') + + if config.preInstrCode: + htmlReports.append('\n') + for ev in events: + measurementResult[ev] -= preInstrResult[ev] + + uopTypeDict = {t.replace('UOPS_DISPATCHED.', ''): int(i + .2) for t, i in measurementResult.items() if t in events} + intAll = uopTypeDict['INT_EU_ALL'] + del uopTypeDict['INT_EU_ALL'] + uopTypeDict['INT_OTHER'] = intAll - uopTypeDict['ALU'] - uopTypeDict['SLOW'] - uopTypeDict['SHIFT'] - uopTypeDict['JMP'] + if uopTypeDict['INT_OTHER'] < 0: + print((f"unexpected uopTypeDict {config.preInstrCode} {instrInstance.asm} {measurementResult}")) + return {} + + if lfenceUopTypeDict: + for t in uopTypeDict: + uopTypeDict[t] = uopTypeDict[t] - lfenceUopTypeDict[t] + + return uopTypeDict + + # Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of # calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep. 
def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2): @@ -1057,6 +1122,16 @@ def fancyRound(cycles): return round(cycles, 2) +def getPortNameFromEventName(evtName: str) -> str: + if evtName.startswith('UOPS_PORT'): + return evtName[10:] + elif evtName.startswith('UOPS_DISPATCHED.V'): + return evtName[17:] + elif evtName.startswith('FpuPipeAssignment.Total'): + return evtName[23:] + return None + + TPResult = namedtuple('TPResult', ['TP', 'TP_loop', 'TP_noLoop', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'uops_MITE', 'uops_MS', 'divCycles', 'ILD_stalls', 'complexDec', 'nAvailableSimpleDecoders', 'config', 'unblocked_ports', 'all_used_ports']) @@ -1138,10 +1213,10 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports else: divCycles = 0 - return TPResult(minTP, minTP, minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, None, config, - ports_dict, all_used_ports) + return TPResult(TP=minTP, TP_loop=minTP, TP_noLoop=minTP, TP_noDepBreaking_noLoop=minTP_noDepBreaking_noLoop, TP_single=minTP_single, uops=unfused_uops, + fused_uops=fused_uops, uops_MITE=None, uops_MS=None, divCycles=divCycles, ILD_stalls=0, complexDec=False, nAvailableSimpleDecoders=None, + config=config, unblocked_ports=ports_dict, all_used_ports=all_used_ports) else: - hasMemWriteOperand = len(instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'))>0 uops = None uopsFused = None uopsMITE = None @@ -1249,8 +1324,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports if not useDepBreakingInstrs: minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles) for p, i in result.items(): - if (i/ic > .1) and (('UOPS_PORT' in p) or ('FpuPipeAssignment.Total' in p)): - all_used_ports.add(p[10:] if ('UOPS_PORT' in p) else p[23:]) + if (i/ic > .1) and 
(getPortNameFromEventName(p) is not None): + all_used_ports.add(getPortNameFromEventName(p)) else: minTP_loop = min(minTP_loop, cycles) @@ -1258,11 +1333,9 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports minConfig = config minTP_single = min(minTP_single, cycles) - if isIntelCPU(): - ports_dict = {p[10:]: i for p, i in result.items() if 'UOPS_PORT' in p} - elif isAMDCPU() and not instrNode.attrib['extension'] == 'BASE': - # We ignore BASE instructions, as they sometimes wrongly count floating point uops - ports_dict = {p[23:]: i for p, i in result.items() if 'FpuPipeAssignment.Total' in p} + if not isAMDCPU() or not instrNode.attrib['extension'] == 'BASE': + # We ignore BASE instructions for AMD, as they sometimes wrongly count floating point uops + ports_dict = {getPortNameFromEventName(p): i for p, i in result.items() if getPortNameFromEventName(p) is not None} uops = int(result['UOPS']+.2) if 'RETIRE_SLOTS' in result: @@ -1300,8 +1373,9 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports htmlReports.append('') if minTP < sys.maxsize: - return TPResult(minTP, minTP_loop, minTP_noLoop, minTP_noDepBreaking_noLoop, minTP_single, uops, uopsFused, uopsMITE, uopsMS, divCycles, ILD_stalls, - complexDec, nAvailableSimpleDecoders, minConfig, ports_dict, all_used_ports) + return TPResult(TP=minTP, TP_loop=minTP_loop, TP_noLoop=minTP_noLoop, TP_noDepBreaking_noLoop=minTP_noDepBreaking_noLoop, TP_single=minTP_single, + uops=uops, fused_uops=uopsFused, uops_MITE=uopsMITE, uops_MS=uopsMS, divCycles=divCycles, ILD_stalls=ILD_stalls, complexDec=complexDec, + nAvailableSimpleDecoders=nAvailableSimpleDecoders, config=minConfig, unblocked_ports=ports_dict, all_used_ports=all_used_ports) def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports): @@ -1359,7 +1433,7 @@ def getBasicLatencies(instrNodeList): for flag in STATUSFLAGS_noAF: testSetResult = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL') # we 
additionally test with a nop, as the result may be higher than the actual latency (e.g., on ADL-P), probably due to non-optimal port assignments - testSetResultNop = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL; NOP') + testSetResultNop = runExperiment(None, 'TEST AL, AL; NOP; SET' + flag[0] + ' AL;') testSetCycles = min(int(testSetResult['Core cycles'] + .2), int(testSetResultNop['Core cycles'] + .2)) if testSetCycles == 2: @@ -3110,7 +3184,9 @@ def main(): else: configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT_0', 'UOPS_PORT_1', 'UOPS_PORT_2', 'UOPS_PORT_3', 'UOPS_PORT_4', 'UOPS_PORT_5', 'UOPS_PORT_6', 'UOPS_PORT_7', 'UOPS_PORT_23', 'UOPS_PORT_49', 'UOPS_PORT_78', 'UOPS_PORT_5B', 'UOPS_PORT_5B>=2', - 'UOPS_PORT_23A', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0', 'UOPS_MITE>=1']) + 'UOPS_PORT_23A', 'UOPS_DISPATCHED.INT_EU_ALL', 'UOPS_DISPATCHED.ALU', 'UOPS_DISPATCHED.LOAD', 'UOPS_DISPATCHED.SLOW', + 'UOPS_DISPATCHED.STD', 'UOPS_DISPATCHED.SHIFT', 'UOPS_DISPATCHED.JMP', 'UOPS_DISPATCHED.STA', 'UOPS_DISPATCHED.V0', + 'UOPS_DISPATCHED.V1', 'UOPS_DISPATCHED.V2', 'UOPS_DISPATCHED.V3', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0', 'UOPS_MITE>=1']) try: subprocess.check_output('mkdir -p /tmp/ramdisk; sudo mount -t tmpfs -o size=100M none /tmp/ramdisk/', shell=True) @@ -3255,6 +3331,9 @@ def main(): portCombinationsResultDict = {} portCombinationsResultDictSameReg = {} portCombinationsResultDictIndexedAddr = {} + uopTypeResultDict = {} + uopTypeResultDictSameReg = {} + uopTypeResultDictIndexedAddr = {} if not args.noPorts: for instr, tpResult in tpDict.items(): @@ -3374,7 +3453,11 @@ def main(): sortedPortCombinationsNonAVX = sorted(blockingInstructionsDictNonAVX.keys(), key=lambda x:(len(x), sorted(x))) sortedPortCombinationsNonSSE = sorted(blockingInstructionsDictNonSSE.keys(), key=lambda x:(len(x), sorted(x))) - print('sortedPortCombinations: ' + str(sortedPortCombinationsNonAVX)) + print('sortedPortCombinationsNonAVX: ' 
+ str(sortedPortCombinationsNonAVX)) + print('sortedPortCombinationsNonSSE: ' + str(sortedPortCombinationsNonSSE)) + + if arch in ['ARL-P']: + lfenceUopTypeDict = getUopTypes(instrNodeDict['LFENCE'], TPConfig(independentInstrs=[InstrInstance(None, '', [], [], {}, [])]), None, []) for i, instrNode in enumerate(sorted(tpDict.keys(), key=lambda x: (len(tpDict[x].config.preInstrNodes), x.attrib['string']))): #if not 'CVTPD2PI' in instrNode.attrib['string']: continue @@ -3401,6 +3484,17 @@ def main(): if not useIACA and tpResult.config.preInstrNodes: rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) + if arch in ['ARL-P']: + uopTypeDict = getUopTypes(instrNode, tpResult.config, lfenceUopTypeDict, htmlReports) + print(f"{instrNode.attrib['string']}: {uopTypeDict}") + if not useDistinctRegs: + uopTypeResultDictSameReg[instrNode] = uopTypeDict + elif useIndexedAddr: + uopTypeResultDictIndexedAddr[instrNode] = uopTypeDict + else: + uopTypeResultDict[instrNode] = uopTypeDict + rem_uops -= sum(uopTypeDict.values()) + used_ports = tpResult.all_used_ports if debugOutput: print(instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports)) @@ -3421,6 +3515,7 @@ def main(): if used_ports.issubset(combination): uopsCombinationList = [(combination, 1)] htmlReports.append('
    Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in combination)) + rem_uops = 0 break elif (rem_uops > 0) and (arch not in ['ZEN+', 'ZEN2']): for combination in sortedPortCombinations: @@ -3445,7 +3540,7 @@ def main(): nPortsInComb = sum(len(str(x)) for x in combination) blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10) blockInstrRep = min(blockInstrRep, 100) - uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports) + uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports) if uopsOnBlockedPorts is None: #print('no uops on blocked ports: ' + str(combination)) continue @@ -3474,6 +3569,9 @@ def main(): rem_uops -= uopsOnBlockedPorts if rem_uops <= 0: break + if arch in ['ARL-P'] and rem_uops > 0: + uopTypeDict['UNKNOWN'] = rem_uops + # on ICL, some combinations (e.g. 
{4,9}) are treated as one port (49) above, as there is only a single counter for both ports # we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate uopsCombinationList = [(frozenset(''.join(comb)), uops) for comb, uops in uopsCombinationList] @@ -3499,18 +3597,18 @@ def main(): else: resultNode = archNode.find('./measurement') - applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode, None), '')] - for otherTPDict, otherPCDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, '_same_reg'), - (tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, '_indexed')]: + applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode), uopTypeResultDict.get(instrNode, {}), '')] + for otherTPDict, otherPCDict, otherUopTypeDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, uopTypeResultDictSameReg, '_same_reg'), + (tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, uopTypeResultDictIndexedAddr, '_indexed')]: if instrNode in otherTPDict: - t1 = tpDict[instrNode] + t1, p1, u1, _ = applicableResults[0] t2 = otherTPDict[instrNode] - p1 = portCombinationsResultDict.get(instrNode, None) - p2 = otherPCDict.get(instrNode, None) - if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or t1.uops_MITE != t2.uops_MITE or ((p2 is not None) and (p1 != p2))): - applicableResults.append((t2, p2, suffix)) + p2 = otherPCDict.get(instrNode) + u2 = otherUopTypeDict.get(instrNode, {}) + if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or t1.uops_MITE != t2.uops_MITE or ((p2 is not None) and (p1 != p2)) or (u1 != u2)): + applicableResults.append((t2, p2, u2, suffix)) - for tpResult, portUsageList, suffix in applicableResults: + for tpResult, portUsageList, uopTypeDict, suffix in applicableResults: uops = tpResult.uops uopsFused = tpResult.fused_uops uopsMITE = tpResult.uops_MITE @@ -3553,15 +3651,49 @@ def main(): divCycles = tpResult.divCycles if divCycles: 
resultNode.attrib['div_cycles'+suffix] = str(divCycles) - portPrefix = ('p' if isIntelCPU() else 'FP') - computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(p for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0]))) - if portUsageList: - resultNode.attrib['ports'+suffix] = computePortStr(portUsageList) - try: - resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageList) - except ValueError as err: - print('Could not solve LP for ' + instrNode.attrib['string'] + ':') - print(err) + + def computePortStr(lst): + portPrefix = '' + if isIntelCPU() and not arch in ['ARL-P']: + portPrefix = 'p' + elif arch in ['ARL-P']: + portPrefix = 'V' + elif isAMDCPU(): + portPrefix = 'FP' + elements = [] + for c, uops in sorted(lst, key=lambda x: sorted(x[0])): + elements.append(f"{uops}*{portPrefix}{''.join(p for p in sorted(c))}") + return '+'.join(elements) + + uopTypePortMapping = { + 'ARL-P': {'ALU': {'I0', 'I1', 'I2', 'I3', 'I4', 'I5'}, + 'JMP': {'I0', 'I1', 'I2'}, + 'MUL': {'I3', 'I4', 'I5'}, + 'SHIFT': {'I3', 'I4', 'I5'}, + 'SLOW': {'I3'}, + 'LD': {'M0', 'M1', 'M2'}, + 'STA': {'M3', 'M4', 'M5'}, + 'STD': {'D0', 'D1'}, + 'INT_OTHER': {}, + 'UNKNOWN': {}, + } + } + portUsageForLP = list(portUsageList or []) + uopTypeStrList = [] + for t, n in sorted(uopTypeDict.items()): + if n > 0: + uopTypeStrList.append(f'{n}*{t}') + portUsageForLP.append((frozenset(uopTypePortMapping[arch][t]), n)) + + portStr = '+'.join(uopTypeStrList + ([computePortStr(portUsageList)] if portUsageList else [])) + if portStr: + resultNode.attrib['ports'+suffix] = portStr + if (not uopTypeDict.get('INT_OTHER')) and (not uopTypeDict.get('UNKNOWN')): + try: + resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageForLP) + except ValueError as err: + print('Could not solve LP for ' + instrNode.attrib['string'] + ':') + print(err) with open(args.output or 'result_'+arch+(('_IACA_' + iacaVersion) if useIACA else '_measured')+'.xml' , "w") as f: 
reparsed = XMLRoot diff --git a/tools/cpuBench/simpleHTMLTable.py b/tools/cpuBench/simpleHTMLTable.py index 33ae905..53635ba 100755 --- a/tools/cpuBench/simpleHTMLTable.py +++ b/tools/cpuBench/simpleHTMLTable.py @@ -4,8 +4,8 @@ import xml.etree.ElementTree as ET import argparse from utils import * -def getLink(instrNode, text, arch, tool, linkType, anchor=None): - url = '/tmp/html-' + linkType + '/' + arch + '/' + canonicalizeInstrString(instrNode.attrib['string']) + '-' + tool + '.html' +def getLink(instrNode, text, arch, tool, linkType, baseDir, anchor=None): + url = baseDir + '/html-' + linkType + '/' + arch + '/' + canonicalizeInstrString(instrNode.attrib['string']) + '-' + tool + '.html' if anchor: url += '#' + anchor return '' + text + '' @@ -13,6 +13,7 @@ def main(): parser = argparse.ArgumentParser(description='Generates a basic HTML table with the results for a microarchitecture') parser.add_argument("-input", help="Input XML file", default='result.xml') parser.add_argument("-arch", help="Consider only this architecture") + parser.add_argument("-base_dir", help="Directory containing HTML files with details", default='/tmp') args = parser.parse_args() root = ET.parse(args.input) @@ -64,7 +65,7 @@ def main(): latTableEntry = getLatencyTableEntry(resultNode) if latTableEntry is not None: lat = str(latTableEntry[0]) - f.write(' ' + getLink(XMLInstr, lat, args.arch, 'Measurements', 'lat') + '\n') + f.write(' ' + getLink(XMLInstr, lat, args.arch, 'Measurements', 'lat', args.base_dir) + '\n') TPPorts = float(resultNode.attrib.get('TP_ports', float("inf"))) TPPortsStr = ("{:.2f}".format(TPPorts) if TPPorts < float("inf") else '') @@ -84,10 +85,10 @@ def main(): color = ' bgcolor="orange"' TPDiff += 1 - f.write(' ' + getLink(XMLInstr, TPMeasuredStr, args.arch, 'Measurements', 'tp') + '\n') + f.write(' ' + getLink(XMLInstr, TPMeasuredStr, args.arch, 'Measurements', 'tp', args.base_dir) + '\n') f.write(' ' + resultNode.attrib.get('uops', '') + '\n') - f.write(' ' + 
getLink(XMLInstr, resultNode.attrib.get('ports', ''), args.arch, 'Measurements', 'ports') + '\n') + f.write(' ' + getLink(XMLInstr, resultNode.attrib.get('ports', ''), args.arch, 'Measurements', 'ports', args.base_dir) + '\n') f.write(' \n') f.write('\n')