From 8f7401178ea29a93b82fcaf6f00e753df3851a60 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Fri, 31 Jul 2020 15:33:40 +0200 Subject: [PATCH] support for indexed addressing modes --- tools/cpuBench/cpuBench.py | 329 ++++++++++++++++++++++--------------- tools/cpuBench/utils.py | 4 +- 2 files changed, 194 insertions(+), 139 deletions(-) diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index 1f38f71..5b56c79 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -37,8 +37,8 @@ instrNodeList = [] # list of all XML instruction nodes that are not filtered out instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo -#R14: reserved for memory addresses -#R13: can be written in init; will not be overwritten by other code +#R14: reserved for memory addresses (base) +#R13: reserved for memory addresses (index) #R15: loop counter specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP', @@ -60,7 +60,6 @@ def isIntelCPU(): return not isAMDCPU() - def getAddrReg(instrNode, opNode): if opNode.attrib.get('suppressed', '0') == '1': return opNode.attrib['base'] @@ -69,6 +68,14 @@ def getAddrReg(instrNode, opNode): else: return 'R14' +def getIndexReg(instrNode, opNode): + if opNode.attrib.get('VSIB', '0') != '0': + return opNode.attrib.get('VSIB') + '14' + elif instrNode.attrib.get('rex', '1') == '0': + return 'RSI' + else: + return 'R13' + # registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that # prefer low registers to high registers def sortRegs(regsList): @@ -76,7 +83,7 @@ def sortRegs(regsList): # Initialize registers and memory -def getRegMemInit(instrNode, opRegDict, memOffset): +def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr): iform = instrNode.attrib['iform'] iclass = instrNode.attrib['iclass'] @@ -124,13 +131,18 @@ def getRegMemInit(instrNode, opRegDict, memOffset): elif 'MM' in regPrefix: init += ['PXOR '+reg+', '+reg] elif opNode.attrib['type'] == 'mem': - if 'VSIB' in opNode.attrib: - vsibReg = opNode.attrib['VSIB'] + '14' - init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg] if xtype.startswith('f'): init += ['MOV RAX, 0x4000000040000000'] for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX'] + for opNode in instrNode.findall('./operand[@type="mem"]'): + if opNode.attrib.get('suppressed', '0') == '1': continue + if 'VSIB' in opNode.attrib: + vsibReg = getIndexReg(instrNode, opNode) + init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg] + elif useIndexedAddr: + init += ['XOR {0}, {0}'.format(getIndexReg(instrNode, opNode))] + return init nExperiments = 0 @@ -144,7 +156,6 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, nExperiments += 1 instrCode = re.sub(';+', '; ', instrCode.strip('; ')) - if debugOutput: print 'instr: ' + instrCode codeObjFile = '/tmp/ramdisk/code.o' assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s') localHtmlReports.append('
  • Code:
    ' + getMachineCode(codeObjFile) + '
  • \n') @@ -178,6 +189,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, nanoBenchCmd += ' -asm_init "' + initCode + '"' localHtmlReports.append('
  • Show nanoBench command
  • \n') + if debugOutput: print nanoBenchCmd setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode) @@ -313,7 +325,8 @@ def configurePFCs(events): InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit']) -def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2, computeRegMemInit=True): +def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2, + computeRegMemInit=True, useIndexedAddr=False): if not doNotWriteRegs: doNotWriteRegs = [] if not doNotReadRegs: doNotReadRegs = [] if not opRegDict: opRegDict = {} @@ -394,14 +407,12 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, if asmprefix != '': asm += ' ' - address = '' - if operandNode.attrib.get('VSIB', '0') != "0": - address = 'R14+' + operandNode.attrib.get('VSIB') + '14' - readRegs.add('R14') - readRegs.add(operandNode.attrib.get('VSIB') + '14') - else: - address = getAddrReg(instrNode, operandNode) - readRegs.add(address) + address = getAddrReg(instrNode, operandNode) + readRegs.add(address) + if useIndexedAddr or operandNode.attrib.get('VSIB', '0') != '0': + indexReg = getIndexReg(instrNode, operandNode) + address += '+' + indexReg + readRegs.add(indexReg) asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']' @@ -444,7 +455,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, asm = asm + '; 1: ' regMemInit = [] - if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset) + if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr) return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit) def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr): @@ -465,7 +476,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr writtenRegs = instrInstance.writtenRegs if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs) - blockInstrsList = getIndependentInstructions(blockInstrNode, True, writtenRegs|readRegs, writtenRegs|readRegs, 64) + blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs, 64) if debugOutput: print ' bIL: ' + str(blockInstrsList) htmlReports.append('

    With blocking instructions for port' + @@ -552,7 +563,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr # Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of # calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep. -def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, doNotWriteRegs = None, initialOffset = 0, immediate = 2): +def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2): if not doNotReadRegs: doNotReadRegs = set() if not doNotWriteRegs: doNotWriteRegs = set() doNotReadRegs |= specialRegs @@ -573,7 +584,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, offset = initialOffset for _ in range(maxTPRep): - instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate) + instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate, useIndexedAddr=useIndexedAddr) if not instrI: break @@ -591,7 +602,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, doNotReadRegs = doNotReadRegs | instrI.writtenRegs if not independentInstructions: - instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate) + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate, useIndexedAddr=useIndexedAddr) independentInstructions.append(instrI) return independentInstructions @@ -611,6 +622,12 @@ def hasCommonRegister(instrNode): return True return False +def hasExplicitNonVSIBMemOperand(instrNode): + for opNode in instrNode.findall('./operand[@type="mem"]'): + if opNode.attrib.get('suppressed', '') != '1' and opNode.attrib.get('VSIB', '0') == '0': + return True + return False + def getThroughputIacaNoInteriteration(instrNode, htmlReports): createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm) try: @@ -642,7 +659,7 @@ class TPConfig: self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes) self.note = note -def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInstrs=True): +def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True): iform = instrNode.attrib['iform'] iclass = instrNode.attrib['iclass'] @@ -652,7 +669,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst independentInstrs = [] depBreakingInstrs = '' if computeIndepAndDepBreakingInstrs: - independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs) + independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr) depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode) # instructions with multiple configs @@ -662,7 +679,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict: continue config = TPConfig(note='With immediate = ' + str(immediate)) - config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=immediate) + config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=immediate) config.depBreakingInstrs = depBreakingInstrs configs.append(config) return configs @@ -708,7 +725,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]'] if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']: - config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=0x80) + config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=0x80) if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']: config.init = ['mov DX, 0x80'] @@ -905,8 +922,8 @@ TPResult = namedtuple('TPResult', ['TP', 'TP_noDepBreaking_noLoop', 'TP_single', # returns TPResult # port usages are averages (when no ports are blocked by other instructions) -def getThroughputAndUops(instrNode, useDistinctRegs, htmlReports): - configs = getTPConfigs(instrNode, useDistinctRegs) +def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports): + configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr) minTP = sys.maxint minTP_noDepBreaking_noLoop = sys.maxint @@ -1199,7 +1216,7 @@ def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None): if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue - depBreakingInstrs[opNode] = 'TEST R13, R13' + depBreakingInstrs[opNode] = 'TEST R15, R15' return depBreakingInstrs @@ -1237,7 +1254,7 @@ def getDependencyBreakingInstrsForSuppressedOperands(instrNode): # on some CPUs, instructions that write flags conditionally also read the flags if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue - depBreakingInstrs += ['TEST R13, R13'] + depBreakingInstrs += ['TEST R15, R15'] return ';'.join(depBreakingInstrs) @@ -1858,10 +1875,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) elif 'MM' in reg: - instrI = getInstrInstanceFromNode(instrNode, ['R13', 'R15'], ['R13', 'R15'], True, {startNodeIdx:reg}) + instrI = getInstrInstanceFromNode(instrNode, ['R12', 'R15'], ['R12', 'R15'], True, {startNodeIdx:reg}) configList.isUpperBound = True - for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R13', reg): - chainInstrs = 'CMOV' + flag[0] + ' R13, R15; ' + chainInstrI.asm + for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R12', reg): + chainInstrs = 'CMOV' + flag[0] + ' R12, R15; ' + chainInstrI.asm chainLatency = basicLatency['CMOV' + flag[0]] + 1 configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) elif targetNode.attrib['type'] == 'mem': @@ -1952,6 +1969,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem return None addrReg = getAddrReg(instrNode, startNode) + indexReg = getIndexReg(instrNode, startNode) memWidth = int(startNode.attrib.get('width', 0)) if targetNode.attrib['type'] == 'reg': @@ -1969,13 +1987,15 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem print 'read from suppressed mem to non-GPR reg not yet supported' return None - if reg in GPRegs: - instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg}) + instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg}, + useIndexedAddr=(addrMem=='addr_index')) - if addrMem == 'addr': + if reg in GPRegs: + if addrMem in ['addr', 'addr_index']: # addr -> reg + chainReg = (addrReg if addrMem == 'addr' else indexReg) chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';' - chainInstrs += 'XOR {}, {};'.format(addrReg, regTo64(reg)) * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 + chainInstrs += 'XOR {}, {};'.format(chainReg, regTo64(reg)) * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2 chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) else: @@ -1989,15 +2009,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) elif 'MM' in reg: - instrI = getInstrInstanceFromNode(instrNode, ['R12'], ['R12'], useDistinctRegs, {targetNodeIdx:reg}) - - if addrMem == 'addr': + if addrMem in ['addr', 'addr_index']: # addr -> reg configList.isUpperBound = True + chainReg = (addrReg if addrMem == 'addr' else indexReg) chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg)) if isAVXInstr(instrNode): chainInstrs = 'V' + chainInstrs - chainInstrs += 'XOR {}, {};'.format(addrReg, 'R12') * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 + chainInstrs += 'XOR {}, {};'.format(chainReg, 'R12') * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2 chainLatency = 1 + basicLatency['XOR'] * cRep configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) elif addrMem == 'addr_VSIB': @@ -2018,11 +2037,13 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem if not ('flag_'+flag) in targetNode.attrib: continue if not 'w' in targetNode.attrib[('flag_'+flag)]: continue - instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs) + instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs, + useIndexedAddr=(addrMem=='addr_index')) - if addrMem == 'addr': + if addrMem in ['addr', 'addr_index']: # addr -> flag - chainInstr = 'CMOV' + flag[0] + ' ' + addrReg + ', ' + addrReg + chainReg = (addrReg if addrMem == 'addr' else indexReg) + chainInstr = 'CMOV' + flag[0] + ' ' + chainReg + ', ' + chainReg chainLatency = basicLatency['CMOV' + flag[0]] configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency)) else: @@ -2043,14 +2064,17 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem # mem -> mem ################# if startNode == targetNode: - instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs=useDistinctRegs) + instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs=useDistinctRegs, + useIndexedAddr=(addrMem=='addr_index')) - if addrMem == 'addr': + if addrMem in ['addr', 'addr_index']: # addr -> mem configList.isUpperBound = True - chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];' + chainReg = (addrReg if addrMem == 'addr' else indexReg) + memStr = addrReg + ('+'+indexReg if addrMem == 'addr_index' else '') + chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + memStr + '];' chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep - chainInstrs += 'XOR ' + addrReg + ', R12; XOR ' + addrReg + ', R12;' + ('TEST R13, R13;' if instrReadsFlags else '') + chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '') chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR'] chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) @@ -2144,9 +2168,18 @@ def getLatencies(instrNode, instrNodeList, tpDict, htmlReports): addrMemList = [''] if opNode1.attrib['type']=='mem': - addrMemList = ['addr', 'mem']+(['addr_VSIB'] if 'VSIB' in opNode1.attrib else []) - elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']): addrMemList = ['addr'] + if 'VSIB' in opNode1.attrib: + addrMemList.append('addr_VSIB') + elif opNode1.attrib.get('suppressed', '') != '1': + addrMemList.append('addr_index') + addrMemList.append('mem') # mem added last; order is relevant for html output + elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']): + addrMemList = [] + if 'B' in instrNode.attrib['agen']: + addrMemList.append('addr') + if 'I' in instrNode.attrib['agen']: + addrMemList.append('addr_index') for addrMem in addrMemList: minLatDistinctRegs = 0 @@ -2613,17 +2646,19 @@ def main(): tpDict = {} tpDictSameReg = {} + tpDictIndexedAddr = {} tpDictNoInteriteration = {} if args.tpInput is not None: with open(args.tpInput, 'rb') as f: - pTpDict, pTpDictSameReg, pTpDictNoInteriteration = pickle.load(f) + pTpDict, pTpDictSameReg, pTpDictIndexedAddr, pTpDictNoInteriteration = pickle.load(f) tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()} tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()} + tpDictIndexedAddr = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictIndexedAddr.items()} tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()} else: for i, instrNode in enumerate(instrNodeList): - #if not 'MOVZX_NOREX' in instrNode.attrib['string']: continue + #if not 'MOV_NOREX' in instrNode.attrib['string']: continue print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    \n
    \n'] @@ -2631,7 +2666,10 @@ def main(): hasCommonReg = hasCommonRegister(instrNode) if hasCommonReg: htmlReports.append('

    With different registers for different operands

    \n') - tpResult = getThroughputAndUops(instrNode, True, htmlReports) + hasExplMemOp = hasExplicitNonVSIBMemOperand(instrNode) + if hasExplMemOp: htmlReports.append('

    With a non-indexed addressing mode

    \n') + + tpResult = getThroughputAndUops(instrNode, True, False, htmlReports) print instrNode.attrib['string'] + " - tp: " + str(tpResult) if tpResult: @@ -2639,10 +2677,16 @@ def main(): if hasCommonReg: htmlReports.append('

    With the same register for for different operands

    \n') - tpResultSameReg = getThroughputAndUops(instrNode, False, htmlReports) + tpResultSameReg = getThroughputAndUops(instrNode, False, False, htmlReports) if tpResultSameReg: tpDictSameReg[instrNode] = tpResultSameReg + if hasExplMemOp: + htmlReports.append('

    With an indexed addressing mode

    \n') + tpResultIndexed = getThroughputAndUops(instrNode, True, True, htmlReports) + if tpResultIndexed: + tpDictIndexedAddr[instrNode] = tpResultIndexed + if useIACA and iacaVersion in ['2.1', '2.2']: htmlReports.append('

    With the -no_interiteration flag

    \n') tp = getThroughputIacaNoInteriteration(instrNode, htmlReports) @@ -2650,7 +2694,7 @@ def main(): if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) with open('tp_' + arch + '.pickle', 'wb') as f: - pickle.dump((tpDict, tpDictSameReg, tpDictNoInteriteration), f) + pickle.dump((tpDict, tpDictSameReg, tpDictIndexedAddr, tpDictNoInteriteration), f) num_ports = len(tpDict.values()[0].unblocked_ports) @@ -2669,7 +2713,7 @@ def main(): latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()} elif not useIACA or iacaVersion == '2.1': for i, instrNode in enumerate(instrNodeList): - #if not 'LEA' in instrNode.attrib['string']: continue + #if not 'ADC' in instrNode.attrib['string']: continue print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    \n
    \n'] @@ -2692,6 +2736,7 @@ def main(): # the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently portCombinationsResultDict = {} portCombinationsResultDictSameReg = {} + portCombinationsResultDictIndexedAddr = {} if not args.noPorts: # iforms of instructions that are potentially zero-latency instructions @@ -2791,7 +2836,7 @@ def main(): for i, instrNode in enumerate(instrNodeList): if not instrNode in tpDict: - # don't iterate over the keys of unblocked_ports_dict directly because of the ordering + # don't iterate over the keys of tpDict directly because of the ordering continue print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' @@ -2799,96 +2844,104 @@ def main(): htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    '] for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]): + for useIndexedAddr in ([False, True] if useDistinctRegs and (instrNode in tpDictIndexedAddr) else [False]): + tpResult = None - tpResult = None + if not useDistinctRegs: + tp1 = tpDict[instrNode] + tp2 = tpDictSameReg[instrNode] + if (tp1.uops == tp2.uops and tp1.fused_uops == tp2.fused_uops): continue + tpResult = tp2 + htmlReports.append('

    With the same register for different operands

    ') + elif useIndexedAddr: + tpResult = tpDictIndexedAddr[instrNode] + htmlReports.append('

    With an indexed addressing mode

    ') + else: + tpResult = tpDict[instrNode] - if useDistinctRegs: - tpResult = tpDict[instrNode] - else: - if tpDict[instrNode].uops == tpDictSameReg[instrNode].uops: continue - tpResult = tpDictSameReg[instrNode] - htmlReports.append('

    With the same register for different operands

    ') + rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2)) - rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2)) + if not useIACA and tpResult.config.preInstrNodes: + rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) - if not useIACA and tpResult.config.preInstrNodes: - rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) + # use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen + used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05} + if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports) - # use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen - used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05} - if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports) + if not isAVXInstr(instrNode): + blockingInstrs = blockingInstructionsDictNonAVX + sortedPortCombinations = sortedPortCombinationsNonAVX + else: + blockingInstrs = blockingInstructionsDictNonSSE + sortedPortCombinations = sortedPortCombinationsNonSSE - if not isAVXInstr(instrNode): - blockingInstrs = blockingInstructionsDictNonAVX - sortedPortCombinations = sortedPortCombinationsNonAVX - else: - blockingInstrs = blockingInstructionsDictNonSSE - sortedPortCombinations = sortedPortCombinationsNonSSE + uopsCombinationList = [] - uopsCombinationList = [] + if not used_ports: + htmlReports.append('No uops') + elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0): + # one uop instruction + uopsCombinationList = [(frozenset(used_ports), 1)] + htmlReports.append('
    Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports)) + elif rem_uops > 0 and not isAMDCPU(): + for combination in sortedPortCombinations: + if not combination.intersection(used_ports): continue - if not used_ports: - htmlReports.append('No uops') - elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0): - # one uop instruction - uopsCombinationList = [(frozenset(used_ports), 1)] - htmlReports.append('
    Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports)) - elif rem_uops > 0 and not isAMDCPU(): - for combination in sortedPortCombinations: - if not combination.intersection(used_ports): continue + prevUopsOnCombination = 0 + for prev_combination, prev_uops in uopsCombinationList: + if prev_combination.issubset(combination): + prevUopsOnCombination += prev_uops - prevUopsOnCombination = 0 - for prev_combination, prev_uops in uopsCombinationList: - if prev_combination.issubset(combination): - prevUopsOnCombination += prev_uops + if not useIACA: + if tpResult.config.preInstrNodes: + for preInstrNode in tpResult.config.preInstrNodes: + for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]: + if pre_comb.issubset(combination): + prevUopsOnCombination += pre_uops - if not useIACA: - if tpResult.config.preInstrNodes: - for preInstrNode in tpResult.config.preInstrNodes: - for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]: - if pre_comb.issubset(combination): - prevUopsOnCombination += pre_uops + nPortsInComb = sum(len(str(x)) for x in combination) + blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10) + blockInstrRep = min(blockInstrRep, 100) + uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports) + if uopsOnBlockedPorts is None: + print 'no uops on blocked ports: ' + str(combination) + continue - nPortsInComb = sum(len(str(x)) for x in combination) - blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10) - blockInstrRep = min(blockInstrRep, 100) - uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports) - if uopsOnBlockedPorts is None: - print 'no uops on blocked ports: ' + str(combination) - continue + uopsOnBlockedPorts -= prevUopsOnCombination - uopsOnBlockedPorts -= prevUopsOnCombination + if rem_uops < uopsOnBlockedPorts: + print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts) - if rem_uops < uopsOnBlockedPorts: - print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts) + if uopsOnBlockedPorts <= 0: continue - if uopsOnBlockedPorts <= 0: continue + if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1': + # for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which + # seems to lead to replays of the store data uops + uopsOnBlockedPorts = 1 - if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1': - # for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which - # seems to lead to replays of the store data uops - uopsOnBlockedPorts = 1 + uopsCombinationList.append((combination, uopsOnBlockedPorts)) - uopsCombinationList.append((combination, uopsOnBlockedPorts)) + htmlReports.append('⇨ ' + + ((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') + + ' that can only use port' + + ('s {' if len(combination)>1 else ' ') + + str(list(combination))[1:-1] + + ('}' if len(combination)>1 else '') + '') - htmlReports.append('⇨ ' + - ((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') + - ' that can only use port' + - ('s {' if len(combination)>1 else ' ') + - str(list(combination))[1:-1] + - ('}' if len(combination)>1 else '') + '') + rem_uops -= uopsOnBlockedPorts + if rem_uops <= 0: break - rem_uops -= uopsOnBlockedPorts - if rem_uops <= 0: break + # on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports + # we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate + uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList] - # on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports - # we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate - uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList] + if not useDistinctRegs: + portCombinationsResultDictSameReg[instrNode] = uopsCombinationList + elif useIndexedAddr: + portCombinationsResultDictIndexedAddr[instrNode] = uopsCombinationList + else: + portCombinationsResultDict[instrNode] = uopsCombinationList - if useDistinctRegs: - portCombinationsResultDict[instrNode] = uopsCombinationList - else: - portCombinationsResultDictSameReg[instrNode] = uopsCombinationList writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) @@ -2905,13 +2958,18 @@ def main(): else: resultNode = archNode.find('./measurement') - tpResult_dr = tpDict[instrNode] - tpResult_sr = tpDictSameReg.get(instrNode, tpResult_dr) - - for tpResult in ([tpResult_dr, tpResult_sr] if tpResult_dr.uops != tpResult_sr.uops else [tpResult_dr]): - suffix = ('' if tpResult == tpResult_dr else '_same_reg') - curPortCombinationsResultDict = (portCombinationsResultDict if tpResult == tpResult_dr else portCombinationsResultDictSameReg) + applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode, None), '')] + for otherTPDict, otherPCDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, '_same_reg'), + (tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, '_indexed')]: + if instrNode in otherTPDict: + t1 = tpDict[instrNode] + t2 = otherTPDict[instrNode] + p1 = portCombinationsResultDict.get(instrNode, None) + p2 = otherPCDict.get(instrNode, None) + if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or ((p2 is not None) and (p1 != p2))): + applicableResults.append((t2, p2, suffix)) + for tpResult, portUsageList, suffix in applicableResults: uops = tpResult.uops uopsFused = tpResult.fused_uops if useIACA: @@ -2937,10 +2995,7 @@ def main(): portPrefix = ('p' if isIntelCPU() else 'FP') computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0]))) - if instrNode in curPortCombinationsResultDict: - portUsageList = curPortCombinationsResultDict[instrNode] - if not portUsageList: continue - + if portUsageList: resultNode.attrib['ports'+suffix] = computePortStr(portUsageList) portUsageWithDivList = list(portUsageList) diff --git a/tools/cpuBench/utils.py b/tools/cpuBench/utils.py index 1f3ef51..6a8ad0a 100755 --- a/tools/cpuBench/utils.py +++ b/tools/cpuBench/utils.py @@ -109,7 +109,7 @@ def latencyNodeToStr(latNode, sameReg, addr_mem): ret += ', with the same register for different operands' if addr_mem == 'addr': ret += ' (address, base register)' - elif addr_mem == 'addr_VSIB': + elif addr_mem in ['addr_index', 'addr_VSIB']: ret += ' (address, index register)' elif addr_mem == 'mem': ret += ' (memory)' @@ -144,7 +144,7 @@ def getLatencyTableEntry(measurementNode): for latNode in measurementNode.findall('./latency'): for sameReg in [False, True]: - for addr_mem in ['', 'addr', 'mem']: + for addr_mem in ['', 'addr', 'addr_index', 'addr_VSIB', 'mem']: suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '') if 'cycles'+suffix in latNode.attrib: cycles = int(latNode.attrib['cycles'+suffix])