support for indexed addressing modes

This commit is contained in:
Andreas Abel
2020-07-31 15:33:40 +02:00
parent 0997084470
commit 8f7401178e
2 changed files with 194 additions and 139 deletions

View File

@@ -37,8 +37,8 @@ instrNodeList = [] # list of all XML instruction nodes that are not filtered out
instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode
globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo
#R14: reserved for memory addresses #R14: reserved for memory addresses (base)
#R13: can be written in init; will not be overwritten by other code #R13: reserved for memory addresses (index)
#R15: loop counter #R15: loop counter
specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP', specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP',
@@ -60,7 +60,6 @@ def isIntelCPU():
return not isAMDCPU() return not isAMDCPU()
def getAddrReg(instrNode, opNode): def getAddrReg(instrNode, opNode):
if opNode.attrib.get('suppressed', '0') == '1': if opNode.attrib.get('suppressed', '0') == '1':
return opNode.attrib['base'] return opNode.attrib['base']
@@ -69,6 +68,14 @@ def getAddrReg(instrNode, opNode):
else: else:
return 'R14' return 'R14'
# Returns the name of the register that is used as the index register of the memory operand opNode.
# - operands with a 'VSIB' attribute other than '0': the value of that attribute with '14' appended
# - otherwise 'RSI' if the 'rex' attribute of instrNode is '0' (presumably because R13 cannot be encoded without a REX prefix; ToDo: confirm)
# - otherwise 'R13'
def getIndexReg(instrNode, opNode):
    vsibPrefix = opNode.attrib.get('VSIB', '0')
    if vsibPrefix != '0':
        return vsibPrefix + '14'

    # a missing 'rex' attribute is treated like '1'
    rexAllowed = (instrNode.attrib.get('rex', '1') != '0')
    return 'R13' if rexAllowed else 'RSI'
# registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that # registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that
# prefer low registers to high registers # prefer low registers to high registers
def sortRegs(regsList): def sortRegs(regsList):
@@ -76,7 +83,7 @@ def sortRegs(regsList):
# Initialize registers and memory # Initialize registers and memory
def getRegMemInit(instrNode, opRegDict, memOffset): def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr):
iform = instrNode.attrib['iform'] iform = instrNode.attrib['iform']
iclass = instrNode.attrib['iclass'] iclass = instrNode.attrib['iclass']
@@ -124,13 +131,18 @@ def getRegMemInit(instrNode, opRegDict, memOffset):
elif 'MM' in regPrefix: elif 'MM' in regPrefix:
init += ['PXOR '+reg+', '+reg] init += ['PXOR '+reg+', '+reg]
elif opNode.attrib['type'] == 'mem': elif opNode.attrib['type'] == 'mem':
if 'VSIB' in opNode.attrib:
vsibReg = opNode.attrib['VSIB'] + '14'
init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg]
if xtype.startswith('f'): if xtype.startswith('f'):
init += ['MOV RAX, 0x4000000040000000'] init += ['MOV RAX, 0x4000000040000000']
for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX'] for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX']
for opNode in instrNode.findall('./operand[@type="mem"]'):
if opNode.attrib.get('suppressed', '0') == '1': continue
if 'VSIB' in opNode.attrib:
vsibReg = getIndexReg(instrNode, opNode)
init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg]
elif useIndexedAddr:
init += ['XOR {0}, {0}'.format(getIndexReg(instrNode, opNode))]
return init return init
nExperiments = 0 nExperiments = 0
@@ -144,7 +156,6 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
nExperiments += 1 nExperiments += 1
instrCode = re.sub(';+', '; ', instrCode.strip('; ')) instrCode = re.sub(';+', '; ', instrCode.strip('; '))
if debugOutput: print 'instr: ' + instrCode
codeObjFile = '/tmp/ramdisk/code.o' codeObjFile = '/tmp/ramdisk/code.o'
assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s') assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s')
localHtmlReports.append('<li>Code: <pre>' + getMachineCode(codeObjFile) + '</pre></li>\n') localHtmlReports.append('<li>Code: <pre>' + getMachineCode(codeObjFile) + '</pre></li>\n')
@@ -178,6 +189,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
nanoBenchCmd += ' -asm_init &quot;' + initCode + '&quot;' nanoBenchCmd += ' -asm_init &quot;' + initCode + '&quot;'
localHtmlReports.append('<li><a href="javascript:;" onclick="this.outerHTML = \'<pre>' + nanoBenchCmd + '</pre>\'">Show nanoBench command</a></li>\n') localHtmlReports.append('<li><a href="javascript:;" onclick="this.outerHTML = \'<pre>' + nanoBenchCmd + '</pre>\'">Show nanoBench command</a></li>\n')
if debugOutput: print nanoBenchCmd
setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode) setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode)
@@ -313,7 +325,8 @@ def configurePFCs(events):
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit']) InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2, computeRegMemInit=True): def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2,
computeRegMemInit=True, useIndexedAddr=False):
if not doNotWriteRegs: doNotWriteRegs = [] if not doNotWriteRegs: doNotWriteRegs = []
if not doNotReadRegs: doNotReadRegs = [] if not doNotReadRegs: doNotReadRegs = []
if not opRegDict: opRegDict = {} if not opRegDict: opRegDict = {}
@@ -394,14 +407,12 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
if asmprefix != '': if asmprefix != '':
asm += ' ' asm += ' '
address = '' address = getAddrReg(instrNode, operandNode)
if operandNode.attrib.get('VSIB', '0') != "0": readRegs.add(address)
address = 'R14+' + operandNode.attrib.get('VSIB') + '14' if useIndexedAddr or operandNode.attrib.get('VSIB', '0') != '0':
readRegs.add('R14') indexReg = getIndexReg(instrNode, operandNode)
readRegs.add(operandNode.attrib.get('VSIB') + '14') address += '+' + indexReg
else: readRegs.add(indexReg)
address = getAddrReg(instrNode, operandNode)
readRegs.add(address)
asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']' asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']'
@@ -444,7 +455,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
asm = asm + '; 1: ' asm = asm + '; 1: '
regMemInit = [] regMemInit = []
if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset) if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr)
return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit) return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit)
def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr): def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr):
@@ -465,7 +476,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
writtenRegs = instrInstance.writtenRegs writtenRegs = instrInstance.writtenRegs
if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs) if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs)
blockInstrsList = getIndependentInstructions(blockInstrNode, True, writtenRegs|readRegs, writtenRegs|readRegs, 64) blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs, 64)
if debugOutput: print ' bIL: ' + str(blockInstrsList) if debugOutput: print ' bIL: ' + str(blockInstrsList)
htmlReports.append('<hr><h3>With blocking instructions for port' + htmlReports.append('<hr><h3>With blocking instructions for port' +
@@ -552,7 +563,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of # Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of
# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep. # calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep.
def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, doNotWriteRegs = None, initialOffset = 0, immediate = 2): def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2):
if not doNotReadRegs: doNotReadRegs = set() if not doNotReadRegs: doNotReadRegs = set()
if not doNotWriteRegs: doNotWriteRegs = set() if not doNotWriteRegs: doNotWriteRegs = set()
doNotReadRegs |= specialRegs doNotReadRegs |= specialRegs
@@ -573,7 +584,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None,
offset = initialOffset offset = initialOffset
for _ in range(maxTPRep): for _ in range(maxTPRep):
instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate) instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate, useIndexedAddr=useIndexedAddr)
if not instrI: if not instrI:
break break
@@ -591,7 +602,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None,
doNotReadRegs = doNotReadRegs | instrI.writtenRegs doNotReadRegs = doNotReadRegs | instrI.writtenRegs
if not independentInstructions: if not independentInstructions:
instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate) instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate, useIndexedAddr=useIndexedAddr)
independentInstructions.append(instrI) independentInstructions.append(instrI)
return independentInstructions return independentInstructions
@@ -611,6 +622,12 @@ def hasCommonRegister(instrNode):
return True return True
return False return False
# Returns True iff instrNode has at least one direct child operand of type "mem" that is
# neither suppressed (attribute 'suppressed' == '1') nor a VSIB operand ('VSIB' attribute present and != '0').
def hasExplicitNonVSIBMemOperand(instrNode):
    memOperands = instrNode.findall('./operand[@type="mem"]')
    return any(
        memOp.attrib.get('suppressed', '') != '1' and memOp.attrib.get('VSIB', '0') == '0'
        for memOp in memOperands
    )
def getThroughputIacaNoInteriteration(instrNode, htmlReports): def getThroughputIacaNoInteriteration(instrNode, htmlReports):
createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm) createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm)
try: try:
@@ -642,7 +659,7 @@ class TPConfig:
self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes) self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes)
self.note = note self.note = note
def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInstrs=True): def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True):
iform = instrNode.attrib['iform'] iform = instrNode.attrib['iform']
iclass = instrNode.attrib['iclass'] iclass = instrNode.attrib['iclass']
@@ -652,7 +669,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
independentInstrs = [] independentInstrs = []
depBreakingInstrs = '' depBreakingInstrs = ''
if computeIndepAndDepBreakingInstrs: if computeIndepAndDepBreakingInstrs:
independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs) independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr)
depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode) depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode)
# instructions with multiple configs # instructions with multiple configs
@@ -662,7 +679,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict: if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict:
continue continue
config = TPConfig(note='With immediate = ' + str(immediate)) config = TPConfig(note='With immediate = ' + str(immediate))
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=immediate) config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=immediate)
config.depBreakingInstrs = depBreakingInstrs config.depBreakingInstrs = depBreakingInstrs
configs.append(config) configs.append(config)
return configs return configs
@@ -708,7 +725,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]'] if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]']
if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']: if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']:
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=0x80) config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=0x80)
if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']: if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']:
config.init = ['mov DX, 0x80'] config.init = ['mov DX, 0x80']
@@ -905,8 +922,8 @@ TPResult = namedtuple('TPResult', ['TP', 'TP_noDepBreaking_noLoop', 'TP_single',
# returns TPResult # returns TPResult
# port usages are averages (when no ports are blocked by other instructions) # port usages are averages (when no ports are blocked by other instructions)
def getThroughputAndUops(instrNode, useDistinctRegs, htmlReports): def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports):
configs = getTPConfigs(instrNode, useDistinctRegs) configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr)
minTP = sys.maxint minTP = sys.maxint
minTP_noDepBreaking_noLoop = sys.maxint minTP_noDepBreaking_noLoop = sys.maxint
@@ -1199,7 +1216,7 @@ def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None):
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
depBreakingInstrs[opNode] = 'TEST R13, R13' depBreakingInstrs[opNode] = 'TEST R15, R15'
return depBreakingInstrs return depBreakingInstrs
@@ -1237,7 +1254,7 @@ def getDependencyBreakingInstrsForSuppressedOperands(instrNode):
# on some CPUs, instructions that write flags conditionally also read the flags # on some CPUs, instructions that write flags conditionally also read the flags
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
depBreakingInstrs += ['TEST R13, R13'] depBreakingInstrs += ['TEST R15, R15']
return ';'.join(depBreakingInstrs) return ';'.join(depBreakingInstrs)
@@ -1858,10 +1875,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
elif 'MM' in reg: elif 'MM' in reg:
instrI = getInstrInstanceFromNode(instrNode, ['R13', 'R15'], ['R13', 'R15'], True, {startNodeIdx:reg}) instrI = getInstrInstanceFromNode(instrNode, ['R12', 'R15'], ['R12', 'R15'], True, {startNodeIdx:reg})
configList.isUpperBound = True configList.isUpperBound = True
for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R13', reg): for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R12', reg):
chainInstrs = 'CMOV' + flag[0] + ' R13, R15; ' + chainInstrI.asm chainInstrs = 'CMOV' + flag[0] + ' R12, R15; ' + chainInstrI.asm
chainLatency = basicLatency['CMOV' + flag[0]] + 1 chainLatency = basicLatency['CMOV' + flag[0]] + 1
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
elif targetNode.attrib['type'] == 'mem': elif targetNode.attrib['type'] == 'mem':
@@ -1952,6 +1969,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
return None return None
addrReg = getAddrReg(instrNode, startNode) addrReg = getAddrReg(instrNode, startNode)
indexReg = getIndexReg(instrNode, startNode)
memWidth = int(startNode.attrib.get('width', 0)) memWidth = int(startNode.attrib.get('width', 0))
if targetNode.attrib['type'] == 'reg': if targetNode.attrib['type'] == 'reg':
@@ -1969,13 +1987,15 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
print 'read from suppressed mem to non-GPR reg not yet supported' print 'read from suppressed mem to non-GPR reg not yet supported'
return None return None
if reg in GPRegs: instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg},
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg}) useIndexedAddr=(addrMem=='addr_index'))
if addrMem == 'addr': if reg in GPRegs:
if addrMem in ['addr', 'addr_index']:
# addr -> reg # addr -> reg
chainReg = (addrReg if addrMem == 'addr' else indexReg)
chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';' chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';'
chainInstrs += 'XOR {}, {};'.format(addrReg, regTo64(reg)) * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 chainInstrs += 'XOR {}, {};'.format(chainReg, regTo64(reg)) * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
else: else:
@@ -1989,15 +2009,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
elif 'MM' in reg: elif 'MM' in reg:
instrI = getInstrInstanceFromNode(instrNode, ['R12'], ['R12'], useDistinctRegs, {targetNodeIdx:reg}) if addrMem in ['addr', 'addr_index']:
if addrMem == 'addr':
# addr -> reg # addr -> reg
configList.isUpperBound = True configList.isUpperBound = True
chainReg = (addrReg if addrMem == 'addr' else indexReg)
chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg)) chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg))
if isAVXInstr(instrNode): if isAVXInstr(instrNode):
chainInstrs = 'V' + chainInstrs chainInstrs = 'V' + chainInstrs
chainInstrs += 'XOR {}, {};'.format(addrReg, 'R12') * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 chainInstrs += 'XOR {}, {};'.format(chainReg, 'R12') * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
chainLatency = 1 + basicLatency['XOR'] * cRep chainLatency = 1 + basicLatency['XOR'] * cRep
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
elif addrMem == 'addr_VSIB': elif addrMem == 'addr_VSIB':
@@ -2018,11 +2037,13 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
if not ('flag_'+flag) in targetNode.attrib: continue if not ('flag_'+flag) in targetNode.attrib: continue
if not 'w' in targetNode.attrib[('flag_'+flag)]: continue if not 'w' in targetNode.attrib[('flag_'+flag)]: continue
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs) instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs,
useIndexedAddr=(addrMem=='addr_index'))
if addrMem == 'addr': if addrMem in ['addr', 'addr_index']:
# addr -> flag # addr -> flag
chainInstr = 'CMOV' + flag[0] + ' ' + addrReg + ', ' + addrReg chainReg = (addrReg if addrMem == 'addr' else indexReg)
chainInstr = 'CMOV' + flag[0] + ' ' + chainReg + ', ' + chainReg
chainLatency = basicLatency['CMOV' + flag[0]] chainLatency = basicLatency['CMOV' + flag[0]]
configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency))
else: else:
@@ -2043,14 +2064,17 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
# mem -> mem # mem -> mem
################# #################
if startNode == targetNode: if startNode == targetNode:
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs=useDistinctRegs) instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs=useDistinctRegs,
useIndexedAddr=(addrMem=='addr_index'))
if addrMem == 'addr': if addrMem in ['addr', 'addr_index']:
# addr -> mem # addr -> mem
configList.isUpperBound = True configList.isUpperBound = True
chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];' chainReg = (addrReg if addrMem == 'addr' else indexReg)
memStr = addrReg + ('+'+indexReg if addrMem == 'addr_index' else '')
chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + memStr + '];'
chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep
chainInstrs += 'XOR ' + addrReg + ', R12; XOR ' + addrReg + ', R12;' + ('TEST R13, R13;' if instrReadsFlags else '') chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '')
chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR'] chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR']
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
@@ -2144,9 +2168,18 @@ def getLatencies(instrNode, instrNodeList, tpDict, htmlReports):
addrMemList = [''] addrMemList = ['']
if opNode1.attrib['type']=='mem': if opNode1.attrib['type']=='mem':
addrMemList = ['addr', 'mem']+(['addr_VSIB'] if 'VSIB' in opNode1.attrib else [])
elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']):
addrMemList = ['addr'] addrMemList = ['addr']
if 'VSIB' in opNode1.attrib:
addrMemList.append('addr_VSIB')
elif opNode1.attrib.get('suppressed', '') != '1':
addrMemList.append('addr_index')
addrMemList.append('mem') # mem added last; order is relevant for html output
elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']):
addrMemList = []
if 'B' in instrNode.attrib['agen']:
addrMemList.append('addr')
if 'I' in instrNode.attrib['agen']:
addrMemList.append('addr_index')
for addrMem in addrMemList: for addrMem in addrMemList:
minLatDistinctRegs = 0 minLatDistinctRegs = 0
@@ -2613,17 +2646,19 @@ def main():
tpDict = {} tpDict = {}
tpDictSameReg = {} tpDictSameReg = {}
tpDictIndexedAddr = {}
tpDictNoInteriteration = {} tpDictNoInteriteration = {}
if args.tpInput is not None: if args.tpInput is not None:
with open(args.tpInput, 'rb') as f: with open(args.tpInput, 'rb') as f:
pTpDict, pTpDictSameReg, pTpDictNoInteriteration = pickle.load(f) pTpDict, pTpDictSameReg, pTpDictIndexedAddr, pTpDictNoInteriteration = pickle.load(f)
tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()} tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()}
tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()} tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()}
tpDictIndexedAddr = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictIndexedAddr.items()}
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()} tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
else: else:
for i, instrNode in enumerate(instrNodeList): for i, instrNode in enumerate(instrNodeList):
#if not 'MOVZX_NOREX' in instrNode.attrib['string']: continue #if not 'MOV_NOREX' in instrNode.attrib['string']: continue
print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n'] htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
@@ -2631,7 +2666,10 @@ def main():
hasCommonReg = hasCommonRegister(instrNode) hasCommonReg = hasCommonRegister(instrNode)
if hasCommonReg: htmlReports.append('<h2 id="distinctRegs">With different registers for different operands</h2>\n') if hasCommonReg: htmlReports.append('<h2 id="distinctRegs">With different registers for different operands</h2>\n')
tpResult = getThroughputAndUops(instrNode, True, htmlReports) hasExplMemOp = hasExplicitNonVSIBMemOperand(instrNode)
if hasExplMemOp: htmlReports.append('<h2 id="nonIndexedAddr">With a non-indexed addressing mode</h2>\n')
tpResult = getThroughputAndUops(instrNode, True, False, htmlReports)
print instrNode.attrib['string'] + " - tp: " + str(tpResult) print instrNode.attrib['string'] + " - tp: " + str(tpResult)
if tpResult: if tpResult:
@@ -2639,10 +2677,16 @@ def main():
if hasCommonReg: if hasCommonReg:
htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n') htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n')
tpResultSameReg = getThroughputAndUops(instrNode, False, htmlReports) tpResultSameReg = getThroughputAndUops(instrNode, False, False, htmlReports)
if tpResultSameReg: if tpResultSameReg:
tpDictSameReg[instrNode] = tpResultSameReg tpDictSameReg[instrNode] = tpResultSameReg
if hasExplMemOp:
htmlReports.append('<hr><h2 id="indexedAddr">With an indexed addressing mode</h2>\n')
tpResultIndexed = getThroughputAndUops(instrNode, True, True, htmlReports)
if tpResultIndexed:
tpDictIndexedAddr[instrNode] = tpResultIndexed
if useIACA and iacaVersion in ['2.1', '2.2']: if useIACA and iacaVersion in ['2.1', '2.2']:
htmlReports.append('<hr><h2 id="noInteriteration">With the -no_interiteration flag</h2>\n') htmlReports.append('<hr><h2 id="noInteriteration">With the -no_interiteration flag</h2>\n')
tp = getThroughputIacaNoInteriteration(instrNode, htmlReports) tp = getThroughputIacaNoInteriteration(instrNode, htmlReports)
@@ -2650,7 +2694,7 @@ def main():
if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
with open('tp_' + arch + '.pickle', 'wb') as f: with open('tp_' + arch + '.pickle', 'wb') as f:
pickle.dump((tpDict, tpDictSameReg, tpDictNoInteriteration), f) pickle.dump((tpDict, tpDictSameReg, tpDictIndexedAddr, tpDictNoInteriteration), f)
num_ports = len(tpDict.values()[0].unblocked_ports) num_ports = len(tpDict.values()[0].unblocked_ports)
@@ -2669,7 +2713,7 @@ def main():
latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()} latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()}
elif not useIACA or iacaVersion == '2.1': elif not useIACA or iacaVersion == '2.1':
for i, instrNode in enumerate(instrNodeList): for i, instrNode in enumerate(instrNodeList):
#if not 'LEA' in instrNode.attrib['string']: continue #if not 'ADC' in instrNode.attrib['string']: continue
print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n'] htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
@@ -2692,6 +2736,7 @@ def main():
# the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently # the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently
portCombinationsResultDict = {} portCombinationsResultDict = {}
portCombinationsResultDictSameReg = {} portCombinationsResultDictSameReg = {}
portCombinationsResultDictIndexedAddr = {}
if not args.noPorts: if not args.noPorts:
# iforms of instructions that are potentially zero-latency instructions # iforms of instructions that are potentially zero-latency instructions
@@ -2791,7 +2836,7 @@ def main():
for i, instrNode in enumerate(instrNodeList): for i, instrNode in enumerate(instrNodeList):
if not instrNode in tpDict: if not instrNode in tpDict:
# don't iterate over the keys of unblocked_ports_dict directly because of the ordering # don't iterate over the keys of tpDict directly because of the ordering
continue continue
print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
@@ -2799,96 +2844,104 @@ def main():
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>'] htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>']
for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]): for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]):
for useIndexedAddr in ([False, True] if useDistinctRegs and (instrNode in tpDictIndexedAddr) else [False]):
tpResult = None
tpResult = None if not useDistinctRegs:
tp1 = tpDict[instrNode]
tp2 = tpDictSameReg[instrNode]
if (tp1.uops == tp2.uops and tp1.fused_uops == tp2.fused_uops): continue
tpResult = tp2
htmlReports.append('<hr><h2>With the same register for different operands</h2>')
elif useIndexedAddr:
tpResult = tpDictIndexedAddr[instrNode]
htmlReports.append('<hr><h2>With an indexed addressing mode</h2>')
else:
tpResult = tpDict[instrNode]
if useDistinctRegs: rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2))
tpResult = tpDict[instrNode]
else:
if tpDict[instrNode].uops == tpDictSameReg[instrNode].uops: continue
tpResult = tpDictSameReg[instrNode]
htmlReports.append('<h2>With the same register for different operands</h2>')
rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2)) if not useIACA and tpResult.config.preInstrNodes:
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
if not useIACA and tpResult.config.preInstrNodes: # use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05}
if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports)
# use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen if not isAVXInstr(instrNode):
used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05} blockingInstrs = blockingInstructionsDictNonAVX
if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports) sortedPortCombinations = sortedPortCombinationsNonAVX
else:
blockingInstrs = blockingInstructionsDictNonSSE
sortedPortCombinations = sortedPortCombinationsNonSSE
if not isAVXInstr(instrNode): uopsCombinationList = []
blockingInstrs = blockingInstructionsDictNonAVX
sortedPortCombinations = sortedPortCombinationsNonAVX
else:
blockingInstrs = blockingInstructionsDictNonSSE
sortedPortCombinations = sortedPortCombinationsNonSSE
uopsCombinationList = [] if not used_ports:
htmlReports.append('No uops')
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0):
# one uop instruction
uopsCombinationList = [(frozenset(used_ports), 1)]
htmlReports.append('<hr>Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports))
elif rem_uops > 0 and not isAMDCPU():
for combination in sortedPortCombinations:
if not combination.intersection(used_ports): continue
if not used_ports: prevUopsOnCombination = 0
htmlReports.append('No uops') for prev_combination, prev_uops in uopsCombinationList:
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0): if prev_combination.issubset(combination):
# one uop instruction prevUopsOnCombination += prev_uops
uopsCombinationList = [(frozenset(used_ports), 1)]
htmlReports.append('<hr>Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports))
elif rem_uops > 0 and not isAMDCPU():
for combination in sortedPortCombinations:
if not combination.intersection(used_ports): continue
prevUopsOnCombination = 0 if not useIACA:
for prev_combination, prev_uops in uopsCombinationList: if tpResult.config.preInstrNodes:
if prev_combination.issubset(combination): for preInstrNode in tpResult.config.preInstrNodes:
prevUopsOnCombination += prev_uops for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]:
if pre_comb.issubset(combination):
prevUopsOnCombination += pre_uops
if not useIACA: nPortsInComb = sum(len(str(x)) for x in combination)
if tpResult.config.preInstrNodes: blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10)
for preInstrNode in tpResult.config.preInstrNodes: blockInstrRep = min(blockInstrRep, 100)
for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]: uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
if pre_comb.issubset(combination): if uopsOnBlockedPorts is None:
prevUopsOnCombination += pre_uops print 'no uops on blocked ports: ' + str(combination)
continue
nPortsInComb = sum(len(str(x)) for x in combination) uopsOnBlockedPorts -= prevUopsOnCombination
blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10)
blockInstrRep = min(blockInstrRep, 100)
uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
if uopsOnBlockedPorts is None:
print 'no uops on blocked ports: ' + str(combination)
continue
uopsOnBlockedPorts -= prevUopsOnCombination if rem_uops < uopsOnBlockedPorts:
print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts)
if rem_uops < uopsOnBlockedPorts: if uopsOnBlockedPorts <= 0: continue
print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts)
if uopsOnBlockedPorts <= 0: continue if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1':
# for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which
# seems to lead to replays of the store data uops
uopsOnBlockedPorts = 1
if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1': uopsCombinationList.append((combination, uopsOnBlockedPorts))
# for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which
# seems to lead to replays of the store data uops
uopsOnBlockedPorts = 1
uopsCombinationList.append((combination, uopsOnBlockedPorts)) htmlReports.append('<strong>&#8680; ' +
((str(uopsOnBlockedPorts) + ' &mu;ops') if (uopsOnBlockedPorts > 1) else 'One &mu;op') +
' that can only use port' +
('s {' if len(combination)>1 else ' ') +
str(list(combination))[1:-1] +
('}' if len(combination)>1 else '') + '</strong>')
htmlReports.append('<strong>&#8680; ' + rem_uops -= uopsOnBlockedPorts
((str(uopsOnBlockedPorts) + ' &mu;ops') if (uopsOnBlockedPorts > 1) else 'One &mu;op') + if rem_uops <= 0: break
' that can only use port' +
('s {' if len(combination)>1 else ' ') +
str(list(combination))[1:-1] +
('}' if len(combination)>1 else '') + '</strong>')
rem_uops -= uopsOnBlockedPorts # on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports
if rem_uops <= 0: break # we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate
uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList]
# on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports if not useDistinctRegs:
# we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList] elif useIndexedAddr:
portCombinationsResultDictIndexedAddr[instrNode] = uopsCombinationList
else:
portCombinationsResultDict[instrNode] = uopsCombinationList
if useDistinctRegs:
portCombinationsResultDict[instrNode] = uopsCombinationList
else:
portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
@@ -2905,13 +2958,18 @@ def main():
else: else:
resultNode = archNode.find('./measurement') resultNode = archNode.find('./measurement')
tpResult_dr = tpDict[instrNode] applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode, None), '')]
tpResult_sr = tpDictSameReg.get(instrNode, tpResult_dr) for otherTPDict, otherPCDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, '_same_reg'),
(tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, '_indexed')]:
for tpResult in ([tpResult_dr, tpResult_sr] if tpResult_dr.uops != tpResult_sr.uops else [tpResult_dr]): if instrNode in otherTPDict:
suffix = ('' if tpResult == tpResult_dr else '_same_reg') t1 = tpDict[instrNode]
curPortCombinationsResultDict = (portCombinationsResultDict if tpResult == tpResult_dr else portCombinationsResultDictSameReg) t2 = otherTPDict[instrNode]
p1 = portCombinationsResultDict.get(instrNode, None)
p2 = otherPCDict.get(instrNode, None)
if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or ((p2 is not None) and (p1 != p2))):
applicableResults.append((t2, p2, suffix))
for tpResult, portUsageList, suffix in applicableResults:
uops = tpResult.uops uops = tpResult.uops
uopsFused = tpResult.fused_uops uopsFused = tpResult.fused_uops
if useIACA: if useIACA:
@@ -2937,10 +2995,7 @@ def main():
portPrefix = ('p' if isIntelCPU() else 'FP') portPrefix = ('p' if isIntelCPU() else 'FP')
computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0]))) computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
if instrNode in curPortCombinationsResultDict: if portUsageList:
portUsageList = curPortCombinationsResultDict[instrNode]
if not portUsageList: continue
resultNode.attrib['ports'+suffix] = computePortStr(portUsageList) resultNode.attrib['ports'+suffix] = computePortStr(portUsageList)
portUsageWithDivList = list(portUsageList) portUsageWithDivList = list(portUsageList)

View File

@@ -109,7 +109,7 @@ def latencyNodeToStr(latNode, sameReg, addr_mem):
ret += ', with the same register for different operands' ret += ', with the same register for different operands'
if addr_mem == 'addr': if addr_mem == 'addr':
ret += ' (address, base register)' ret += ' (address, base register)'
elif addr_mem == 'addr_VSIB': elif addr_mem in ['addr_index', 'addr_VSIB']:
ret += ' (address, index register)' ret += ' (address, index register)'
elif addr_mem == 'mem': elif addr_mem == 'mem':
ret += ' (memory)' ret += ' (memory)'
@@ -144,7 +144,7 @@ def getLatencyTableEntry(measurementNode):
for latNode in measurementNode.findall('./latency'): for latNode in measurementNode.findall('./latency'):
for sameReg in [False, True]: for sameReg in [False, True]:
for addr_mem in ['', 'addr', 'mem']: for addr_mem in ['', 'addr', 'addr_index', 'addr_VSIB', 'mem']:
suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '') suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '')
if 'cycles'+suffix in latNode.attrib: if 'cycles'+suffix in latNode.attrib:
cycles = int(latNode.attrib['cycles'+suffix]) cycles = int(latNode.attrib['cycles'+suffix])