mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-16 11:30:07 +01:00
support for indexed addressing modes
This commit is contained in:
@@ -37,8 +37,8 @@ instrNodeList = [] # list of all XML instruction nodes that are not filtered out
|
|||||||
instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode
|
instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode
|
||||||
|
|
||||||
globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo
|
globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo
|
||||||
#R14: reserved for memory addresses
|
#R14: reserved for memory addresses (base)
|
||||||
#R13: can be written in init; will not be overwritten by other code
|
#R13: reserved for memory addresses (index)
|
||||||
#R15: loop counter
|
#R15: loop counter
|
||||||
|
|
||||||
specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP',
|
specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP',
|
||||||
@@ -60,7 +60,6 @@ def isIntelCPU():
|
|||||||
return not isAMDCPU()
|
return not isAMDCPU()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getAddrReg(instrNode, opNode):
|
def getAddrReg(instrNode, opNode):
|
||||||
if opNode.attrib.get('suppressed', '0') == '1':
|
if opNode.attrib.get('suppressed', '0') == '1':
|
||||||
return opNode.attrib['base']
|
return opNode.attrib['base']
|
||||||
@@ -69,6 +68,14 @@ def getAddrReg(instrNode, opNode):
|
|||||||
else:
|
else:
|
||||||
return 'R14'
|
return 'R14'
|
||||||
|
|
||||||
|
def getIndexReg(instrNode, opNode):
|
||||||
|
if opNode.attrib.get('VSIB', '0') != '0':
|
||||||
|
return opNode.attrib.get('VSIB') + '14'
|
||||||
|
elif instrNode.attrib.get('rex', '1') == '0':
|
||||||
|
return 'RSI'
|
||||||
|
else:
|
||||||
|
return 'R13'
|
||||||
|
|
||||||
# registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that
|
# registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that
|
||||||
# prefer low registers to high registers
|
# prefer low registers to high registers
|
||||||
def sortRegs(regsList):
|
def sortRegs(regsList):
|
||||||
@@ -76,7 +83,7 @@ def sortRegs(regsList):
|
|||||||
|
|
||||||
|
|
||||||
# Initialize registers and memory
|
# Initialize registers and memory
|
||||||
def getRegMemInit(instrNode, opRegDict, memOffset):
|
def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr):
|
||||||
iform = instrNode.attrib['iform']
|
iform = instrNode.attrib['iform']
|
||||||
iclass = instrNode.attrib['iclass']
|
iclass = instrNode.attrib['iclass']
|
||||||
|
|
||||||
@@ -124,13 +131,18 @@ def getRegMemInit(instrNode, opRegDict, memOffset):
|
|||||||
elif 'MM' in regPrefix:
|
elif 'MM' in regPrefix:
|
||||||
init += ['PXOR '+reg+', '+reg]
|
init += ['PXOR '+reg+', '+reg]
|
||||||
elif opNode.attrib['type'] == 'mem':
|
elif opNode.attrib['type'] == 'mem':
|
||||||
if 'VSIB' in opNode.attrib:
|
|
||||||
vsibReg = opNode.attrib['VSIB'] + '14'
|
|
||||||
init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg]
|
|
||||||
if xtype.startswith('f'):
|
if xtype.startswith('f'):
|
||||||
init += ['MOV RAX, 0x4000000040000000']
|
init += ['MOV RAX, 0x4000000040000000']
|
||||||
for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX']
|
for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX']
|
||||||
|
|
||||||
|
for opNode in instrNode.findall('./operand[@type="mem"]'):
|
||||||
|
if opNode.attrib.get('suppressed', '0') == '1': continue
|
||||||
|
if 'VSIB' in opNode.attrib:
|
||||||
|
vsibReg = getIndexReg(instrNode, opNode)
|
||||||
|
init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg]
|
||||||
|
elif useIndexedAddr:
|
||||||
|
init += ['XOR {0}, {0}'.format(getIndexReg(instrNode, opNode))]
|
||||||
|
|
||||||
return init
|
return init
|
||||||
|
|
||||||
nExperiments = 0
|
nExperiments = 0
|
||||||
@@ -144,7 +156,6 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
|
|||||||
nExperiments += 1
|
nExperiments += 1
|
||||||
|
|
||||||
instrCode = re.sub(';+', '; ', instrCode.strip('; '))
|
instrCode = re.sub(';+', '; ', instrCode.strip('; '))
|
||||||
if debugOutput: print 'instr: ' + instrCode
|
|
||||||
codeObjFile = '/tmp/ramdisk/code.o'
|
codeObjFile = '/tmp/ramdisk/code.o'
|
||||||
assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s')
|
assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s')
|
||||||
localHtmlReports.append('<li>Code: <pre>' + getMachineCode(codeObjFile) + '</pre></li>\n')
|
localHtmlReports.append('<li>Code: <pre>' + getMachineCode(codeObjFile) + '</pre></li>\n')
|
||||||
@@ -178,6 +189,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
|
|||||||
nanoBenchCmd += ' -asm_init "' + initCode + '"'
|
nanoBenchCmd += ' -asm_init "' + initCode + '"'
|
||||||
|
|
||||||
localHtmlReports.append('<li><a href="javascript:;" onclick="this.outerHTML = \'<pre>' + nanoBenchCmd + '</pre>\'">Show nanoBench command</a></li>\n')
|
localHtmlReports.append('<li><a href="javascript:;" onclick="this.outerHTML = \'<pre>' + nanoBenchCmd + '</pre>\'">Show nanoBench command</a></li>\n')
|
||||||
|
if debugOutput: print nanoBenchCmd
|
||||||
|
|
||||||
setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode)
|
setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode)
|
||||||
|
|
||||||
@@ -313,7 +325,8 @@ def configurePFCs(events):
|
|||||||
|
|
||||||
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
|
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
|
||||||
|
|
||||||
def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2, computeRegMemInit=True):
|
def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2,
|
||||||
|
computeRegMemInit=True, useIndexedAddr=False):
|
||||||
if not doNotWriteRegs: doNotWriteRegs = []
|
if not doNotWriteRegs: doNotWriteRegs = []
|
||||||
if not doNotReadRegs: doNotReadRegs = []
|
if not doNotReadRegs: doNotReadRegs = []
|
||||||
if not opRegDict: opRegDict = {}
|
if not opRegDict: opRegDict = {}
|
||||||
@@ -394,14 +407,12 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
|
|||||||
if asmprefix != '':
|
if asmprefix != '':
|
||||||
asm += ' '
|
asm += ' '
|
||||||
|
|
||||||
address = ''
|
address = getAddrReg(instrNode, operandNode)
|
||||||
if operandNode.attrib.get('VSIB', '0') != "0":
|
readRegs.add(address)
|
||||||
address = 'R14+' + operandNode.attrib.get('VSIB') + '14'
|
if useIndexedAddr or operandNode.attrib.get('VSIB', '0') != '0':
|
||||||
readRegs.add('R14')
|
indexReg = getIndexReg(instrNode, operandNode)
|
||||||
readRegs.add(operandNode.attrib.get('VSIB') + '14')
|
address += '+' + indexReg
|
||||||
else:
|
readRegs.add(indexReg)
|
||||||
address = getAddrReg(instrNode, operandNode)
|
|
||||||
readRegs.add(address)
|
|
||||||
|
|
||||||
asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']'
|
asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']'
|
||||||
|
|
||||||
@@ -444,7 +455,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
|
|||||||
asm = asm + '; 1: '
|
asm = asm + '; 1: '
|
||||||
|
|
||||||
regMemInit = []
|
regMemInit = []
|
||||||
if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset)
|
if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr)
|
||||||
return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit)
|
return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit)
|
||||||
|
|
||||||
def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr):
|
def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr):
|
||||||
@@ -465,7 +476,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
|
|||||||
writtenRegs = instrInstance.writtenRegs
|
writtenRegs = instrInstance.writtenRegs
|
||||||
|
|
||||||
if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs)
|
if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs)
|
||||||
blockInstrsList = getIndependentInstructions(blockInstrNode, True, writtenRegs|readRegs, writtenRegs|readRegs, 64)
|
blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs, 64)
|
||||||
if debugOutput: print ' bIL: ' + str(blockInstrsList)
|
if debugOutput: print ' bIL: ' + str(blockInstrsList)
|
||||||
|
|
||||||
htmlReports.append('<hr><h3>With blocking instructions for port' +
|
htmlReports.append('<hr><h3>With blocking instructions for port' +
|
||||||
@@ -552,7 +563,7 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
|
|||||||
|
|
||||||
# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of
|
# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of
|
||||||
# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep.
|
# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep.
|
||||||
def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, doNotWriteRegs = None, initialOffset = 0, immediate = 2):
|
def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2):
|
||||||
if not doNotReadRegs: doNotReadRegs = set()
|
if not doNotReadRegs: doNotReadRegs = set()
|
||||||
if not doNotWriteRegs: doNotWriteRegs = set()
|
if not doNotWriteRegs: doNotWriteRegs = set()
|
||||||
doNotReadRegs |= specialRegs
|
doNotReadRegs |= specialRegs
|
||||||
@@ -573,7 +584,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None,
|
|||||||
offset = initialOffset
|
offset = initialOffset
|
||||||
|
|
||||||
for _ in range(maxTPRep):
|
for _ in range(maxTPRep):
|
||||||
instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate)
|
instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate, useIndexedAddr=useIndexedAddr)
|
||||||
if not instrI:
|
if not instrI:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -591,7 +602,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None,
|
|||||||
doNotReadRegs = doNotReadRegs | instrI.writtenRegs
|
doNotReadRegs = doNotReadRegs | instrI.writtenRegs
|
||||||
|
|
||||||
if not independentInstructions:
|
if not independentInstructions:
|
||||||
instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate)
|
instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate, useIndexedAddr=useIndexedAddr)
|
||||||
independentInstructions.append(instrI)
|
independentInstructions.append(instrI)
|
||||||
|
|
||||||
return independentInstructions
|
return independentInstructions
|
||||||
@@ -611,6 +622,12 @@ def hasCommonRegister(instrNode):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def hasExplicitNonVSIBMemOperand(instrNode):
|
||||||
|
for opNode in instrNode.findall('./operand[@type="mem"]'):
|
||||||
|
if opNode.attrib.get('suppressed', '') != '1' and opNode.attrib.get('VSIB', '0') == '0':
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def getThroughputIacaNoInteriteration(instrNode, htmlReports):
|
def getThroughputIacaNoInteriteration(instrNode, htmlReports):
|
||||||
createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm)
|
createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm)
|
||||||
try:
|
try:
|
||||||
@@ -642,7 +659,7 @@ class TPConfig:
|
|||||||
self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes)
|
self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes)
|
||||||
self.note = note
|
self.note = note
|
||||||
|
|
||||||
def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInstrs=True):
|
def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True):
|
||||||
iform = instrNode.attrib['iform']
|
iform = instrNode.attrib['iform']
|
||||||
iclass = instrNode.attrib['iclass']
|
iclass = instrNode.attrib['iclass']
|
||||||
|
|
||||||
@@ -652,7 +669,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
|
|||||||
independentInstrs = []
|
independentInstrs = []
|
||||||
depBreakingInstrs = ''
|
depBreakingInstrs = ''
|
||||||
if computeIndepAndDepBreakingInstrs:
|
if computeIndepAndDepBreakingInstrs:
|
||||||
independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs)
|
independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr)
|
||||||
depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode)
|
depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode)
|
||||||
|
|
||||||
# instructions with multiple configs
|
# instructions with multiple configs
|
||||||
@@ -662,7 +679,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
|
|||||||
if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict:
|
if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict:
|
||||||
continue
|
continue
|
||||||
config = TPConfig(note='With immediate = ' + str(immediate))
|
config = TPConfig(note='With immediate = ' + str(immediate))
|
||||||
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=immediate)
|
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=immediate)
|
||||||
config.depBreakingInstrs = depBreakingInstrs
|
config.depBreakingInstrs = depBreakingInstrs
|
||||||
configs.append(config)
|
configs.append(config)
|
||||||
return configs
|
return configs
|
||||||
@@ -708,7 +725,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInst
|
|||||||
if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]']
|
if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]']
|
||||||
|
|
||||||
if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']:
|
if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']:
|
||||||
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=0x80)
|
config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, immediate=0x80)
|
||||||
|
|
||||||
if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']:
|
if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] or instrNode.attrib['category'] in ['IOSTRINGOP']:
|
||||||
config.init = ['mov DX, 0x80']
|
config.init = ['mov DX, 0x80']
|
||||||
@@ -905,8 +922,8 @@ TPResult = namedtuple('TPResult', ['TP', 'TP_noDepBreaking_noLoop', 'TP_single',
|
|||||||
|
|
||||||
# returns TPResult
|
# returns TPResult
|
||||||
# port usages are averages (when no ports are blocked by other instructions)
|
# port usages are averages (when no ports are blocked by other instructions)
|
||||||
def getThroughputAndUops(instrNode, useDistinctRegs, htmlReports):
|
def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports):
|
||||||
configs = getTPConfigs(instrNode, useDistinctRegs)
|
configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr)
|
||||||
|
|
||||||
minTP = sys.maxint
|
minTP = sys.maxint
|
||||||
minTP_noDepBreaking_noLoop = sys.maxint
|
minTP_noDepBreaking_noLoop = sys.maxint
|
||||||
@@ -1199,7 +1216,7 @@ def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None):
|
|||||||
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
|
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
|
||||||
|
|
||||||
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
|
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
|
||||||
depBreakingInstrs[opNode] = 'TEST R13, R13'
|
depBreakingInstrs[opNode] = 'TEST R15, R15'
|
||||||
|
|
||||||
return depBreakingInstrs
|
return depBreakingInstrs
|
||||||
|
|
||||||
@@ -1237,7 +1254,7 @@ def getDependencyBreakingInstrsForSuppressedOperands(instrNode):
|
|||||||
# on some CPUs, instructions that write flags conditionally also read the flags
|
# on some CPUs, instructions that write flags conditionally also read the flags
|
||||||
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
|
if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue
|
||||||
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
|
if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
|
||||||
depBreakingInstrs += ['TEST R13, R13']
|
depBreakingInstrs += ['TEST R15, R15']
|
||||||
|
|
||||||
return ';'.join(depBreakingInstrs)
|
return ';'.join(depBreakingInstrs)
|
||||||
|
|
||||||
@@ -1858,10 +1875,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
|
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
elif 'MM' in reg:
|
elif 'MM' in reg:
|
||||||
instrI = getInstrInstanceFromNode(instrNode, ['R13', 'R15'], ['R13', 'R15'], True, {startNodeIdx:reg})
|
instrI = getInstrInstanceFromNode(instrNode, ['R12', 'R15'], ['R12', 'R15'], True, {startNodeIdx:reg})
|
||||||
configList.isUpperBound = True
|
configList.isUpperBound = True
|
||||||
for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R13', reg):
|
for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R12', reg):
|
||||||
chainInstrs = 'CMOV' + flag[0] + ' R13, R15; ' + chainInstrI.asm
|
chainInstrs = 'CMOV' + flag[0] + ' R12, R15; ' + chainInstrI.asm
|
||||||
chainLatency = basicLatency['CMOV' + flag[0]] + 1
|
chainLatency = basicLatency['CMOV' + flag[0]] + 1
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
elif targetNode.attrib['type'] == 'mem':
|
elif targetNode.attrib['type'] == 'mem':
|
||||||
@@ -1952,6 +1969,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
addrReg = getAddrReg(instrNode, startNode)
|
addrReg = getAddrReg(instrNode, startNode)
|
||||||
|
indexReg = getIndexReg(instrNode, startNode)
|
||||||
memWidth = int(startNode.attrib.get('width', 0))
|
memWidth = int(startNode.attrib.get('width', 0))
|
||||||
|
|
||||||
if targetNode.attrib['type'] == 'reg':
|
if targetNode.attrib['type'] == 'reg':
|
||||||
@@ -1969,13 +1987,15 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
print 'read from suppressed mem to non-GPR reg not yet supported'
|
print 'read from suppressed mem to non-GPR reg not yet supported'
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if reg in GPRegs:
|
instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg},
|
||||||
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg})
|
useIndexedAddr=(addrMem=='addr_index'))
|
||||||
|
|
||||||
if addrMem == 'addr':
|
if reg in GPRegs:
|
||||||
|
if addrMem in ['addr', 'addr_index']:
|
||||||
# addr -> reg
|
# addr -> reg
|
||||||
|
chainReg = (addrReg if addrMem == 'addr' else indexReg)
|
||||||
chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';'
|
chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';'
|
||||||
chainInstrs += 'XOR {}, {};'.format(addrReg, regTo64(reg)) * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2
|
chainInstrs += 'XOR {}, {};'.format(chainReg, regTo64(reg)) * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
|
||||||
chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep
|
chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
else:
|
else:
|
||||||
@@ -1989,15 +2009,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding
|
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
elif 'MM' in reg:
|
elif 'MM' in reg:
|
||||||
instrI = getInstrInstanceFromNode(instrNode, ['R12'], ['R12'], useDistinctRegs, {targetNodeIdx:reg})
|
if addrMem in ['addr', 'addr_index']:
|
||||||
|
|
||||||
if addrMem == 'addr':
|
|
||||||
# addr -> reg
|
# addr -> reg
|
||||||
configList.isUpperBound = True
|
configList.isUpperBound = True
|
||||||
|
chainReg = (addrReg if addrMem == 'addr' else indexReg)
|
||||||
chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg))
|
chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg))
|
||||||
if isAVXInstr(instrNode):
|
if isAVXInstr(instrNode):
|
||||||
chainInstrs = 'V' + chainInstrs
|
chainInstrs = 'V' + chainInstrs
|
||||||
chainInstrs += 'XOR {}, {};'.format(addrReg, 'R12') * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2
|
chainInstrs += 'XOR {}, {};'.format(chainReg, 'R12') * cRep + ('TEST R15, R15;' if instrReadsFlags else '') # cRep is a multiple of 2
|
||||||
chainLatency = 1 + basicLatency['XOR'] * cRep
|
chainLatency = 1 + basicLatency['XOR'] * cRep
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
elif addrMem == 'addr_VSIB':
|
elif addrMem == 'addr_VSIB':
|
||||||
@@ -2018,11 +2037,13 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
if not ('flag_'+flag) in targetNode.attrib: continue
|
if not ('flag_'+flag) in targetNode.attrib: continue
|
||||||
if not 'w' in targetNode.attrib[('flag_'+flag)]: continue
|
if not 'w' in targetNode.attrib[('flag_'+flag)]: continue
|
||||||
|
|
||||||
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs)
|
instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs,
|
||||||
|
useIndexedAddr=(addrMem=='addr_index'))
|
||||||
|
|
||||||
if addrMem == 'addr':
|
if addrMem in ['addr', 'addr_index']:
|
||||||
# addr -> flag
|
# addr -> flag
|
||||||
chainInstr = 'CMOV' + flag[0] + ' ' + addrReg + ', ' + addrReg
|
chainReg = (addrReg if addrMem == 'addr' else indexReg)
|
||||||
|
chainInstr = 'CMOV' + flag[0] + ' ' + chainReg + ', ' + chainReg
|
||||||
chainLatency = basicLatency['CMOV' + flag[0]]
|
chainLatency = basicLatency['CMOV' + flag[0]]
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency))
|
||||||
else:
|
else:
|
||||||
@@ -2043,14 +2064,17 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
|
|||||||
# mem -> mem
|
# mem -> mem
|
||||||
#################
|
#################
|
||||||
if startNode == targetNode:
|
if startNode == targetNode:
|
||||||
instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs=useDistinctRegs)
|
instrI = getInstrInstanceFromNode(instrNode, [addrReg, indexReg, 'R12'], [addrReg, indexReg, 'R12'], useDistinctRegs=useDistinctRegs,
|
||||||
|
useIndexedAddr=(addrMem=='addr_index'))
|
||||||
|
|
||||||
if addrMem == 'addr':
|
if addrMem in ['addr', 'addr_index']:
|
||||||
# addr -> mem
|
# addr -> mem
|
||||||
configList.isUpperBound = True
|
configList.isUpperBound = True
|
||||||
chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];'
|
chainReg = (addrReg if addrMem == 'addr' else indexReg)
|
||||||
|
memStr = addrReg + ('+'+indexReg if addrMem == 'addr_index' else '')
|
||||||
|
chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + memStr + '];'
|
||||||
chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep
|
chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep
|
||||||
chainInstrs += 'XOR ' + addrReg + ', R12; XOR ' + addrReg + ', R12;' + ('TEST R13, R13;' if instrReadsFlags else '')
|
chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '')
|
||||||
chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR']
|
chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR']
|
||||||
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
|
chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
|
||||||
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
|
||||||
@@ -2144,9 +2168,18 @@ def getLatencies(instrNode, instrNodeList, tpDict, htmlReports):
|
|||||||
|
|
||||||
addrMemList = ['']
|
addrMemList = ['']
|
||||||
if opNode1.attrib['type']=='mem':
|
if opNode1.attrib['type']=='mem':
|
||||||
addrMemList = ['addr', 'mem']+(['addr_VSIB'] if 'VSIB' in opNode1.attrib else [])
|
|
||||||
elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']):
|
|
||||||
addrMemList = ['addr']
|
addrMemList = ['addr']
|
||||||
|
if 'VSIB' in opNode1.attrib:
|
||||||
|
addrMemList.append('addr_VSIB')
|
||||||
|
elif opNode1.attrib.get('suppressed', '') != '1':
|
||||||
|
addrMemList.append('addr_index')
|
||||||
|
addrMemList.append('mem') # mem added last; order is relevant for html output
|
||||||
|
elif opNode1.attrib['type']=='agen' and ('B' in instrNode.attrib['agen'] or 'I' in instrNode.attrib['agen']):
|
||||||
|
addrMemList = []
|
||||||
|
if 'B' in instrNode.attrib['agen']:
|
||||||
|
addrMemList.append('addr')
|
||||||
|
if 'I' in instrNode.attrib['agen']:
|
||||||
|
addrMemList.append('addr_index')
|
||||||
|
|
||||||
for addrMem in addrMemList:
|
for addrMem in addrMemList:
|
||||||
minLatDistinctRegs = 0
|
minLatDistinctRegs = 0
|
||||||
@@ -2613,17 +2646,19 @@ def main():
|
|||||||
|
|
||||||
tpDict = {}
|
tpDict = {}
|
||||||
tpDictSameReg = {}
|
tpDictSameReg = {}
|
||||||
|
tpDictIndexedAddr = {}
|
||||||
tpDictNoInteriteration = {}
|
tpDictNoInteriteration = {}
|
||||||
|
|
||||||
if args.tpInput is not None:
|
if args.tpInput is not None:
|
||||||
with open(args.tpInput, 'rb') as f:
|
with open(args.tpInput, 'rb') as f:
|
||||||
pTpDict, pTpDictSameReg, pTpDictNoInteriteration = pickle.load(f)
|
pTpDict, pTpDictSameReg, pTpDictIndexedAddr, pTpDictNoInteriteration = pickle.load(f)
|
||||||
tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()}
|
tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()}
|
||||||
tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()}
|
tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()}
|
||||||
|
tpDictIndexedAddr = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictIndexedAddr.items()}
|
||||||
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
|
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
|
||||||
else:
|
else:
|
||||||
for i, instrNode in enumerate(instrNodeList):
|
for i, instrNode in enumerate(instrNodeList):
|
||||||
#if not 'MOVZX_NOREX' in instrNode.attrib['string']: continue
|
#if not 'MOV_NOREX' in instrNode.attrib['string']: continue
|
||||||
print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
||||||
|
|
||||||
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
||||||
@@ -2631,7 +2666,10 @@ def main():
|
|||||||
hasCommonReg = hasCommonRegister(instrNode)
|
hasCommonReg = hasCommonRegister(instrNode)
|
||||||
if hasCommonReg: htmlReports.append('<h2 id="distinctRegs">With different registers for different operands</h2>\n')
|
if hasCommonReg: htmlReports.append('<h2 id="distinctRegs">With different registers for different operands</h2>\n')
|
||||||
|
|
||||||
tpResult = getThroughputAndUops(instrNode, True, htmlReports)
|
hasExplMemOp = hasExplicitNonVSIBMemOperand(instrNode)
|
||||||
|
if hasExplMemOp: htmlReports.append('<h2 id="nonIndexedAddr">With a non-indexed addressing mode</h2>\n')
|
||||||
|
|
||||||
|
tpResult = getThroughputAndUops(instrNode, True, False, htmlReports)
|
||||||
print instrNode.attrib['string'] + " - tp: " + str(tpResult)
|
print instrNode.attrib['string'] + " - tp: " + str(tpResult)
|
||||||
|
|
||||||
if tpResult:
|
if tpResult:
|
||||||
@@ -2639,10 +2677,16 @@ def main():
|
|||||||
|
|
||||||
if hasCommonReg:
|
if hasCommonReg:
|
||||||
htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n')
|
htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n')
|
||||||
tpResultSameReg = getThroughputAndUops(instrNode, False, htmlReports)
|
tpResultSameReg = getThroughputAndUops(instrNode, False, False, htmlReports)
|
||||||
if tpResultSameReg:
|
if tpResultSameReg:
|
||||||
tpDictSameReg[instrNode] = tpResultSameReg
|
tpDictSameReg[instrNode] = tpResultSameReg
|
||||||
|
|
||||||
|
if hasExplMemOp:
|
||||||
|
htmlReports.append('<hr><h2 id="indexedAddr">With an indexed addressing mode</h2>\n')
|
||||||
|
tpResultIndexed = getThroughputAndUops(instrNode, True, True, htmlReports)
|
||||||
|
if tpResultIndexed:
|
||||||
|
tpDictIndexedAddr[instrNode] = tpResultIndexed
|
||||||
|
|
||||||
if useIACA and iacaVersion in ['2.1', '2.2']:
|
if useIACA and iacaVersion in ['2.1', '2.2']:
|
||||||
htmlReports.append('<hr><h2 id="noInteriteration">With the -no_interiteration flag</h2>\n')
|
htmlReports.append('<hr><h2 id="noInteriteration">With the -no_interiteration flag</h2>\n')
|
||||||
tp = getThroughputIacaNoInteriteration(instrNode, htmlReports)
|
tp = getThroughputIacaNoInteriteration(instrNode, htmlReports)
|
||||||
@@ -2650,7 +2694,7 @@ def main():
|
|||||||
|
|
||||||
if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
||||||
with open('tp_' + arch + '.pickle', 'wb') as f:
|
with open('tp_' + arch + '.pickle', 'wb') as f:
|
||||||
pickle.dump((tpDict, tpDictSameReg, tpDictNoInteriteration), f)
|
pickle.dump((tpDict, tpDictSameReg, tpDictIndexedAddr, tpDictNoInteriteration), f)
|
||||||
|
|
||||||
num_ports = len(tpDict.values()[0].unblocked_ports)
|
num_ports = len(tpDict.values()[0].unblocked_ports)
|
||||||
|
|
||||||
@@ -2669,7 +2713,7 @@ def main():
|
|||||||
latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()}
|
latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()}
|
||||||
elif not useIACA or iacaVersion == '2.1':
|
elif not useIACA or iacaVersion == '2.1':
|
||||||
for i, instrNode in enumerate(instrNodeList):
|
for i, instrNode in enumerate(instrNodeList):
|
||||||
#if not 'LEA' in instrNode.attrib['string']: continue
|
#if not 'ADC' in instrNode.attrib['string']: continue
|
||||||
print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
||||||
|
|
||||||
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
|
||||||
@@ -2692,6 +2736,7 @@ def main():
|
|||||||
# the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently
|
# the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently
|
||||||
portCombinationsResultDict = {}
|
portCombinationsResultDict = {}
|
||||||
portCombinationsResultDictSameReg = {}
|
portCombinationsResultDictSameReg = {}
|
||||||
|
portCombinationsResultDictIndexedAddr = {}
|
||||||
|
|
||||||
if not args.noPorts:
|
if not args.noPorts:
|
||||||
# iforms of instructions that are potentially zero-latency instructions
|
# iforms of instructions that are potentially zero-latency instructions
|
||||||
@@ -2791,7 +2836,7 @@ def main():
|
|||||||
|
|
||||||
for i, instrNode in enumerate(instrNodeList):
|
for i, instrNode in enumerate(instrNodeList):
|
||||||
if not instrNode in tpDict:
|
if not instrNode in tpDict:
|
||||||
# don't iterate over the keys of unblocked_ports_dict directly because of the ordering
|
# don't iterate over the keys of tpDict directly because of the ordering
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
|
||||||
@@ -2799,96 +2844,104 @@ def main():
|
|||||||
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>']
|
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>']
|
||||||
|
|
||||||
for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]):
|
for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]):
|
||||||
|
for useIndexedAddr in ([False, True] if useDistinctRegs and (instrNode in tpDictIndexedAddr) else [False]):
|
||||||
|
tpResult = None
|
||||||
|
|
||||||
tpResult = None
|
if not useDistinctRegs:
|
||||||
|
tp1 = tpDict[instrNode]
|
||||||
|
tp2 = tpDictSameReg[instrNode]
|
||||||
|
if (tp1.uops == tp2.uops and tp1.fused_uops == tp2.fused_uops): continue
|
||||||
|
tpResult = tp2
|
||||||
|
htmlReports.append('<hr><h2>With the same register for different operands</h2>')
|
||||||
|
elif useIndexedAddr:
|
||||||
|
tpResult = tpDictIndexedAddr[instrNode]
|
||||||
|
htmlReports.append('<hr><h2>With an indexed addressing mode</h2>')
|
||||||
|
else:
|
||||||
|
tpResult = tpDict[instrNode]
|
||||||
|
|
||||||
if useDistinctRegs:
|
rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2))
|
||||||
tpResult = tpDict[instrNode]
|
|
||||||
else:
|
|
||||||
if tpDict[instrNode].uops == tpDictSameReg[instrNode].uops: continue
|
|
||||||
tpResult = tpDictSameReg[instrNode]
|
|
||||||
htmlReports.append('<h2>With the same register for different operands</h2>')
|
|
||||||
|
|
||||||
rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2))
|
if not useIACA and tpResult.config.preInstrNodes:
|
||||||
|
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
|
||||||
|
|
||||||
if not useIACA and tpResult.config.preInstrNodes:
|
# use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen
|
||||||
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
|
used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05}
|
||||||
|
if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports)
|
||||||
|
|
||||||
# use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen
|
if not isAVXInstr(instrNode):
|
||||||
used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05}
|
blockingInstrs = blockingInstructionsDictNonAVX
|
||||||
if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports)
|
sortedPortCombinations = sortedPortCombinationsNonAVX
|
||||||
|
else:
|
||||||
|
blockingInstrs = blockingInstructionsDictNonSSE
|
||||||
|
sortedPortCombinations = sortedPortCombinationsNonSSE
|
||||||
|
|
||||||
if not isAVXInstr(instrNode):
|
uopsCombinationList = []
|
||||||
blockingInstrs = blockingInstructionsDictNonAVX
|
|
||||||
sortedPortCombinations = sortedPortCombinationsNonAVX
|
|
||||||
else:
|
|
||||||
blockingInstrs = blockingInstructionsDictNonSSE
|
|
||||||
sortedPortCombinations = sortedPortCombinationsNonSSE
|
|
||||||
|
|
||||||
uopsCombinationList = []
|
if not used_ports:
|
||||||
|
htmlReports.append('No uops')
|
||||||
|
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0):
|
||||||
|
# one uop instruction
|
||||||
|
uopsCombinationList = [(frozenset(used_ports), 1)]
|
||||||
|
htmlReports.append('<hr>Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports))
|
||||||
|
elif rem_uops > 0 and not isAMDCPU():
|
||||||
|
for combination in sortedPortCombinations:
|
||||||
|
if not combination.intersection(used_ports): continue
|
||||||
|
|
||||||
if not used_ports:
|
prevUopsOnCombination = 0
|
||||||
htmlReports.append('No uops')
|
for prev_combination, prev_uops in uopsCombinationList:
|
||||||
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0):
|
if prev_combination.issubset(combination):
|
||||||
# one uop instruction
|
prevUopsOnCombination += prev_uops
|
||||||
uopsCombinationList = [(frozenset(used_ports), 1)]
|
|
||||||
htmlReports.append('<hr>Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports))
|
|
||||||
elif rem_uops > 0 and not isAMDCPU():
|
|
||||||
for combination in sortedPortCombinations:
|
|
||||||
if not combination.intersection(used_ports): continue
|
|
||||||
|
|
||||||
prevUopsOnCombination = 0
|
if not useIACA:
|
||||||
for prev_combination, prev_uops in uopsCombinationList:
|
if tpResult.config.preInstrNodes:
|
||||||
if prev_combination.issubset(combination):
|
for preInstrNode in tpResult.config.preInstrNodes:
|
||||||
prevUopsOnCombination += prev_uops
|
for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]:
|
||||||
|
if pre_comb.issubset(combination):
|
||||||
|
prevUopsOnCombination += pre_uops
|
||||||
|
|
||||||
if not useIACA:
|
nPortsInComb = sum(len(str(x)) for x in combination)
|
||||||
if tpResult.config.preInstrNodes:
|
blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10)
|
||||||
for preInstrNode in tpResult.config.preInstrNodes:
|
blockInstrRep = min(blockInstrRep, 100)
|
||||||
for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]:
|
uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
|
||||||
if pre_comb.issubset(combination):
|
if uopsOnBlockedPorts is None:
|
||||||
prevUopsOnCombination += pre_uops
|
print 'no uops on blocked ports: ' + str(combination)
|
||||||
|
continue
|
||||||
|
|
||||||
nPortsInComb = sum(len(str(x)) for x in combination)
|
uopsOnBlockedPorts -= prevUopsOnCombination
|
||||||
blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10)
|
|
||||||
blockInstrRep = min(blockInstrRep, 100)
|
|
||||||
uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
|
|
||||||
if uopsOnBlockedPorts is None:
|
|
||||||
print 'no uops on blocked ports: ' + str(combination)
|
|
||||||
continue
|
|
||||||
|
|
||||||
uopsOnBlockedPorts -= prevUopsOnCombination
|
if rem_uops < uopsOnBlockedPorts:
|
||||||
|
print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts)
|
||||||
|
|
||||||
if rem_uops < uopsOnBlockedPorts:
|
if uopsOnBlockedPorts <= 0: continue
|
||||||
print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts)
|
|
||||||
|
|
||||||
if uopsOnBlockedPorts <= 0: continue
|
if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1':
|
||||||
|
# for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which
|
||||||
|
# seems to lead to replays of the store data uops
|
||||||
|
uopsOnBlockedPorts = 1
|
||||||
|
|
||||||
if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1':
|
uopsCombinationList.append((combination, uopsOnBlockedPorts))
|
||||||
# for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which
|
|
||||||
# seems to lead to replays of the store data uops
|
|
||||||
uopsOnBlockedPorts = 1
|
|
||||||
|
|
||||||
uopsCombinationList.append((combination, uopsOnBlockedPorts))
|
htmlReports.append('<strong>⇨ ' +
|
||||||
|
((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') +
|
||||||
|
' that can only use port' +
|
||||||
|
('s {' if len(combination)>1 else ' ') +
|
||||||
|
str(list(combination))[1:-1] +
|
||||||
|
('}' if len(combination)>1 else '') + '</strong>')
|
||||||
|
|
||||||
htmlReports.append('<strong>⇨ ' +
|
rem_uops -= uopsOnBlockedPorts
|
||||||
((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') +
|
if rem_uops <= 0: break
|
||||||
' that can only use port' +
|
|
||||||
('s {' if len(combination)>1 else ' ') +
|
|
||||||
str(list(combination))[1:-1] +
|
|
||||||
('}' if len(combination)>1 else '') + '</strong>')
|
|
||||||
|
|
||||||
rem_uops -= uopsOnBlockedPorts
|
# on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports
|
||||||
if rem_uops <= 0: break
|
# we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate
|
||||||
|
uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList]
|
||||||
|
|
||||||
# on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports
|
if not useDistinctRegs:
|
||||||
# we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate
|
portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
|
||||||
uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList]
|
elif useIndexedAddr:
|
||||||
|
portCombinationsResultDictIndexedAddr[instrNode] = uopsCombinationList
|
||||||
|
else:
|
||||||
|
portCombinationsResultDict[instrNode] = uopsCombinationList
|
||||||
|
|
||||||
if useDistinctRegs:
|
|
||||||
portCombinationsResultDict[instrNode] = uopsCombinationList
|
|
||||||
else:
|
|
||||||
portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
|
|
||||||
|
|
||||||
writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
|
||||||
|
|
||||||
@@ -2905,13 +2958,18 @@ def main():
|
|||||||
else:
|
else:
|
||||||
resultNode = archNode.find('./measurement')
|
resultNode = archNode.find('./measurement')
|
||||||
|
|
||||||
tpResult_dr = tpDict[instrNode]
|
applicableResults = [(tpDict[instrNode], portCombinationsResultDict.get(instrNode, None), '')]
|
||||||
tpResult_sr = tpDictSameReg.get(instrNode, tpResult_dr)
|
for otherTPDict, otherPCDict, suffix in [(tpDictSameReg, portCombinationsResultDictSameReg, '_same_reg'),
|
||||||
|
(tpDictIndexedAddr, portCombinationsResultDictIndexedAddr, '_indexed')]:
|
||||||
for tpResult in ([tpResult_dr, tpResult_sr] if tpResult_dr.uops != tpResult_sr.uops else [tpResult_dr]):
|
if instrNode in otherTPDict:
|
||||||
suffix = ('' if tpResult == tpResult_dr else '_same_reg')
|
t1 = tpDict[instrNode]
|
||||||
curPortCombinationsResultDict = (portCombinationsResultDict if tpResult == tpResult_dr else portCombinationsResultDictSameReg)
|
t2 = otherTPDict[instrNode]
|
||||||
|
p1 = portCombinationsResultDict.get(instrNode, None)
|
||||||
|
p2 = otherPCDict.get(instrNode, None)
|
||||||
|
if (t1.uops != t2.uops or t1.fused_uops != t2.fused_uops or ((p2 is not None) and (p1 != p2))):
|
||||||
|
applicableResults.append((t2, p2, suffix))
|
||||||
|
|
||||||
|
for tpResult, portUsageList, suffix in applicableResults:
|
||||||
uops = tpResult.uops
|
uops = tpResult.uops
|
||||||
uopsFused = tpResult.fused_uops
|
uopsFused = tpResult.fused_uops
|
||||||
if useIACA:
|
if useIACA:
|
||||||
@@ -2937,10 +2995,7 @@ def main():
|
|||||||
|
|
||||||
portPrefix = ('p' if isIntelCPU() else 'FP')
|
portPrefix = ('p' if isIntelCPU() else 'FP')
|
||||||
computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
|
computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
|
||||||
if instrNode in curPortCombinationsResultDict:
|
if portUsageList:
|
||||||
portUsageList = curPortCombinationsResultDict[instrNode]
|
|
||||||
if not portUsageList: continue
|
|
||||||
|
|
||||||
resultNode.attrib['ports'+suffix] = computePortStr(portUsageList)
|
resultNode.attrib['ports'+suffix] = computePortStr(portUsageList)
|
||||||
|
|
||||||
portUsageWithDivList = list(portUsageList)
|
portUsageWithDivList = list(portUsageList)
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ def latencyNodeToStr(latNode, sameReg, addr_mem):
|
|||||||
ret += ', with the same register for different operands'
|
ret += ', with the same register for different operands'
|
||||||
if addr_mem == 'addr':
|
if addr_mem == 'addr':
|
||||||
ret += ' (address, base register)'
|
ret += ' (address, base register)'
|
||||||
elif addr_mem == 'addr_VSIB':
|
elif addr_mem in ['addr_index', 'addr_VSIB']:
|
||||||
ret += ' (address, index register)'
|
ret += ' (address, index register)'
|
||||||
elif addr_mem == 'mem':
|
elif addr_mem == 'mem':
|
||||||
ret += ' (memory)'
|
ret += ' (memory)'
|
||||||
@@ -144,7 +144,7 @@ def getLatencyTableEntry(measurementNode):
|
|||||||
|
|
||||||
for latNode in measurementNode.findall('./latency'):
|
for latNode in measurementNode.findall('./latency'):
|
||||||
for sameReg in [False, True]:
|
for sameReg in [False, True]:
|
||||||
for addr_mem in ['', 'addr', 'mem']:
|
for addr_mem in ['', 'addr', 'addr_index', 'addr_VSIB', 'mem']:
|
||||||
suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '')
|
suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '')
|
||||||
if 'cycles'+suffix in latNode.attrib:
|
if 'cycles'+suffix in latNode.attrib:
|
||||||
cycles = int(latNode.attrib['cycles'+suffix])
|
cycles = int(latNode.attrib['cycles'+suffix])
|
||||||
|
|||||||
Reference in New Issue
Block a user