diff --git a/.gitignore b/.gitignore
index d43152b..b92b833 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,11 +3,14 @@
*.o
*.o.ur-safe
*.mod
+*.dwo
*.pyc
*.html
+*.xml
*.asm
*.s
-
+*.tar.gz
+.vscode
# Kernel ignore files/directories
kernel/.cache.mk
diff --git a/kernel/Makefile b/kernel/Makefile
index cd23eaa..89c0c85 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,5 +23,5 @@ all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
- rm -f ../common/*.o ../common/*.ur-safe ../common/*.dwo
- rm -rf *.o *.ko *.mod *.mod.c *.ur-safe *.dwo .tmp_versions modules.order Module.symvers
+ rm -f ../common/*.o ../common/*.ur-safe ../common/*.dwo ../common/.nanoBench.*
+ rm -rf *.o *.ko *.mod *.mod.c *.ur-safe *.dwo .tmp_versions .cache.mk .nb.* .nb_km.* .modules.* .Module.* modules.order Module.symvers
diff --git a/tools/cpuBench/addIACAToXML.sh b/tools/cpuBench/addIACAToXML.sh
new file mode 100755
index 0000000..0673f85
--- /dev/null
+++ b/tools/cpuBench/addIACAToXML.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+set -x
+
+./cpuBench.py -iaca "$1/iaca-version-2.1/bin/iaca.sh" -input "$2" -arch 'NHM' > output_NHM2.1.txt 2>error_NHM2.1.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'NHM' > output_NHM2.2.txt 2>error_NHM2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.1/bin/iaca.sh" -input result.xml -arch 'WSM' > output_WSM2.1.txt 2>error_WSM2.1.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'WSM' > output_WSM2.2.txt 2>error_WSM2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.1/bin/iaca.sh" -input result.xml -arch 'SNB' > output_SNB2.1.txt 2>error_SNB2.1.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'SNB' > output_SNB2.2.txt 2>error_SNB2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'SNB' > output_SNB2.3.txt 2>error_SNB2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-2.1/bin/iaca.sh" -input result.xml -arch 'IVB' > output_IVB2.1.txt 2>error_IVB2.1.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'IVB' > output_IVB2.2.txt 2>error_IVB2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'IVB' > output_IVB2.3.txt 2>error_IVB2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-2.1/bin/iaca.sh" -input result.xml -arch 'HSW' > output_HSW2.1.txt 2>error_HSW2.1.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'HSW' > output_HSW2.2.txt 2>error_HSW2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'HSW' > output_HSW2.3.txt 2>error_HSW2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-3.0/iaca" -input result.xml -arch 'HSW' > output_HSW3.0.txt 2>error_HSW3.0.txt
+./cpuBench.py -iaca "$1/iaca-version-2.2/bin/iaca.sh" -input result.xml -arch 'BDW' > output_BDW2.2.txt 2>error_BDW2.2.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'BDW' > output_BDW2.3.txt 2>error_BDW2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-3.0/iaca" -input result.xml -arch 'BDW' > output_BDW3.0.txt 2>error_BDW3.0.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'SKL' > output_SKL2.3.txt 2>error_SKL2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-3.0/iaca" -input result.xml -arch 'SKL' > output_SKL3.0.txt 2>error_SKL3.0.txt
+./cpuBench.py -iaca "$1/iaca-version-2.3/bin/iaca.sh" -input result.xml -arch 'SKX' > output_SKX2.3.txt 2>error_SKX2.3.txt
+./cpuBench.py -iaca "$1/iaca-version-3.0/iaca" -input result.xml -arch 'SKX' > output_SKX3.0.txt 2>error_SKX3.0.txt
diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py
index 8f28773..beebb7f 100755
--- a/tools/cpuBench/cpuBench.py
+++ b/tools/cpuBench/cpuBench.py
@@ -37,16 +37,19 @@ supportsAVX = False
instrNodeList = [] # list of all XML instruction nodes that are not filtered out
instrNodeDict = {} # dict from instrNode.attrib['string'] to instrNode
-globalDoNotWriteRegs = {'R13', 'R13D', 'R13W', 'R13B', 'R14', 'R14D', 'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo
+#R15: loop counter
#R14: reserved for memory addresses (base)
#R13: reserved for memory addresses (index)
-#R15: loop counter
+globalDoNotWriteRegs = {'R15', 'R15D', 'R15W', 'R15B', 'RSP', 'ESP' , 'SP', 'SPL',
+ 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15',
+ 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'K0'}
+memRegs = {'R14', 'R14D', 'R14W', 'R14B', 'R13', 'R13D', 'R13W', 'R13B', 'RDI', 'EDI', 'DI', 'DIL', 'RSI', 'ESI', 'SI', 'SIL', 'RBP', 'EBP', 'BP', 'BPL'}
+
specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP',
'TR', 'TSC', 'TSCAUX', 'X87CONTROL', 'X87POP', 'X87POP2', 'X87PUSH', 'X87STATUS', 'X87TAG', 'XCR0', 'XMM0dq', 'CR0', 'CR2', 'CR3', 'CR4', 'CR8', 'ERROR',
'BND0', 'BND1', 'BND2', 'BND3'}
-
maxTPRep = 16
#iforms of serializing and memory-ordering instructions according to Ch. 8.3 of the Intel manual
@@ -63,7 +66,7 @@ def isIntelCPU():
def getAddrReg(instrNode, opNode):
if opNode.attrib.get('suppressed', '0') == '1':
return opNode.attrib['base']
- elif instrNode.attrib.get('rex', '1') == '0':
+ elif instrNode.attrib.get('high8', '') != '':
return 'RDI'
else:
return 'R14'
@@ -73,7 +76,7 @@ def getIndexReg(instrNode, opNode):
return regTo64(opNode.attrib['index'])
elif opNode.attrib.get('VSIB', '0') != '0':
return opNode.attrib.get('VSIB') + '14'
- elif instrNode.attrib.get('rex', '1') == '0':
+ elif instrNode.attrib.get('high8', '') != '':
return 'RSI'
else:
return 'R13'
@@ -161,7 +164,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
initCode = '; '.join(init)
useLateInit = any((reg in initCode.upper()) for reg in High8Regs)
- if instrNode is not None and (instrNode.attrib.get('vex', '') == '1' or instrNode.attrib.get('evex', '') == '1'):
+ if (instrNode is not None) and (instrNode.attrib.get('vex', '') == '1' or instrNode.attrib.get('evex', '') == '1'):
# vex and evex encoded instructions need a warm-up period before memory reads operate at full speed;
# https://software.intel.com/en-us/forums/intel-isa-extensions/topic/710248
reg = 'ZMM' if 'ZMM' in instrNode.attrib['iform'] else 'YMM'
@@ -172,6 +175,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
initCode = avxInitCode + initCode
nanoBenchCmd = 'sudo ./kernel-nanoBench.sh'
+ nanoBenchCmd += ' -f'
nanoBenchCmd += ' -unroll ' + str(unrollCount)
if loopCount > 0: nanoBenchCmd += ' -loop ' + str(loopCount)
if basicMode: nanoBenchCmd += ' -basic'
@@ -205,20 +209,20 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
if evt == 'UOPS':
if arch in ['CON', 'WOL']: evt = 'RS_UOPS_DISPATCHED'
elif arch in ['NHM', 'WSM']: evt = 'UOPS_RETIRED.ANY'
- elif arch in ['SNB']: evt = 'UOPS_RETIRED.ALL'
+ elif arch in ['SNB', 'ADL-E']: evt = 'UOPS_RETIRED.ALL'
elif arch in ['HSW']: evt = 'UOPS_EXECUTED.CORE'
- elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: evt = 'UOPS_EXECUTED.THREAD'
+ elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: evt = 'UOPS_EXECUTED.THREAD'
localHtmlReports.append('
' + evt + ': ' + str(value) + '\n')
localHtmlReports.append('\n')
- if arch in ['NHM', 'WSM'] and 'UOPS_PORT3' in ret:
+ if arch in ['NHM', 'WSM'] and 'UOPS_PORT_3' in ret:
# Workaround for broken port4 and port5 counters
- ret['UOPS_PORT4'] = ret['UOPS_PORT3']
- ret['UOPS_PORT5'] = max(0, ret['UOPS'] - ret['UOPS_PORT0'] - ret['UOPS_PORT1'] - ret['UOPS_PORT2'] - ret['UOPS_PORT3'] - ret['UOPS_PORT4'])
+ ret['UOPS_PORT_4'] = ret['UOPS_PORT_3']
+ ret['UOPS_PORT_5'] = max(0, ret['UOPS'] - ret['UOPS_PORT_0'] - ret['UOPS_PORT_1'] - ret['UOPS_PORT_2'] - ret['UOPS_PORT_3'] - ret['UOPS_PORT_4'])
- if arch in ['SNB'] and all(('UOPS_PORT'+str(p) in ret) for p in range(0,6)):
+ if arch in ['SNB'] and all(('UOPS_PORT_'+str(p) in ret) for p in range(0,6)):
# some retired uops are not executed due to, e.g., move el. and zero idioms; however, using the sum of the uops on all ports may overcount due to replays
- ret['UOPS'] = min(ret['UOPS'], sum(ret['UOPS_PORT'+str(p)] for p in range(0,6)))
+ ret['UOPS'] = min(ret['UOPS'], sum(ret['UOPS_PORT_'+str(p)] for p in range(0,6)))
if isAMDCPU():
ret['Core cycles'] = ret['APERF']
@@ -235,7 +239,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
# return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
if any('PORT' in e for e in ret):
- maxPortUops = max(v/(len(e)-9) for e,v in ret.items() if 'PORT' in e)
+ maxPortUops = max(v/len(e.replace('UOPS_PORT_', '')) for e,v in ret.items() if 'PORT' in e)
if maxPortUops * .98 > ret['Core cycles']:
print('Repeating experiment because there were more uops on a port than core cycles')
return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
@@ -271,62 +275,77 @@ def getEventConfig(event):
if arch in ['CON', 'WOL']: return 'A0.00' # RS_UOPS_DISPATCHED
if arch in ['NHM', 'WSM', 'SNB' ]: return 'C2.01' # UOPS_RETIRED.ANY
if arch in ['SNB']: return 'C2.01' # UOPS_RETIRED.ALL
+ if arch in ['ADL-E']: return 'C2.00' # UOPS_RETIRED.ALL
if arch in ['HSW']: return 'B1.02' # UOPS_EXECUTED.CORE; note: may undercount due to erratum HSD30
- if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'B1.01' # UOPS_EXECUTED.THREAD
+ if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return 'B1.01' # UOPS_EXECUTED.THREAD
if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0C1.00'
if event == 'RETIRE_SLOTS':
- if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'C2.02'
+ if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return 'C2.02'
if event == 'UOPS_MITE':
- if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '79.04'
- if event == 'UOPS_MITE>0':
- if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '79.04.CMSK=1'
+ if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return '79.04'
+ if event == 'UOPS_MITE>=1':
+ if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return '79.04.CMSK=1'
if event == 'UOPS_MS':
if arch in ['NHM', 'WSM']: return 'D1.02'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '79.30'
- if event == 'UOPS_PORT0':
+ if arch in ['ADL-P']: return '79.20'
+ if arch in ['ADL-E']: return 'C2.01'
+ if event == 'UOPS_PORT_0':
if arch in ['CON', 'WOL']: return 'A1.01.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.01'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'A1.01'
- if event == 'UOPS_PORT1':
+ if arch in ['ADL-P']: return 'B2.01'
+ if event == 'UOPS_PORT_1':
if arch in ['CON', 'WOL']: return 'A1.02.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.02'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'A1.02'
- if event == 'UOPS_PORT2':
+ if arch in ['ADL-P']: return 'B2.02'
+ if event == 'UOPS_PORT_2':
if arch in ['CON', 'WOL']: return 'A1.04.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.04'
if arch in ['SNB', 'IVB']: return 'A1.0C'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.04'
- if event == 'UOPS_PORT3':
+ if event == 'UOPS_PORT_3':
if arch in ['CON', 'WOL']: return 'A1.08.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.08'
if arch in ['SNB', 'IVB']: return 'A1.30'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.08'
- if event == 'UOPS_PORT4':
+ if event == 'UOPS_PORT_4':
if arch in ['CON', 'WOL']: return 'A1.10.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.10'
if arch in ['SNB', 'IVB']: return 'A1.40'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.10'
- if event == 'UOPS_PORT5':
+ if event == 'UOPS_PORT_5':
if arch in ['CON', 'WOL']: return 'A1.20.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.20'
if arch in ['SNB', 'IVB']: return 'A1.80'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'A1.20'
- if event == 'UOPS_PORT6':
+ if event == 'UOPS_PORT_6':
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return 'A1.40'
- if event == 'UOPS_PORT7':
+ if arch in ['ADL-P']: return 'B2.40'
+ if event == 'UOPS_PORT_7':
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.80'
- if event == 'UOPS_PORT23':
+ if event == 'UOPS_PORT_23':
if arch in ['ICL', 'TGL', 'RKL']: return 'A1.04'
- if event == 'UOPS_PORT49':
+ if event == 'UOPS_PORT_49':
if arch in ['ICL', 'TGL', 'RKL']: return 'A1.10'
- if event == 'UOPS_PORT78':
+ if arch in ['ADL-P']: return 'B2.10'
+ if event == 'UOPS_PORT_78':
if arch in ['ICL', 'TGL', 'RKL']: return 'A1.80'
+ if arch in ['ADL-P']: return 'B2.80'
+ if event == 'UOPS_PORT_5B':
+ if arch in ['ADL-P']: return 'B2.20'
+ if event == 'UOPS_PORT_5B>=2':
+ if arch in ['ADL-P']: return 'B2.20.CMSK=2'
+ if event == 'UOPS_PORT_23A':
+ if arch in ['ADL-P']: return 'B2.04'
if event == 'DIV_CYCLES':
if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return '14.01' # undocumented on HSW, but seems to work
if arch in ['ICL', 'TGL', 'RKL']: return '14.09'
if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0D3.00'
+ if arch in ['ADL-P']: return 'B0.09.CMSK=1'
if event == 'ILD_STALL.LCP':
- if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '87.01'
+ if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return '87.01'
if event == 'INST_DECODED.DEC0':
if arch in ['NHM', 'WSM']: return '18.01'
if event == 'FpuPipeAssignment.Total0':
@@ -363,6 +382,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
if not opRegDict: opRegDict = {}
if instrNode.attrib['extension'] == 'AVX2GATHER': useDistinctRegs=True
+ hasMemOperand = len(instrNode.findall('./operand[@type="mem"]'))>0
readRegs = set()
writtenRegs = set()
@@ -384,7 +404,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
commonReg = None
if not useDistinctRegs:
commonRegs = findCommonRegisters(instrNode)
- commonRegs -= set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs
+ commonRegs -= set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs|(memRegs if hasMemOperand else set())
if commonRegs:
commonReg = sortRegs(commonRegs)[0]
@@ -419,7 +439,7 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
if len(regsList) > 1:
ignoreRegs = set()
if operandNode.attrib.get('w', '0') == '1':
- ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|set(opRegDict.values())
+ ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|set(opRegDict.values())|(memRegs if hasMemOperand else set())
if operandNode.attrib.get('r', '0') == '1':
ignoreRegs |= set(doNotReadRegs)|writtenRegs|readRegs|set(opRegDict.values())
regsList = [x for x in regsList if not any(getCanonicalReg(x) == getCanonicalReg(y) for y in ignoreRegs)]
@@ -527,12 +547,12 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
writtenRegs = instrInstance.writtenRegs
if debugOutput: print(' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs))
- blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs, 64)
+ blockInstrsList = getIndependentInstructions(blockInstrNode, True, False, writtenRegs|readRegs, writtenRegs|readRegs|memRegs, 64)
if debugOutput: print(' bIL: ' + str(blockInstrsList))
htmlReports.append('
With blocking instructions for port' +
('s {' if len(blockedPorts)>1 else ' ') +
- str(list(blockedPorts))[1:-1] +
+ str(sorted(blockedPorts))[1:-1] +
('}' if len(blockedPorts)>1 else '') + ':
')
if useIACA:
@@ -556,10 +576,10 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
instrUopsOnBlockedPorts = 0.0
for p in blockedPorts:
- allPortsCol = allPortsLine.split('|')[p+2].split()
+ allPortsCol = allPortsLine.split('|')[int(p)+2].split()
allUopsOnBlockedPorts += float(allPortsCol[0])
- instrPortsCol = instrPortsLine.split('|')[p+2].split()
+ instrPortsCol = instrPortsLine.split('|')[int(p)+2].split()
if instrPortsCol:
instrUopsOnBlockedPorts += float(instrPortsCol[0])
@@ -576,9 +596,14 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
if isIntelCPU():
if arch in ['NHM', 'WSM']:
# Needed for workaround for broken port 5 counter
- events = ['UOPS_PORT'+str(p) for p in range(0,6)] + ['UOPS']
+ events = ['UOPS_PORT_'+str(p) for p in range(0,6)] + ['UOPS']
else:
- events = ['UOPS_PORT'+str(p) for p in blockedPorts]
+ events = ['UOPS_PORT_'+str(p) for p in blockedPorts]
+
+ if (arch in ['ADL-P']) and ('5' in blockedPorts): # note that any port combination that contains B also contains 5
+ events += ['UOPS_PORT_5B']
+ if 'B' not in blockedPorts:
+ events += ['UOPS_PORT_5B>=2']
else:
if arch in ['ZEN+', 'ZEN2']:
events = ['FpuPipeAssignment.Total'+str(p) for p in range(0,4)]
@@ -600,29 +625,41 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr
htmlReports.append('\n')
if float(measurementResult['Core cycles']) < -10:
- #something went wrong; this happens for example on HSW with long sequences of JMP instructions
+ # something went wrong; this happens for example on HSW with long sequences of JMP instructions
if debugOutput: print('Core cycles < -10 in getUopsOnBlockedPorts')
- if sum(u for p, u in measurementResult.items() if ('UOPS_PORT' in p or 'FpuPipeAssignment.Total' in p)) < blockInstrRep-.5:
+ if 'UOPS_PORT_5B>=2' in measurementResult:
+ measurementResult['UOPS_PORT_5'] = measurementResult['UOPS_PORT_5B'] - measurementResult['UOPS_PORT_5B>=2']
+ del measurementResult['UOPS_PORT_5B>=2']
+ del measurementResult['UOPS_PORT_5B']
+ elif 'UOPS_PORT_5B' in measurementResult:
+ # in the following, only the sum of the uops on ports 5 and B matters, not how they are distributed
+ measurementResult['UOPS_PORT_5'] = measurementResult['UOPS_PORT_5B']
+ del measurementResult['UOPS_PORT_5B']
+
+ if isIntelCPU():
+ ports_dict = {p[10:]: i for p, i in measurementResult.items() if p.startswith('UOPS_PORT')}
+ else:
+ ports_dict = {p[23:]: i for p, i in measurementResult.items() if 'FpuPipeAssignment.Total' in p}
+
+ if sum(ports_dict.values()) < blockInstrRep-.5:
# something went wrong; fewer uops on ports than blockInstrRep
# happens, e.g., on SKX for ports {0, 1} if AVX-512 is active
return None
- if isIntelCPU():
- ports_dict = {int(p[9:]): i for p, i in measurementResult.items() if p.startswith('UOPS_PORT')}
- else:
- ports_dict = {int(p[23:]): i for p, i in measurementResult.items() if 'FpuPipeAssignment.Total' in p}
-
return int(.2+sum([uops for p, uops in ports_dict.items() if p in blockedPorts])) - blockInstrRep
# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. instrI(')* are the results of
# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep.
def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNotReadRegs=None, doNotWriteRegs=None, initialOffset=0, immediate=2):
+ hasMemOperand = len(instrNode.findall('./operand[@type="mem"]'))>0
if not doNotReadRegs: doNotReadRegs = set()
if not doNotWriteRegs: doNotWriteRegs = set()
doNotReadRegs |= specialRegs
doNotWriteRegs |= globalDoNotWriteRegs|specialRegs
+ if hasMemOperand:
+ doNotWriteRegs |= memRegs
for opNode in instrNode.iter('operand'):
if opNode.attrib['type'] == 'reg':
@@ -647,7 +684,7 @@ def getIndependentInstructions(instrNode, useDistinctRegs, useIndexedAddr, doNot
break
maxMemWidth = 0
- for memNode in instrNode.findall('./operand[@type="mem"][@w="1"]'):
+ for memNode in instrNode.findall('./operand[@type="mem"]'):
maxMemWidth = max(maxMemWidth, int(memNode.attrib.get('width', '0')) // 8)
offset += maxMemWidth
@@ -783,7 +820,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeI
config = TPConfig(independentInstrs, depBreakingInstrs, [], preInstrCode, preInstrNodes)
if re.search('BT.*MEMv_GPRv', iform):
- config.init = list(set('mov ' + regTo64(r) + ', 0' for i in independentInstrs for r in i.readRegs if not regTo64(r) in globalDoNotWriteRegs))
+ config.init = list(set('mov ' + regTo64(r) + ', 0' for i in independentInstrs for r in i.readRegs if not regTo64(r) in globalDoNotWriteRegs|memRegs))
if iform in ['CALL_NEAR_GPRv', 'JMP_GPRv']:
config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RAX'})]
@@ -802,7 +839,7 @@ def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeI
if iform == 'LLDT_GPR16': config.init = list(set('SLDT ' + reg for i in independentInstrs for reg in i.readRegs))
if iform == 'LMSW_GPR16': config.init = list(set('SMSW ' + reg for i in independentInstrs for reg in i.readRegs))
- if iform == 'LMSW_MEMw': config.init = list(['SMSW [R14+'+str(i*64)+']' for i in range(0,maxTPRep)])
+ if iform == 'LMSW_MEMw': config.init = list(['SMSW [R14+'+str(i*2)+']' for i in range(0,maxTPRep)])
if iform == 'MOVDIR64B_GPRa_MEM':
config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RSI'})]
@@ -998,7 +1035,7 @@ def fancyRound(cycles):
TPResult = namedtuple('TPResult', ['TP', 'TP_loop', 'TP_noLoop', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'uops_MITE', 'uops_MS', 'divCycles',
- 'ILD_stalls', 'complexDec', 'nAvailableSimpleDecoders', 'config', 'unblocked_ports'])
+ 'ILD_stalls', 'complexDec', 'nAvailableSimpleDecoders', 'config', 'unblocked_ports', 'all_used_ports'])
# returns TPResult
# port usages are averages (when no ports are blocked by other instructions)
@@ -1010,6 +1047,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
minTP_noLoop = sys.maxsize
minTP_noDepBreaking_noLoop = sys.maxsize
minTP_single = sys.maxsize
+ ports_dict = {}
+ all_used_ports = set()
if useIACA:
config = configs[0] # consider only first config as IACA does not seem to consider different values in registers
@@ -1031,7 +1070,7 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o'])
iaca_out = subprocess.check_output(iacaCMDLine + ['/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT).decode()
except subprocess.CalledProcessError as e:
- logging.warn('Error: ' + e.output.decode())
+ logging.warning('Error: ' + e.output.decode())
if minTP != sys.maxsize:
htmlReports.append('' + e.output.decode() + '
\n')
continue # on SNB, IACA 2.2 crashes on only some (larger) inputs
@@ -1060,13 +1099,15 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
num_ports = re.search('\| Port \|.*', iaca_out).group(0).count('|')-2
- ports_dict = {}
for p in range(0, num_ports):
portCol = ports_line.split('|')[p+2].split()
if portCol:
- ports_dict[p] = float(portCol[0])
+ usage = float(portCol[0])
+ ports_dict[str(p)] = usage
+ if usage > 0:
+ all_used_ports.add(str(p))
else:
- ports_dict[p] = 0.0
+ ports_dict[str(p)] = 0.0
port0 = ports_line.split('|')[2].split()
if len(port0)>1:
@@ -1074,7 +1115,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
else:
divCycles = 0
- return TPResult(minTP, minTP, minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, None, config, ports_dict)
+ return TPResult(minTP, minTP, minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, None, config,
+ ports_dict, all_used_ports)
else:
hasMemWriteOperand = len(instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'))>0
uops = None
@@ -1086,7 +1128,6 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
complexDec = False
# number of other instr. requiring the simple decoder that can be decoded in the same cycle; only applicable for instr. that require the complex decoder
nAvailableSimpleDecoders = None
- ports_dict = {}
for config in configs:
if config.note: htmlReports.append('' + config.note + '
\n')
@@ -1112,8 +1153,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
if ic > 1 and minTP_noLoop < sys.maxsize and minTP_loop < sys.maxsize and minTP_noLoop > 100 and minTP_loop > 100: break
paddingTypes = ['']
- if ((repType != 'unrollOnly') and (uopsMITE is not None) and (not uopsMS) and (math.ceil(32.0/instrLen) * uopsMITE > 18)
- and (not 'RIP' in config.preInstrCode)):
+ if ((repType != 'unrollOnly') and (uopsMITE is not None) and (not uopsMS) and (not 'RIP' in config.preInstrCode)
+ and ((math.ceil(32.0/instrLen) * uopsMITE > 18) or any(imm in instrNode.attrib['string'] for imm in ['I16', 'I32', 'I64']))):
if (instrNode.attrib.get('vex', '') != '') or (instrNode.attrib.get('evex', '') != '') or (instrNode.attrib.get('high8', '') != ''):
paddingTypes.append('long NOPs')
else:
@@ -1182,6 +1223,9 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
minTP_noLoop = min(minTP_noLoop, cycles)
if not useDepBreakingInstrs:
minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles)
+ for p, i in result.items():
+ if (i > .1) and (('UOPS_PORT' in p) or ('FpuPipeAssignment.Total' in p)):
+ all_used_ports.add(p[10:] if ('UOPS_PORT' in p) else p[23:])
else:
minTP_loop = min(minTP_loop, cycles)
@@ -1190,10 +1234,10 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
minTP_single = min(minTP_single, cycles)
if isIntelCPU():
- ports_dict = {int(p[9:]): i for p, i in result.items() if 'UOPS_PORT' in p}
+ ports_dict = {p[10:]: i for p, i in result.items() if 'UOPS_PORT' in p}
elif isAMDCPU() and not instrNode.attrib['extension'] == 'BASE':
# We ignore BASE instructions, as they sometimes wrongly count floating point uops
- ports_dict = {int(p[23:]): i for p, i in result.items() if 'FpuPipeAssignment.Total' in p}
+ ports_dict = {p[23:]: i for p, i in result.items() if 'FpuPipeAssignment.Total' in p}
uops = int(result['UOPS']+.2)
if 'RETIRE_SLOTS' in result:
@@ -1209,18 +1253,18 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
ILD_stalls = int(result['ILD_STALL.LCP'])
if (not config.preInstrCode) and (((uopsMITE is not None) and (uopsMITE > 1)) or ((uopsMS is not None) and (uopsMS > 0)) or
- (result.get('INST_DECODED.DEC0', 0) > .05) or ((result.get('UOPS_MITE>0', 0) > .95) and (not isBranchInstr(instrNode)))):
+ (result.get('INST_DECODED.DEC0', 0) > .05) or ((result.get('UOPS_MITE>=1', 0) > .95) and (not isBranchInstr(instrNode)))):
# ToDo: preInstrs
complexDec = True
- if complexDec and ('UOPS_MITE>0' in result):
+ if complexDec and ('UOPS_MITE>=1' in result):
for nNops in count(1):
nopStr = str(nNops) + ' NOP' + ('s' if nNops > 1 else '')
htmlReports.append('With unroll_count=' + str(unrollCount) +', no inner loop, and ' + nopStr + '
\n')
htmlReports.append('\n')
resultNops = runExperiment(instrNode, instrStr + ('; nop' * nNops), init=init, unrollCount=unrollCount, htmlReports=htmlReports)
htmlReports.append('
\n')
- if resultNops['UOPS_MITE>0'] > result['UOPS_MITE>0'] +.95:
+ if resultNops['UOPS_MITE>=1'] > result['UOPS_MITE>=1'] +.95:
nAvailableSimpleDecoders = nNops - 1
break
else:
@@ -1232,7 +1276,7 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
if minTP < sys.maxsize:
return TPResult(minTP, minTP_loop, minTP_noLoop, minTP_noDepBreaking_noLoop, minTP_single, uops, uopsFused, uopsMITE, uopsMS, divCycles, ILD_stalls,
- complexDec, nAvailableSimpleDecoders, minConfig, ports_dict)
+ complexDec, nAvailableSimpleDecoders, minConfig, ports_dict, all_used_ports)
def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports):
@@ -1256,87 +1300,91 @@ def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports):
basicLatency = {}
def getBasicLatencies(instrNodeList):
+ andResult = runExperiment(instrNodeDict['AND_21 (R64, R64)'], 'AND RAX, RBX')
+ basicLatency['AND'] = int(andResult['Core cycles'] + .2)
+
+ orResult = runExperiment(instrNodeDict['OR_09 (R64, R64)'], 'OR RAX, RBX')
+ basicLatency['OR'] = int(orResult['Core cycles'] + .2)
+
+ xorResult = runExperiment(instrNodeDict['XOR_31 (R64, R64)'], 'XOR RAX, RBX')
+ basicLatency['XOR'] = int(xorResult['Core cycles'] + .2)
+
+ movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH')
+ basicLatency['MOV_R8h_R8h'] = int(movR8hR8hResult['Core cycles'] + .2)
+
+ movR8hResult = runExperiment(None, 'MOV AH, AL')
+ basicLatency['MOV_R8h_R8l'] = max(1, int(movR8hResult['Core cycles'] + .2))
+
for t in [16, 32, 64]:
for s in [8,16,32]:
if s >= t:
continue
movsxResult = runExperiment(None, 'MOVSX {}, {}'.format(regToSize('RAX', t), regToSize('RAX', s)))
- basicLatency['MOVSX_R{}_R{}'.format(t,s)] = int(round(movsxResult['Core cycles']))
+ basicLatency['MOVSX_R{}_R{}'.format(t,s)] = int(movsxResult['Core cycles'] + .2)
if t < 64:
- movsxResult = runExperiment(None, 'MOVSX {}, ah'.format(regToSize('RAX', t)))
- basicLatency['MOVSX_R{}_R8h'.format(t)] = int(round(movsxResult['Core cycles']))
-
- movR8hResult = runExperiment(None, 'MOV AH, AL')
- basicLatency['MOV_R8h_R8l'] = max(1, int(round(movR8hResult['Core cycles'])))
-
- movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH')
- basicLatency['MOV_R8h_R8h'] = int(round(movR8hR8hResult['Core cycles']))
-
- andResult = runExperiment(instrNodeDict['AND_21 (R64, R64)'], 'AND RAX, RBX')
- basicLatency['AND'] = int(round(andResult['Core cycles']))
-
- orResult = runExperiment(instrNodeDict['OR_09 (R64, R64)'], 'OR RAX, RBX')
- basicLatency['OR'] = int(round(orResult['Core cycles']))
-
- xorResult = runExperiment(instrNodeDict['XOR_31 (R64, R64)'], 'XOR RAX, RBX')
- basicLatency['XOR'] = int(round(xorResult['Core cycles']))
+ movsxResult = runExperiment(None, 'MOVSX {}, AH; MOV AH, BL'.format(regToSize('RBX', t)))
+ basicLatency['MOVSX_R{}_R8h'.format(t)] = int(movsxResult['Core cycles'] + .2) - basicLatency['MOV_R8h_R8l']
cmcResult = runExperiment(instrNodeDict['CMC'], 'CMC')
- basicLatency['CMC'] = int(round(cmcResult['Core cycles']))
+ basicLatency['CMC'] = int(cmcResult['Core cycles'] + .2)
movqResult = runExperiment(instrNodeDict['MOVQ_0F6F (MM, MM)'], 'MOVQ MM0, MM0')
- basicLatency['MOVQ'] = int(round(movqResult['Core cycles']))
+ basicLatency['MOVQ'] = int(movqResult['Core cycles'] + .2)
for flag in STATUSFLAGS_noAF:
testSetResult = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL')
- testSetCycles = int(round(testSetResult['Core cycles']))
- if not testSetCycles == 2:
+ # we additionally test with a nop, as the result may be higher than the actual latency (e.g., on ADL-P), probably due to non-optimal port assignments
+ testSetResultNop = runExperiment(None, 'TEST AL, AL; SET' + flag[0] + ' AL; NOP')
+ testSetCycles = min(int(testSetResult['Core cycles'] + .2), int(testSetResultNop['Core cycles'] + .2))
+
+ if (not testSetCycles == 2):
print('Latencies of TEST and SET' + flag[0] + ' must be 1')
sys.exit()
basicLatency['SET' + flag[0]] = 1
basicLatency['TEST'] = 1
testSetHigh8Result = runExperiment(None, 'TEST AH, AH; SET' + flag[0] + ' AH')
- testSetHigh8Cycles = int(round(testSetHigh8Result['Core cycles']))
+ testSetHigh8Cycles = int(testSetHigh8Result['Core cycles'] + .2)
if testSetHigh8Cycles == 2:
basicLatency['SET' + flag[0] + '_R8h'] = 1
basicLatency['TEST_R8h_R8h'] = 1
testCmovResult = runExperiment(None, 'TEST RAX, RAX; CMOV' + flag[0] + ' RAX, RAX')
- basicLatency['CMOV' + flag[0]] = int(round(testCmovResult['Core cycles'])) - 1
+ testCmovResultNop = runExperiment(None, 'TEST RAX, RAX; CMOV' + flag[0] + ' RAX, RAX; NOP')
+ basicLatency['CMOV' + flag[0]] = min(int(testCmovResult['Core cycles'] + .2), int(testCmovResultNop['Core cycles'] + .2)) - 1
for instr in ['ANDPS', 'ANDPD', 'ORPS', 'ORPD', 'PAND', 'POR']:
result = runExperiment(instrNodeDict[instr + ' (XMM, XMM)'], instr + ' XMM1, XMM1')
- basicLatency[instr] = int(round(result['Core cycles']))
+ basicLatency[instr] = int(result['Core cycles'] + .2)
for instr in ['PSHUFD', 'SHUFPD']:
result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')
- basicLatency[instr] = int(round(result['Core cycles']))
+ basicLatency[instr] = int(result['Core cycles'] + .2)
if any(x for x in instrNodeList if x.findall('[@iclass="VANDPS"]')):
for instr in ['VANDPS', 'VANDPD', 'VORPS', 'VORPD', 'VPAND', 'VPOR']:
result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, XMM)'], instr + ' XMM1, XMM1, XMM1')
- basicLatency[instr] = int(round(result['Core cycles']))
+ basicLatency[instr] = int(result['Core cycles'] + .2)
for instr in ['VSHUFPD']:
result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, XMM, I8)'], instr + ' XMM1, XMM1, XMM1, 0')
- basicLatency[instr] = int(round(result['Core cycles']))
+ basicLatency[instr] = int(result['Core cycles'] + .2)
for instr in ['VPSHUFD']:
result = runExperiment(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')
- basicLatency[instr] = int(round(result['Core cycles']))
+ basicLatency[instr] = int(result['Core cycles'] + .2)
if any(x for x in instrNodeList if x.findall('[@extension="AVX512EVEX"]')):
kmovq_result = runExperiment(instrNodeDict['KMOVQ (K, K)'], 'KMOVQ K1, K1')
- basicLatency['KMOVQ'] = int(round(kmovq_result['Core cycles']))
+ basicLatency['KMOVQ'] = int(kmovq_result['Core cycles'] + .2)
vpandd_result = runExperiment(instrNodeDict['VPANDD (ZMM, ZMM, ZMM)'], 'VPANDD ZMM0, ZMM0, ZMM0')
- basicLatency['VPANDD'] = int(round(vpandd_result['Core cycles']))
+ basicLatency['VPANDD'] = int(vpandd_result['Core cycles'] + .2)
for regType in ['XMM', 'YMM', 'ZMM']:
vmovups_result = runExperiment(instrNodeDict['VMOVUPS ({0}, K, {0})'.format(regType)], 'VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
- vmovups_cycles = int(round(vmovups_result['Core cycles']))
- vmovups_uops = int(round(vmovups_result['UOPS']))
+ vmovups_cycles = int(vmovups_result['Core cycles'] + .2)
+ vmovups_uops = int(vmovups_result['UOPS'] + .2)
basicLatency['VMOVUPS_' + regType + '_' + 'K'] = vmovups_cycles
if not vmovups_uops == 1:
@@ -1345,16 +1393,16 @@ def getBasicLatencies(instrNodeList):
vpmovq2m_result = runExperiment(instrNodeDict['VPMOVQ2M (K, ' + regType + ')'],
'VPMOVQ2M K1, ' + regType + '1; VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
- basicLatency['VPMOVQ2M_'+regType] = int(round(vpmovq2m_result['Core cycles'])) - vmovups_cycles
+ basicLatency['VPMOVQ2M_'+regType] = int(vpmovq2m_result['Core cycles'] + .2) - vmovups_cycles
vptestnmq_result = runExperiment(instrNodeDict['VPTESTNMQ (K, K, {0}, {0})'.format(regType)],
'VPTESTNMQ K1 {K1}, ' + regType + '1, ' + regType + '1; VMOVUPS ' + regType + '1 {k1}, ' + regType + '1')
- basicLatency['VPTESTNMQ_'+regType] = int(round(vptestnmq_result['Core cycles'])) - vmovups_cycles
+ basicLatency['VPTESTNMQ_'+regType] = int(vptestnmq_result['Core cycles'] + .2) - vmovups_cycles
for memWidth in [8, 16, 32, 64]:
reg = regToSize('R12', memWidth)
mov_10movsx_mov_result = runExperiment(None, 'mov ' + reg + ', [r14];' + ';'.join(10*['MOVSX R12, R12w']) + '; mov [r14], ' + reg , unrollCount=100)
- basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] = int(round(mov_10movsx_mov_result['Core cycles']))
+ basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] = int(mov_10movsx_mov_result['Core cycles'] + .2)
print('Basic Latencies: ' + str(basicLatency))
@@ -1373,7 +1421,7 @@ def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None):
reg = opNode.text
regPrefix = re.sub('\d', '', reg)
if reg in GPRegs:
- if reg not in globalDoNotWriteRegs:
+ if reg not in globalDoNotWriteRegs|memRegs:
depBreakingInstrs[opNode] = 'MOV ' + reg + ', 0' # don't use XOR as this would also break flag dependencies
elif reg in ['RSP', 'RBP']:
depBreakingInstrs[opNode] = 'MOV ' + reg + ', R14'
@@ -1416,7 +1464,7 @@ def getDependencyBreakingInstrsForSuppressedOperands(instrNode):
reg = opNode.text
if not reg in GPRegs: continue
- if reg in globalDoNotWriteRegs|specialRegs: continue
+ if reg in globalDoNotWriteRegs|specialRegs|memRegs: continue
writeOfRegFound = False
for opNode2 in instrNode.findall('./operand[@type="reg"][@w="1"]'):
@@ -1975,7 +2023,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
#################
# reg -> ...
#################
- regs1 = set(startNode.text.split(","))-globalDoNotWriteRegs-specialRegs
+ regs1 = set(startNode.text.split(","))-globalDoNotWriteRegs-specialRegs-memRegs
if not regs1: return None
@@ -1983,7 +2031,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
#################
# reg -> reg
#################
- regs2 = set(targetNode.text.split(","))-globalDoNotWriteRegs-specialRegs
+ regs2 = set(targetNode.text.split(","))-globalDoNotWriteRegs-specialRegs-memRegs
if not regs2:
return None
@@ -2185,7 +2233,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
#################
# flags -> reg
#################
- regs = set(targetNode.text.split(','))-globalDoNotWriteRegs-specialRegs
+ regs = set(targetNode.text.split(','))-globalDoNotWriteRegs-specialRegs-memRegs
if not regs: return None
reg = sortRegs(regs)[0]
@@ -2251,7 +2299,7 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
# mem -> reg
#################
regs = set(targetNode.text.split(","))
- if not suppressedTarget: regs -= globalDoNotWriteRegs | specialRegs
+ if not suppressedTarget: regs -= globalDoNotWriteRegs | specialRegs | memRegs
if not regs: return None
reg = sortRegs(regs)[0]
regSize = getRegSize(reg)
@@ -2506,7 +2554,7 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
print('readOnlyRegOpNodeIdx not found in opRegDict')
continue
reg = latConfig.instrI.opRegDict[readOnlyRegOpNodeIdx]
- if (not reg in GPRegs) or (reg in High8Regs) or (reg in globalDoNotWriteRegs) or (reg in specialRegs): continue
+ if (not reg in GPRegs) or (reg in High8Regs) or (reg in globalDoNotWriteRegs|specialRegs|memRegs): continue
if any((opNode is not None) for opNode in instrNode.findall('./operand[@type="reg"][@w="1"]')
if regTo64(latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]) == regTo64(reg)): continue
@@ -2526,7 +2574,7 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
for opNode in instrNode.findall('./operand[@r="1"][@type="reg"]'):
reg = latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]
regPrefix = re.sub('\d', '', reg)
- if (regPrefix in ['XMM', 'YMM', 'ZMM']) and (reg not in globalDoNotWriteRegs):
+ if (regPrefix in ['XMM', 'YMM', 'ZMM']) and (reg not in globalDoNotWriteRegs|memRegs):
for initOp in instrNode.findall('./operand[@w="1"][@type="reg"]'):
if initOp.text != opNode.text: continue
regInit += [getInstrInstanceFromNode(instrNode, opRegDict={int(initOp.attrib['idx']):reg}, computeRegMemInit=False).asm]
@@ -2690,7 +2738,7 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
def isSSEInstr(instrNode):
extension = instrNode.attrib['extension']
- return 'SSE' in extension or extension in ['AES']
+ return ('SSE' in extension) or (('XMM' in instrNode.attrib['string']) and not isAVXInstr(instrNode))
def isAVXInstr(instrNode):
return ('vex' in instrNode.attrib or 'evex' in instrNode.attrib)
@@ -2733,7 +2781,7 @@ def filterInstructions(XMLRoot):
isaSet = XMLInstr.attrib['isa-set']
# Future instruction set extensions
- if extension in ['AMD_INVLPGB', 'CET', 'KEYLOCKER', 'KEYLOCKER_WIDE', 'RDPRU', 'SERIALIZE', 'TDX', 'TSX_LDTRK']: instrSet.discard(XMLInstr)
+ if extension in ['AMD_INVLPGB', 'CET', 'KEYLOCKER', 'KEYLOCKER_WIDE', 'RDPRU', 'TDX', 'TSX_LDTRK']: instrSet.discard(XMLInstr)
# Not supported by assembler
if XMLInstr.attrib['iclass'] == 'NOP' and len(XMLInstr.findall('operand')) > 1:
@@ -2822,17 +2870,21 @@ def filterInstructions(XMLRoot):
if extension == 'PKU' and not cpuid.get_bit(ecx7, 4): instrSet.discard(XMLInstr)
if extension == 'WAITPKG' and not cpuid.get_bit(ecx7, 5): instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_VBMI2') and not cpuid.get_bit(ecx7, 6): instrSet.discard(XMLInstr)
- if category == 'GFNI' and not cpuid.get_bit(ecx7, 8): instrSet.discard(XMLInstr)
+ if category == 'GFNI':
+ if not cpuid.get_bit(ecx7, 8):
+ instrSet.discard(XMLInstr)
+ elif 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31):
+ instrSet.discard(XMLInstr)
if 'VAES' in isaSet:
if not cpuid.get_bit(ecx7, 9):
instrSet.discard(XMLInstr)
- else:
- if 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31): instrSet.discard(XMLInstr)
+ elif 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31):
+ instrSet.discard(XMLInstr)
if 'VPCLMULQDQ' in isaSet:
if not cpuid.get_bit(ecx7, 10):
instrSet.discard(XMLInstr)
- else:
- if 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31): instrSet.discard(XMLInstr)
+ elif 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31):
+ instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_VNNI') and not cpuid.get_bit(ecx7, 11): instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_BITALG') and not cpuid.get_bit(ecx7, 12): instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_VPOPCNTDQ') and not cpuid.get_bit(ecx7, 14): instrSet.discard(XMLInstr)
@@ -2845,6 +2897,7 @@ def filterInstructions(XMLRoot):
if isaSet.startswith('AVX512_4FMAPS') and not cpuid.get_bit(edx7, 3): instrSet.discard(XMLInstr)
if extension == 'UINTR' and not cpuid.get_bit(edx7, 5): instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_VP2INTERSECT') and not cpuid.get_bit(edx7, 8): instrSet.discard(XMLInstr)
+ if extension == 'SERIALIZE' and not cpuid.get_bit(edx7, 14): instrSet.discard(XMLInstr)
if extension == 'PCONFIG' and not cpuid.get_bit(edx7, 18): instrSet.discard(XMLInstr)
if extension == 'AMX_BF16' and not cpuid.get_bit(edx7, 22): instrSet.discard(XMLInstr)
if isaSet.startswith('AVX512_FP16') and not cpuid.get_bit(edx7, 23): instrSet.discard(XMLInstr)
@@ -2877,12 +2930,15 @@ def filterInstructions(XMLRoot):
# Transactional Synchronization Extensions
if extension in ['RTM']: instrSet.discard(XMLInstr)
+ # WAITPKG
+ if extension in ['WAITPKG']: instrSet.discard(XMLInstr)
+
# X87 instructions:
if extension in ['X87']: instrSet.discard(XMLInstr)
if XMLInstr.attrib['category'] in ['X87_ALU']: instrSet.discard(XMLInstr)
# System instructions
- if extension in ['INVPCID', 'MONITOR', 'MONITORX', 'RDWRFSGS', 'SMAP', 'SNP']: instrSet.discard(XMLInstr)
+ if extension in ['HRESET', 'INVPCID', 'MONITOR', 'MONITORX', 'PCONFIG', 'RDWRFSGS', 'SMAP', 'SNP']: instrSet.discard(XMLInstr)
if XMLInstr.attrib['category'] in ['INTERRUPT', 'SEGOP', 'SYSCALL', 'SYSRET']: instrSet.discard(XMLInstr)
if XMLInstr.attrib['iclass'] in ['CALL_FAR', 'HLT', 'INVD', 'IRET', 'IRETD', 'IRETQ', 'JMP_FAR', 'LTR', 'RET_FAR', 'UD2']: instrSet.discard(XMLInstr)
if 'XRSTOR' in XMLInstr.attrib['iclass']: instrSet.discard(XMLInstr)
@@ -2944,9 +3000,9 @@ def main():
configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3',
'FpuPipeAssignment.Total4', 'FpuPipeAssignment.Total5', 'DIV_CYCLES'])
else:
- configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT0', 'UOPS_PORT1', 'UOPS_PORT2', 'UOPS_PORT3', 'UOPS_PORT4', 'UOPS_PORT5',
- 'UOPS_PORT6', 'UOPS_PORT7', 'UOPS_PORT23', 'UOPS_PORT49', 'UOPS_PORT78', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0',
- 'UOPS_MITE>0'])
+ configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT_0', 'UOPS_PORT_1', 'UOPS_PORT_2', 'UOPS_PORT_3', 'UOPS_PORT_4',
+ 'UOPS_PORT_5', 'UOPS_PORT_6', 'UOPS_PORT_7', 'UOPS_PORT_23', 'UOPS_PORT_49', 'UOPS_PORT_78', 'UOPS_PORT_5B', 'UOPS_PORT_5B>=2',
+ 'UOPS_PORT_23A', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0', 'UOPS_MITE>=1'])
try:
subprocess.check_output('mkdir -p /tmp/ramdisk; sudo mount -t tmpfs -o size=100M none /tmp/ramdisk/', shell=True)
@@ -3001,7 +3057,7 @@ def main():
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
else:
for i, instrNode in enumerate(instrNodeList):
- #if not 'RCR (R64, 1)' in instrNode.attrib['string']: continue
+ #if not 'ROR_4 (R8l, I8)' in instrNode.attrib['string']: continue
print('Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')')
htmlReports = ['' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '
\n
\n']
@@ -3049,11 +3105,9 @@ def main():
if tp: tpDictNoInteriteration[instrNode] = tp
if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
- with open('tp_' + arch + '.pickle', 'wb') as f:
+ with open('/tmp/tp_' + arch + '.pickle', 'wb') as f:
pickle.dump((tpDict, tpDictSameReg, tpDictIndexedAddr, tpDictNoInteriteration), f)
- num_ports = len(list(tpDict.values())[0].unblocked_ports)
-
########################
# Latency
########################
@@ -3079,7 +3133,7 @@ def main():
if debugOutput: print(instrNode.attrib['iform'] + ': ' + str(lat))
latencyDict[instrNode] = lat
writeHtmlFile('html-lat/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
- with open('lat_' + arch + '.pickle', 'wb') as f:
+ with open('/tmp/lat_' + arch + '.pickle', 'wb') as f:
pickle.dump(latencyDict, f)
########################
@@ -3095,6 +3149,22 @@ def main():
portCombinationsResultDictIndexedAddr = {}
if not args.noPorts:
+ for instr, tpResult in tpDict.items():
+ up = tpResult.unblocked_ports
+ usedPorts = tpResult.all_used_ports
+ if '5B>=2' in up:
+ if '5B' in usedPorts:
+ usedPorts.add('5')
+ if '5B>=2' not in usedPorts:
+ up['5'] = up['5B']
+ else:
+ up['5'] = up['B'] = up['5B'] / 2
+ usedPorts.add('B')
+ up.pop('5B', None)
+ up.pop('5B>=2', None)
+ usedPorts.discard('5B')
+ usedPorts.discard('5B>=2')
+
# iforms of instructions that are potentially zero-latency instructions
# we consider all MOVZX instructions to be potentially zero-latency instr.; the descr. in the manual is not accurate as, e.g., MOVZX RSI, CL can be
# eliminated, but MOVZX RSI, DIL cannot (at least on Coffee Lake)
@@ -3147,7 +3217,7 @@ def main():
blockingInstructionsDictNonSSE_set = {}
for instrNode in oneUopInstrs:
- usedPorts = frozenset({p for p, x in tpDict[instrNode].unblocked_ports.items() if x>0.1})
+ usedPorts = frozenset(tpDict[instrNode].all_used_ports)
if usedPorts:
print(instrNode.attrib['iform'] + ': ' + str(usedPorts) + ' ' + str(len(instrNode.findall('./operand[@suppressed="1"]'))))
@@ -3174,23 +3244,23 @@ def main():
# mov to mem has always two uops: store address and store data; there is no instruction that uses just one of them
movMemInstrNode = instrNodeDict['MOV (M64, R64)']
- if arch in ['ICL', 'TGL', 'RKL']:
- storeDataPort = 49
+ if arch in ['ICL', 'TGL', 'RKL', 'ADL-P']:
+ storeDataPort = '49'
else:
- storeDataPort = 4
+ storeDataPort = '4'
blockingInstructionsDictNonAVX[frozenset({storeDataPort})] = movMemInstrNode
blockingInstructionsDictNonSSE[frozenset({storeDataPort})] = movMemInstrNode
- storeAddressPorts = frozenset({p for p, x in tpDict[movMemInstrNode].unblocked_ports.items() if x>=0.1 and not p == storeDataPort})
+ storeAddressPorts = frozenset({p for p in tpDict[movMemInstrNode].all_used_ports if not p == storeDataPort})
if storeAddressPorts not in blockingInstructionsDictNonAVX: blockingInstructionsDictNonAVX[storeAddressPorts] = movMemInstrNode
if storeAddressPorts not in blockingInstructionsDictNonSSE: blockingInstructionsDictNonSSE[storeAddressPorts] = movMemInstrNode
print('Non-AVX:')
for k,v in blockingInstructionsDictNonAVX.items():
- print(str(k) + ': ' + v.attrib['iform'])
+ print(str(k) + ': ' + v.attrib['string'])
print('Non-SSE:')
for k,v in blockingInstructionsDictNonSSE.items():
- print(str(k) + ': ' + v.attrib['iform'])
+ print(str(k) + ': ' + v.attrib['string'])
sortedPortCombinationsNonAVX = sorted(blockingInstructionsDictNonAVX.keys(), key=lambda x:(len(x), sorted(x)))
sortedPortCombinationsNonSSE = sorted(blockingInstructionsDictNonSSE.keys(), key=lambda x:(len(x), sorted(x)))
@@ -3221,8 +3291,7 @@ def main():
if not useIACA and tpResult.config.preInstrNodes:
rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes)
- # use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen
- used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05}
+ used_ports = tpResult.all_used_ports
if debugOutput: print(instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports))
if not isAVXInstr(instrNode):
@@ -3238,8 +3307,11 @@ def main():
htmlReports.append('No uops')
elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (tpResult.ILD_stalls in [None, 0]):
# one uop instruction
- uopsCombinationList = [(frozenset(used_ports), 1)]
- htmlReports.append('
Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports))
+ for combination in sortedPortCombinations:
+ if used_ports.issubset(combination):
+ uopsCombinationList = [(combination, 1)]
+ htmlReports.append('
Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in combination))
+ break
elif (rem_uops > 0) and (arch not in ['ZEN+', 'ZEN2']):
for combination in sortedPortCombinations:
if not combination.intersection(used_ports): continue
@@ -3257,7 +3329,7 @@ def main():
if tpResult.config.preInstrNodes:
for preInstrNode in tpResult.config.preInstrNodes:
for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]:
- if pre_comb.issubset(frozenset(''.join(map(str,combination)))):
+ if pre_comb.issubset(frozenset(''.join(combination))):
prevUopsOnCombination += pre_uops
nPortsInComb = sum(len(str(x)) for x in combination)
@@ -3265,7 +3337,7 @@ def main():
blockInstrRep = min(blockInstrRep, 100)
uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports)
if uopsOnBlockedPorts is None:
- print('no uops on blocked ports: ' + str(combination))
+ #print('no uops on blocked ports: ' + str(combination))
continue
uopsOnBlockedPorts -= prevUopsOnCombination
@@ -3286,7 +3358,7 @@ def main():
((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') +
' that can only use port' +
('s {' if len(combination)>1 else ' ') +
- str(list(combination))[1:-1] +
+ str(sorted(combination))[1:-1] +
('}' if len(combination)>1 else '') + '')
rem_uops -= uopsOnBlockedPorts
@@ -3294,7 +3366,7 @@ def main():
# on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports
# we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate
- uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList]
+ uopsCombinationList = [(frozenset(''.join(comb)), uops) for comb, uops in uopsCombinationList]
if not useDistinctRegs:
portCombinationsResultDictSameReg[instrNode] = uopsCombinationList
@@ -3303,10 +3375,8 @@ def main():
else:
portCombinationsResultDict[instrNode] = uopsCombinationList
-
writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports))
-
########################
# Write XML File
########################
@@ -3374,7 +3444,7 @@ def main():
if divCycles: resultNode.attrib['div_cycles'+suffix] = str(divCycles)
portPrefix = ('p' if isIntelCPU() else 'FP')
- computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
+ computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(p for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0])))
if portUsageList:
resultNode.attrib['ports'+suffix] = computePortStr(portUsageList)
try:
diff --git a/tools/cpuBench/simpleHTMLTable.py b/tools/cpuBench/simpleHTMLTable.py
new file mode 100755
index 0000000..33ae905
--- /dev/null
+++ b/tools/cpuBench/simpleHTMLTable.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+from sys import maxsize
+import xml.etree.ElementTree as ET
+import argparse
+from utils import *
+
+def getLink(instrNode, text, arch, tool, linkType, anchor=None):
+ url = '/tmp/html-' + linkType + '/' + arch + '/' + canonicalizeInstrString(instrNode.attrib['string']) + '-' + tool + '.html'
+ if anchor: url += '#' + anchor
+ return '' + text + ''
+
+def main():
+ parser = argparse.ArgumentParser(description='Generates a basic HTML table with the results for a microarchitecture')
+ parser.add_argument("-input", help="Input XML file", default='result.xml')
+ parser.add_argument("-arch", help="Consider only this architecture")
+ args = parser.parse_args()
+
+ root = ET.parse(args.input)
+
+ TPSame = 0
+ TPDiff = 0
+
+ with open('instructions.html', "w") as f:
+ f.write('\n'
+ '\n'
+ 'Instructions\n'
+ '\n'
+ '\n'
+ '\n')
+
+ for XMLExtension in root.iter('extension'):
+ if not XMLExtension.findall('.//measurement'):
+ continue
+
+ f.write('' + XMLExtension.attrib['name'] + '
\n'
+ '\n'
+ ' \n'
+ ' | \n'
+ ' Lat | \n'
+ ' TP (ports) | \n'
+ ' TP (m) | \n'
+ ' uops | \n'
+ ' Ports | \n'
+ '
\n')
+
+ for XMLInstr in sorted(XMLExtension.findall('./instruction'), key=lambda x: x.attrib['string']):
+ for resultNode in XMLInstr.findall('./architecture[@name="' + args.arch + '"]/measurement'):
+ f.write(' \n')
+ f.write(' | ' + XMLInstr.attrib['string'] + ' | \n')
+
+ lat = ''
+ latTableEntry = getLatencyTableEntry(resultNode)
+ if latTableEntry is not None:
+ lat = str(latTableEntry[0])
+ f.write(' ' + getLink(XMLInstr, lat, args.arch, 'Measurements', 'lat') + ' | \n')
+
+ TPPorts = float(resultNode.attrib.get('TP_ports', float("inf")))
+ TPPortsStr = ("{:.2f}".format(TPPorts) if TPPorts < float("inf") else '')
+ f.write(' ' + TPPortsStr + ' | \n')
+
+ TPMeasured = min(float(resultNode.attrib.get('TP_loop', float("inf"))), float(resultNode.attrib.get('TP_unrolled', float("inf"))))
+ TPMeasuredStr = ("{:.2f}".format(TPMeasured) if TPMeasured < float("inf") else '')
+
+ uopsMS = int(resultNode.attrib.get('uops_MS', sys.maxsize))
+
+ color = ''
+ if TPPortsStr and TPMeasuredStr and (uopsMS == 0):
+ if abs(TPMeasured - TPPorts) < .02:
+ color = ' bgcolor="green"'
+ TPSame += 1
+ else:
+ color = ' bgcolor="orange"'
+ TPDiff += 1
+
+ f.write(' ' + getLink(XMLInstr, TPMeasuredStr, args.arch, 'Measurements', 'tp') + ' | \n')
+
+ f.write(' ' + resultNode.attrib.get('uops', '') + ' | \n')
+ f.write(' ' + getLink(XMLInstr, resultNode.attrib.get('ports', ''), args.arch, 'Measurements', 'ports') + ' | \n')
+ f.write('
\n')
+
+ f.write('
\n')
+
+ f.write('\n')
+ f.write('\n')
+
+ print('TPSame: ' + str(TPSame))
+ print('TPDiff: ' + str(TPDiff))
+ print('Result written to instructions.html')
+
+if __name__ == "__main__":
+ main()