support for Cascade Lake

This commit is contained in:
Andreas Abel
2020-10-27 21:48:46 +01:00
parent c3879089b4
commit bf0906750b
3 changed files with 138 additions and 90 deletions

View File

@@ -204,7 +204,10 @@ def micro_arch(cpu):
if (vi.displ_family, vi.displ_model) in [(0x06, 0x4E), (0x06, 0x5E)]: if (vi.displ_family, vi.displ_model) in [(0x06, 0x4E), (0x06, 0x5E)]:
return 'SKL' return 'SKL'
if (vi.displ_family, vi.displ_model) in [(0x06, 0x55)]: if (vi.displ_family, vi.displ_model) in [(0x06, 0x55)]:
return 'SKX' if vi.stepping <= 0x4:
return 'SKX'
else:
return 'CLX'
if (vi.displ_family, vi.displ_model) in [(0x06, 0x8E), (0x06, 0x9E)]: if (vi.displ_family, vi.displ_model) in [(0x06, 0x8E), (0x06, 0x9E)]:
# ToDo: not sure if this is correct # ToDo: not sure if this is correct
if vi.stepping <= 0x9: if vi.stepping <= 0x9:

View File

@@ -2,6 +2,7 @@
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from xml.dom import minidom from xml.dom import minidom
import argparse import argparse
import sys
# Shows the differences between two XML files for a specific microarchitecture # Shows the differences between two XML files for a specific microarchitecture
def main(): def main():
@@ -34,11 +35,12 @@ def main():
for mNode1 in instrNode1.findall('./architecture[@name="' + args.arch1 + '"]/measurement'): for mNode1 in instrNode1.findall('./architecture[@name="' + args.arch1 + '"]/measurement'):
for mNode2 in instrNode2.findall('./architecture[@name="' + args.arch2 + '"]/measurement'): for mNode2 in instrNode2.findall('./architecture[@name="' + args.arch2 + '"]/measurement'):
if args.TP: if args.TP:
tp1 = mNode1.attrib['TP'] tp1 = min(map(float, [mNode1.attrib.get('TP_unrolled', sys.maxsize), mNode1.attrib.get('TP_loop', sys.maxsize), mNode1.attrib.get('TP', sys.maxsize)]))
tp2 = mNode2.attrib['TP'] tp2 = min(map(float, [mNode2.attrib.get('TP_unrolled', sys.maxsize), mNode2.attrib.get('TP_loop', sys.maxsize), mNode2.attrib.get('TP', sys.maxsize)]))
if tp1 != tp2: if tp1 != tp2:
tpDiff += 1 tpDiff += 1
print instrStr + ' - TP1: ' + tp1 + ' - TP2: ' + tp2 print instrStr + ' - TP1: ' + str(tp1) + ' - TP2: ' + str(tp2)
if args.lat: if args.lat:
for latNode1, latNode2 in zip(mNode1.findall('./latency'), mNode2.findall('./latency')): for latNode1, latNode2 in zip(mNode1.findall('./latency'), mNode2.findall('./latency')):

View File

@@ -102,7 +102,7 @@ def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr):
# we use vzeroall instead of just vzeroupper to make sure that XMM14 is 0 for VSIB addressing # we use vzeroall instead of just vzeroupper to make sure that XMM14 is 0 for VSIB addressing
init += ['VZEROALL'] init += ['VZEROALL']
if not 'DIV' in instrNode.attrib['iclass'] and not 'SQRT' in instrNode.attrib['iclass']: if not isDivOrSqrtInstr(instrNode):
for opNode in instrNode.findall('./operand[@r="1"]'): for opNode in instrNode.findall('./operand[@r="1"]'):
opIdx = int(opNode.attrib['idx']) opIdx = int(opNode.attrib['idx'])
xtype = opNode.attrib.get('xtype', '') xtype = opNode.attrib.get('xtype', '')
@@ -120,14 +120,7 @@ def getRegMemInit(instrNode, opRegDict, memOffset, useIndexedAddr):
else: else:
init += ['MOVUPD ' + reg + ', [R14]'] init += ['MOVUPD ' + reg + ', [R14]']
elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode): elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode):
# some AVX instr. (e.g. VORPS, VAESDEC) incur a penalty (?) if a source was not written by an AVX instr. of a similar kind init += ['VXORPS '+reg+', '+reg+', '+reg]
if reg not in globalDoNotWriteRegs:
for opNode2 in instrNode.findall('./operand[@w="1"]'):
if not opNode2.text == opNode.text: continue
init += [getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):reg}, computeRegMemInit=False).asm]
break
else:
init += ['VXORPS '+reg+', '+reg+', '+reg]
elif 'MM' in regPrefix: elif 'MM' in regPrefix:
init += ['PXOR '+reg+', '+reg] init += ['PXOR '+reg+', '+reg]
elif opNode.attrib['type'] == 'mem': elif opNode.attrib['type'] == 'mem':
@@ -202,7 +195,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
if arch in ['CON', 'WOL']: evt = 'RS_UOPS_DISPATCHED' if arch in ['CON', 'WOL']: evt = 'RS_UOPS_DISPATCHED'
elif arch in ['NHM', 'WSM']: evt = 'UOPS_RETIRED.ANY' elif arch in ['NHM', 'WSM']: evt = 'UOPS_RETIRED.ANY'
elif arch in ['SNB', 'IVB', 'HSW', 'BDW']: evt = 'UOPS_RETIRED.ALL' elif arch in ['SNB', 'IVB', 'HSW', 'BDW']: evt = 'UOPS_RETIRED.ALL'
elif arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: evt = 'UOPS_EXECUTED.THREAD' elif arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: evt = 'UOPS_EXECUTED.THREAD'
localHtmlReports.append('<li>' + evt + ': ' + str(value) + '</li>\n') localHtmlReports.append('<li>' + evt + ': ' + str(value) + '</li>\n')
localHtmlReports.append('</ul>\n</li>') localHtmlReports.append('</ul>\n</li>')
@@ -253,47 +246,47 @@ def getEventConfig(event):
if event == 'UOPS': if event == 'UOPS':
if arch in ['CON', 'WOL']: return 'A0.00' # RS_UOPS_DISPATCHED if arch in ['CON', 'WOL']: return 'A0.00' # RS_UOPS_DISPATCHED
if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW']: return 'C2.01' # UOPS_RETIRED.ALL if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW']: return 'C2.01' # UOPS_RETIRED.ALL
if arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'B1.01' # UOPS_EXECUTED.THREAD if arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'B1.01' # UOPS_EXECUTED.THREAD
if arch in ['ZEN+', 'ZEN2']: return '0C1.00' if arch in ['ZEN+', 'ZEN2']: return '0C1.00'
if event == 'RETIRE_SLOTS': if event == 'RETIRE_SLOTS':
if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'C2.02' if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'C2.02'
if event == 'UOPS_MITE': if event == 'UOPS_MITE':
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return '79.04' if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '79.04'
if event == 'UOPS_MS': if event == 'UOPS_MS':
if arch in ['NHM', 'WSM']: return 'D1.02' if arch in ['NHM', 'WSM']: return 'D1.02'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return '79.30' if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '79.30'
if event == 'UOPS_PORT0': if event == 'UOPS_PORT0':
if arch in ['CON', 'WOL']: return 'A1.01.CTR=0' if arch in ['CON', 'WOL']: return 'A1.01.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.01' if arch in ['NHM', 'WSM']: return 'B1.01'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.01' if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.01'
if event == 'UOPS_PORT1': if event == 'UOPS_PORT1':
if arch in ['CON', 'WOL']: return 'A1.02.CTR=0' if arch in ['CON', 'WOL']: return 'A1.02.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.02' if arch in ['NHM', 'WSM']: return 'B1.02'
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.02' if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.02'
if event == 'UOPS_PORT2': if event == 'UOPS_PORT2':
if arch in ['CON', 'WOL']: return 'A1.04.CTR=0' if arch in ['CON', 'WOL']: return 'A1.04.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.04' if arch in ['NHM', 'WSM']: return 'B1.04'
if arch in ['SNB', 'IVB']: return 'A1.0C' if arch in ['SNB', 'IVB']: return 'A1.0C'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.04' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.04'
if event == 'UOPS_PORT3': if event == 'UOPS_PORT3':
if arch in ['CON', 'WOL']: return 'A1.08.CTR=0' if arch in ['CON', 'WOL']: return 'A1.08.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.08' if arch in ['NHM', 'WSM']: return 'B1.08'
if arch in ['SNB', 'IVB']: return 'A1.30' if arch in ['SNB', 'IVB']: return 'A1.30'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.08' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.08'
if event == 'UOPS_PORT4': if event == 'UOPS_PORT4':
if arch in ['CON', 'WOL']: return 'A1.10.CTR=0' if arch in ['CON', 'WOL']: return 'A1.10.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.10' if arch in ['NHM', 'WSM']: return 'B1.10'
if arch in ['SNB', 'IVB']: return 'A1.40' if arch in ['SNB', 'IVB']: return 'A1.40'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.10' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.10'
if event == 'UOPS_PORT5': if event == 'UOPS_PORT5':
if arch in ['CON', 'WOL']: return 'A1.20.CTR=0' if arch in ['CON', 'WOL']: return 'A1.20.CTR=0'
if arch in ['NHM', 'WSM']: return 'B1.20' if arch in ['NHM', 'WSM']: return 'B1.20'
if arch in ['SNB', 'IVB']: return 'A1.80' if arch in ['SNB', 'IVB']: return 'A1.80'
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.20' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.20'
if event == 'UOPS_PORT6': if event == 'UOPS_PORT6':
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.40' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.40'
if event == 'UOPS_PORT7': if event == 'UOPS_PORT7':
if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.80' if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.80'
if event == 'UOPS_PORT23': if event == 'UOPS_PORT23':
if arch in ['ICL']: return 'A1.04' if arch in ['ICL']: return 'A1.04'
if event == 'UOPS_PORT49': if event == 'UOPS_PORT49':
@@ -301,11 +294,11 @@ def getEventConfig(event):
if event == 'UOPS_PORT78': if event == 'UOPS_PORT78':
if arch in ['ICL']: return 'A1.80' if arch in ['ICL']: return 'A1.80'
if event == 'DIV_CYCLES': if event == 'DIV_CYCLES':
if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return '14.01.CMSK=1' # undocumented on HSW, but seems to work if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return '14.01.CMSK=1' # undocumented on HSW, but seems to work
if arch in ['ICL']: return '14.09.CMSK=1' if arch in ['ICL']: return '14.09.CMSK=1'
if arch in ['ZEN+', 'ZEN2']: return '0D3.00' if arch in ['ZEN+', 'ZEN2']: return '0D3.00'
if event == 'ILD_STALL.LCP': if event == 'ILD_STALL.LCP':
if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return '87.01' if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '87.01'
if event == 'INST_DECODED.DEC0': if event == 'INST_DECODED.DEC0':
if arch in ['NHM', 'WSM']: return '18.01' if arch in ['NHM', 'WSM']: return '18.01'
if event == 'FpuPipeAssignment.Total0': if event == 'FpuPipeAssignment.Total0':
@@ -355,6 +348,13 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
elif operandNode.attrib['type'] == "mem" and 'base' in operandNode.attrib: elif operandNode.attrib['type'] == "mem" and 'base' in operandNode.attrib:
readRegs.add(operandNode.attrib['base']) readRegs.add(operandNode.attrib['base'])
commonReg = None
if not useDistinctRegs:
commonRegs = findCommonRegisters(instrNode)
commonRegs -= set(doNotWriteRegs)|set(doNotReadRegs)|globalDoNotWriteRegs
if commonRegs:
commonReg = sortRegs(commonRegs)[0]
asm = instrNode.attrib['asm'] asm = instrNode.attrib['asm']
first = True first = True
@@ -376,23 +376,23 @@ def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None,
else: else:
regsList = operandNode.text.split(',') regsList = operandNode.text.split(',')
if len(regsList) > 1: reg = None
ignoreRegs = set() if commonReg:
if operandNode.attrib.get('w', '0') == '1': for reg2 in regsList:
ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|(set(opRegDict.values()) if useDistinctRegs else set(doNotReadRegs)) if getCanonicalReg(reg2) == commonReg:
if operandNode.attrib.get('r', '0') == '1': reg = reg2
ignoreRegs |= set(doNotReadRegs)|(writtenRegs|readRegs|set(opRegDict.values()) if useDistinctRegs else set(doNotWriteRegs)|globalDoNotWriteRegs) break
regsList = filter(lambda x: not any(y in ignoreRegs for y in getSubRegs(x)) and not (x in [z for y in ignoreRegs for z in getSubRegs(y)]), regsList) if reg is None:
if not regsList: if len(regsList) > 1:
return None; ignoreRegs = set()
if operandNode.attrib.get('w', '0') == '1':
reg = sortRegs(regsList)[0]; ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|set(opRegDict.values())
if not useDistinctRegs: if operandNode.attrib.get('r', '0') == '1':
for oReg in opRegDict.values(): ignoreRegs |= set(doNotReadRegs)|writtenRegs|readRegs|set(opRegDict.values())
for reg2 in regsList: regsList = filter(lambda x: not any(y in ignoreRegs for y in getSubRegs(x)) and not (x in [z for y in ignoreRegs for z in getSubRegs(y)]), regsList)
if getCanonicalReg(oReg) == getCanonicalReg(reg2): if not regsList:
reg = reg2 return None;
break reg = sortRegs(regsList)[0]
opRegDict[opI] = reg opRegDict[opI] = reg
if operandNode.attrib.get('w', '0') == '1': if operandNode.attrib.get('w', '0') == '1':
@@ -626,14 +626,18 @@ def hasCommonRegister(instrNode):
return False return False
if instrNode.find('./operand[@type="reg"][@suppressed="1"]') is not None: if instrNode.find('./operand[@type="reg"][@suppressed="1"]') is not None:
return False return False
return len(findCommonRegisters(instrNode)) > 0
def findCommonRegisters(instrNode):
for opNode1 in instrNode.findall('./operand[@type="reg"]'): for opNode1 in instrNode.findall('./operand[@type="reg"]'):
regs1 = set(map(getCanonicalReg, opNode1.text.split(","))) regs1 = set(map(getCanonicalReg, opNode1.text.split(",")))
for opNode2 in instrNode.findall('./operand[@type="reg"]'): for opNode2 in instrNode.findall('./operand[@type="reg"]'):
if opNode1 == opNode2: continue if opNode1 == opNode2: continue
regs2 = set(map(getCanonicalReg, opNode2.text.split(","))) regs2 = set(map(getCanonicalReg, opNode2.text.split(",")))
if regs1.intersection(regs2): intersection = regs1.intersection(regs2)
return True if intersection:
return False return intersection
return set()
def hasExplicitNonVSIBMemOperand(instrNode): def hasExplicitNonVSIBMemOperand(instrNode):
for opNode in instrNode.findall('./operand[@type="mem"]'): for opNode in instrNode.findall('./operand[@type="mem"]'):
@@ -673,12 +677,12 @@ class TPConfig:
self.note = note self.note = note
def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True): def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeIndepAndDepBreakingInstrs=True):
if isDivOrSqrtInstr(instrNode):
return getTPConfigsForDiv(instrNode)
iform = instrNode.attrib['iform'] iform = instrNode.attrib['iform']
iclass = instrNode.attrib['iclass'] iclass = instrNode.attrib['iclass']
if 'DIV' in iclass or 'SQRT' in iclass:
return getTPConfigsForDiv(instrNode)
independentInstrs = [] independentInstrs = []
depBreakingInstrs = '' depBreakingInstrs = ''
if computeIndepAndDepBreakingInstrs: if computeIndepAndDepBreakingInstrs:
@@ -880,7 +884,7 @@ def getTPConfigsForDiv(instrNode):
config.init += ['VMOVUP' + dataType + ' ' + divisorReg + ', [R14]'] config.init += ['VMOVUP' + dataType + ' ' + divisorReg + ', [R14]']
config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:regType+str(reg), (nOperands-1):dividendReg, nOperands:divisorReg}) for reg in range(2, 10)] config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:regType+str(reg), (nOperands-1):dividendReg, nOperands:divisorReg}) for reg in range(2, 10)]
elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'RCPSS', 'RCPPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', 'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRCPSS', 'VRCPPS', 'VRSQRT14SS', 'VRSQRT14SD', 'VRSQRT14PS', 'VRSQRT14PD']: elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', 'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRSQRT14SS', 'VRSQRT14SD', 'VRSQRT14PS', 'VRSQRT14PD']:
dataType = instrNode.attrib['iclass'][-1] dataType = instrNode.attrib['iclass'][-1]
if dataType == 'S': if dataType == 'S':
@@ -931,7 +935,8 @@ def fancyRound(cycles):
return round(cycles, 2) return round(cycles, 2)
TPResult = namedtuple('TPResult', ['TP', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'uops_MITE', 'uops_MS', 'divCycles', 'ILD_stalls', 'dec0', 'config', 'unblocked_ports']) TPResult = namedtuple('TPResult', ['TP', 'TP_loop', 'TP_noLoop', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'uops_MITE', 'uops_MS', 'divCycles',
'ILD_stalls', 'dec0', 'config', 'unblocked_ports'])
# returns TPResult # returns TPResult
# port usages are averages (when no ports are blocked by other instructions) # port usages are averages (when no ports are blocked by other instructions)
@@ -939,6 +944,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr) configs = getTPConfigs(instrNode, useDistinctRegs, useIndexedAddr)
minTP = sys.maxint minTP = sys.maxint
minTP_loop = sys.maxint
minTP_noLoop = sys.maxint
minTP_noDepBreaking_noLoop = sys.maxint minTP_noDepBreaking_noLoop = sys.maxint
minTP_single = sys.maxint minTP_single = sys.maxint
@@ -1007,7 +1014,7 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
else: else:
divCycles = 0 divCycles = 0
return TPResult(minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, config, ports_dict) return TPResult(minTP, minTP, minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, None, None, divCycles, 0, False, config, ports_dict)
else: else:
hasMemWriteOperand = len(instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'))>0 hasMemWriteOperand = len(instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'))>0
uops = None uops = None
@@ -1016,17 +1023,21 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
uopsMS = None uopsMS = None
divCycles = None divCycles = None
ILD_stalls = None ILD_stalls = None
dec0 = False dec0 = None
ports_dict = {} ports_dict = {}
for config in configs: for config in configs:
if config.note: htmlReports.append('<h2>' + config.note + '</h2>\n') if config.note: htmlReports.append('<h2>' + config.note + '</h2>\n')
instrIList = config.independentInstrs instrIList = config.independentInstrs
for ic in sorted(set([1, min(4, len(instrIList)), min(8, len(instrIList)), len(instrIList)])): for ic in sorted(set([1, min(4, len(instrIList)), min(8, len(instrIList)), len(instrIList)])):
if minTP_noLoop < sys.maxint and minTP_loop < sys.maxint and minTP_noLoop > 100 and minTP_loop > 100: break
if len(instrIList) > 1: htmlReports.append('<h3 style="margin-left: 25px">With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '</h3>\n') if len(instrIList) > 1: htmlReports.append('<h3 style="margin-left: 25px">With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '</h3>\n')
htmlReports.append('<div style="margin-left: 50px">\n') htmlReports.append('<div style="margin-left: 50px">\n')
for useDepBreakingInstrs in ([False, True] if config.depBreakingInstrs else [False]): for useDepBreakingInstrs in ([False, True] if config.depBreakingInstrs else [False]):
if minTP_noLoop < sys.maxint and minTP_loop < sys.maxint and minTP_noLoop > 100 and minTP_loop > 100: break
if useDepBreakingInstrs: if useDepBreakingInstrs:
instrStr = ';'.join([config.depBreakingInstrs+';'+config.preInstrCode+';'+i.asm for i in instrIList[0:ic]]) instrStr = ';'.join([config.depBreakingInstrs+';'+config.preInstrCode+';'+i.asm for i in instrIList[0:ic]])
htmlReports.append('<h4>With additional dependency-breaking instructions</h4>\n') htmlReports.append('<h4>With additional dependency-breaking instructions</h4>\n')
@@ -1036,17 +1047,21 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
init = list(chain.from_iterable(i.regMemInit for i in instrIList[0:ic])) + config.init init = list(chain.from_iterable(i.regMemInit for i in instrIList[0:ic])) + config.init
for repType in ['unrollOnly', 'loopSmall', 'loopBig']: for repType in ['unrollOnly', 'loopSmall', 'loopBig']:
if minTP < sys.maxint and minTP > 100: continue if minTP_noLoop < sys.maxint and minTP_loop < sys.maxint and minTP_noLoop > 100 and minTP_loop > 100: break
if repType == 'unrollOnly': if repType == 'unrollOnly':
unrollCount = int(round(500/ic+49, -2)) # should still fit in the icache unrollCount = int(round(500/ic+49, -2)) # should still fit in the icache
if instrNode.attrib['iclass'] in ['WBINVD']: unrollCount /= 10; if instrNode.attrib['iclass'] in ['CPUID', 'RDRAND', 'RDSEED', 'WBINVD'] or instrNode.attrib['category'] in ['IO', 'IOSTRINGOP']:
unrollCount = 10
loopCount = 0 loopCount = 0
else: else:
# we test with a small loop body so that uops may be delivered from the loop stream detector (LSD) # we test with a small loop body so that uops may be delivered from the loop stream detector (LSD)
# we also test with a larger loop body to minimize potential overhead from the loop itself # we also test with a larger loop body to minimize potential overhead from the loop itself
loopCount = 100; loopCount = 100
unrollCount = max(1, int(round(10.0/ic))) unrollCount = max(1, int(round(10.0/ic)))
if minTP < sys.maxint and minTP > 100:
unrollCount = 1
loopCount = 10
if repType == 'loopBig': if repType == 'loopBig':
unrollCount *= 10 unrollCount *= 10
@@ -1062,16 +1077,21 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
cycles = fancyRound(result['Core cycles']/ic) cycles = fancyRound(result['Core cycles']/ic)
invalid = False #invalid = False
if any('PORT' in e for e in result): #if any('PORT' in e for e in result):
maxPortUops = max(v/(len(e)-9) for e,v in result.items() if e.startswith('UOPS_PORT') and not '4' in e) # maxPortUops = max(v/(len(e)-9) for e,v in result.items() if e.startswith('UOPS_PORT') and not '4' in e)
if maxPortUops * .98 > result['Core cycles']: # if maxPortUops * .98 > result['Core cycles']:
print 'More uops on ports than cycles, uops: {}, cycles: {}'.format(maxPortUops, result['Core cycles']) # print 'More uops on ports than cycles, uops: {}, cycles: {}'.format(maxPortUops, result['Core cycles'])
#invalid = True # #invalid = True
if not invalid: #if not invalid:
minTP = min(minTP, cycles) minTP = min(minTP, cycles)
if not useDepBreakingInstrs and repType == 'unrollOnly': minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles) if repType == 'unrollOnly':
minTP_noLoop = min(minTP_noLoop, cycles)
if not useDepBreakingInstrs:
minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles)
else:
minTP_loop = min(minTP_loop, cycles)
if ic == 1 and (minTP == sys.maxint or cycles == minTP) and not useDepBreakingInstrs and repType == 'unrollOnly': if ic == 1 and (minTP == sys.maxint or cycles == minTP) and not useDepBreakingInstrs and repType == 'unrollOnly':
minTP_single = min(minTP_single, cycles) minTP_single = min(minTP_single, cycles)
@@ -1106,7 +1126,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
htmlReports.append('</div>') htmlReports.append('</div>')
if minTP < sys.maxint: if minTP < sys.maxint:
return TPResult(minTP, minTP_noDepBreaking_noLoop, minTP_single, uops, uopsFused, uopsMITE, uopsMS, divCycles, ILD_stalls, dec0, minConfig, ports_dict) return TPResult(minTP, minTP_loop, minTP_noLoop, minTP_noDepBreaking_noLoop, minTP_single, uops, uopsFused, uopsMITE, uopsMS, divCycles, ILD_stalls,
dec0, minConfig, ports_dict)
def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports): def canMacroFuse(flagInstrNode, branchInstrNode, htmlReports):
@@ -1521,8 +1542,8 @@ def getDivLatConfigLists(instrNode, opNode1, opNode2, cRep):
configList.append(config) configList.append(config)
configList.isUpperBound = True configList.isUpperBound = True
return configLists return configLists
elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'RCPSS', 'RCPPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD',
'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRSQRT14PD', 'VRSQRT14PS', 'VRSQRT14SD', 'VRSQRT14SS', 'VRCPSS', 'VRCPPS']: 'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRSQRT14PD', 'VRSQRT14PS', 'VRSQRT14SD', 'VRSQRT14SS']:
dataType = instrNode.attrib['iclass'][-1] dataType = instrNode.attrib['iclass'][-1]
if dataType == 'S': if dataType == 'S':
@@ -1767,7 +1788,7 @@ LatResult = namedtuple('LatResult', ['minLat','maxLat','lat_sameReg','isUpperBou
def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem, tpDict): def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem, tpDict):
cRep = min(100, 2 + 2 * int(math.ceil(tpDict[instrNode].TP_single / 2))) # must be a multiple of 2 cRep = min(100, 2 + 2 * int(math.ceil(tpDict[instrNode].TP_single / 2))) # must be a multiple of 2
if 'DIV' in instrNode.attrib['iclass'] or 'SQRT' in instrNode.attrib['iclass']: if isDivOrSqrtInstr(instrNode):
if not useDistinctRegs: return None if not useDistinctRegs: return None
if targetNode.attrib['type'] == 'flags': return None if targetNode.attrib['type'] == 'flags': return None
if addrMem == 'mem': return None if addrMem == 'mem': return None
@@ -2188,15 +2209,14 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
inputOpnds.append(opNode) inputOpnds.append(opNode)
if opNode.attrib.get('w', '0') == '1': if opNode.attrib.get('w', '0') == '1':
outputOpnds.append(opNode) outputOpnds.append(opNode)
if opNode.attrib.get('r', '0') == '1': if opNode.attrib.get('r', '0') == '0':
continue if opNode.attrib['type'] == 'mem':
if opNode.attrib['type'] == 'mem': inputOpnds.append(opNode) # address of memory write
inputOpnds.append(opNode) # address of memory write elif opNode.attrib.get('conditionalWrite', '0') == '1':
elif opNode.attrib['type'] == 'reg':
if opNode.attrib.get('conditionalWrite', '0') == '1':
inputOpnds.append(opNode)
elif opNode.attrib.get('width', '') in ['8', '16'] and opNode.text.split(',')[0] in GPRegs:
inputOpnds.append(opNode) inputOpnds.append(opNode)
elif opNode.attrib['type'] == 'reg':
if opNode.attrib.get('width', '') in ['8', '16'] and opNode.text.split(',')[0] in GPRegs:
inputOpnds.append(opNode)
archNode = instrNode.find('./architecture[@name="' + arch + '"]') archNode = instrNode.find('./architecture[@name="' + arch + '"]')
measurementNode = archNode.find('./measurement') measurementNode = archNode.find('./measurement')
@@ -2270,12 +2290,31 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
newlatConfig.notes.append('with ' + reg + '=' + regVal) newlatConfig.notes.append('with ' + reg + '=' + regVal)
latConfigList.latConfigs.append(newlatConfig) latConfigList.latConfigs.append(newlatConfig)
# some SSE/AVX instr. (e.g., VORPS (on SKL, CLX), VAESDEC) incur a penalty (?) if a source was not written by an instr. of a similar kind,
# some other instructions (e.g., VPDPWSSD on ICL) incur a penalty if the source was written by an instr. of the same kind;
# therefore, we create configurations for both scenarios
if (isSSEInstr(instrNode) or isAVXInstr(instrNode)) and not isDivOrSqrtInstr(instrNode):
for latConfig in list(latConfigList.latConfigs):
regInit = []
for opNode in instrNode.findall('./operand[@r="1"][@type="reg"]'):
reg = latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]
regPrefix = re.sub('\d', '', reg)
if (regPrefix in ['XMM', 'YMM', 'ZMM']) and (reg not in globalDoNotWriteRegs):
for opNode2 in instrNode.findall('./operand[@w="1"][@type="reg"]'):
if opNode2.text != opNode.text: continue
regInit += [getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):reg}, computeRegMemInit=False).asm]
break
if regInit:
newlatConfig = copy.deepcopy(latConfig)
newlatConfig.instrI.regMemInit.extend(regInit)
newlatConfig.notes.append('source registers initialized by an instruction of the same kind')
latConfigList.latConfigs.append(newlatConfig)
# Create a copy of each experiment with dependency-breaking instructions for all dependencies other than the dependency from opNode2 to # Create a copy of each experiment with dependency-breaking instructions for all dependencies other than the dependency from opNode2 to
# opNode1 if there aren't sufficiently many fill instructions in the chain # opNode1 if there aren't sufficiently many fill instructions in the chain
if (not 'DIV' in instrNode.attrib['iclass'] and not 'SQRT' in instrNode.attrib['iclass'] and if (not isDivOrSqrtInstr(instrNode) and not 'GATHER' in instrNode.attrib['category'] and not 'SCATTER' in instrNode.attrib['category']):
not 'GATHER' in instrNode.attrib['category'] and not 'SCATTER' in instrNode.attrib['category']):
for latConfig in list(latConfigList.latConfigs): for latConfig in list(latConfigList.latConfigs):
if latConfig.chainLatency > tpDict[instrNode].TP_single: if not isAVXInstr(instrNode) and latConfig.chainLatency > tpDict[instrNode].TP_single:
continue continue
depBreakingInstrs = getDependencyBreakingInstrs(instrNode, latConfig.instrI.opRegDict) depBreakingInstrs = getDependencyBreakingInstrs(instrNode, latConfig.instrI.opRegDict)
@@ -2429,10 +2468,12 @@ def isSSEInstr(instrNode):
extension = instrNode.attrib['extension'] extension = instrNode.attrib['extension']
return 'SSE' in extension or extension in ['AES'] return 'SSE' in extension or extension in ['AES']
def isAVXInstr(instrNode): def isAVXInstr(instrNode):
return ('vex' in instrNode.attrib or 'evex' in instrNode.attrib) return ('vex' in instrNode.attrib or 'evex' in instrNode.attrib)
def isDivOrSqrtInstr(instrNode):
return ('DIV' in instrNode.attrib['iclass']) or ('SQRT' in instrNode.attrib['iclass'])
def writeHtmlFile(folder, instrNode, title, body): def writeHtmlFile(folder, instrNode, title, body):
filename = canonicalizeInstrString(instrNode.attrib['string']) filename = canonicalizeInstrString(instrNode.attrib['string'])
@@ -2471,7 +2512,7 @@ def filterInstructions(XMLRoot):
# Not supported by assembler # Not supported by assembler
if XMLInstr.attrib['iclass'] == 'NOP' and len(XMLInstr.findall('operand')) > 1: if XMLInstr.attrib['iclass'] == 'NOP' and len(XMLInstr.findall('operand')) > 1:
instrSet.discard(XMLInstr) instrSet.discard(XMLInstr)
if extension in ['WBNOINVD']: instrSet.discard(XMLInstr) if extension in ['MCOMMIT', 'WBNOINVD']: instrSet.discard(XMLInstr)
# Only supported by VIA # Only supported by VIA
if 'VIA_' in extension: if 'VIA_' in extension:
@@ -2572,7 +2613,7 @@ def filterInstructions(XMLRoot):
if extension == 'RDTSCP' and not cpuid.get_bit(edx8_1, 27): instrSet.discard(XMLInstr) if extension == 'RDTSCP' and not cpuid.get_bit(edx8_1, 27): instrSet.discard(XMLInstr)
if extension == '3DNOW' and not cpuid.get_bit(edx8_1, 31): instrSet.discard(XMLInstr) if extension == '3DNOW' and not cpuid.get_bit(edx8_1, 31): instrSet.discard(XMLInstr)
if extension == 'CLZERO' and not cpuid.get_bit(ebx8_8, 0): instrSet.discard(XMLInstr) if extension == 'CLZERO' and not cpuid.get_bit(ebx8_8, 0): instrSet.discard(XMLInstr)
if extension == 'MCOMMIT' and not cpuid.get_bit(ebx8_8, 8): instrSet.discard(XMLInstr) #if extension == 'MCOMMIT' and not cpuid.get_bit(ebx8_8, 8): instrSet.discard(XMLInstr)
# Virtualization instructions # Virtualization instructions
if extension in ['SVM', 'VMFUNC', 'VTX']: instrSet.discard(XMLInstr) if extension in ['SVM', 'VMFUNC', 'VTX']: instrSet.discard(XMLInstr)
@@ -2674,7 +2715,7 @@ def main():
# preInstr has been measured # preInstr has been measured
instrRequiringPreInstr = [] instrRequiringPreInstr = []
if not useIACA: if not useIACA:
instrRequiringPreInstr = [x for x in instrNodeList if 'DIV' in x.attrib['iclass'] or 'SQRT' in x.attrib['iclass'] or getPreInstr(x)[0]] instrRequiringPreInstr = [x for x in instrNodeList if isDivOrSqrtInstr(x) or getPreInstr(x)[0]]
instrNodeList.sort(key=lambda x: (x in instrRequiringPreInstr, x.attrib['string'])) instrNodeList.sort(key=lambda x: (x in instrRequiringPreInstr, x.attrib['string']))
condBrInstr = [i for i in instrNodeList if i.attrib['category'] == 'COND_BR' and i.attrib['isa-set'] == 'I86' and not 'LOOP' in i.attrib['iclass']] condBrInstr = [i for i in instrNodeList if i.attrib['category'] == 'COND_BR' and i.attrib['isa-set'] == 'I86' and not 'LOOP' in i.attrib['iclass']]
@@ -2708,7 +2749,7 @@ def main():
tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()} tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()}
else: else:
for i, instrNode in enumerate(instrNodeList): for i, instrNode in enumerate(instrNodeList):
#if not 'MOV_NOREX' in instrNode.attrib['string']: continue #if not 'VPDPWSSD (XMM, K, XMM, XMM)' in instrNode.attrib['string']: continue
print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n'] htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
@@ -2729,7 +2770,8 @@ def main():
htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n') htmlReports.append('<hr><h2 id="sameReg">With the same register for for different operands</h2>\n')
tpResultSR = getThroughputAndUops(instrNode, False, False, htmlReports) tpResultSR = getThroughputAndUops(instrNode, False, False, htmlReports)
if tpResultSR and (tpResult.uops != tpResultSR.uops or tpResult.fused_uops != tpResultSR.fused_uops or tpResult.uops_MITE != tpResultSR.uops_MITE if tpResultSR and (tpResult.uops != tpResultSR.uops or tpResult.fused_uops != tpResultSR.fused_uops or tpResult.uops_MITE != tpResultSR.uops_MITE
or abs(tpResult.TP-tpResultSR.TP) > .05): or abs(sum(tpResult.unblocked_ports.values()) - sum(tpResultSR.unblocked_ports.values())) > .8
or tpResultSR.TP_single < .95 * tpResult.TP_single):
tpDictSameReg[instrNode] = tpResultSR tpDictSameReg[instrNode] = tpResultSR
if hasExplMemOp: if hasExplMemOp:
@@ -2773,7 +2815,7 @@ def main():
latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()} latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()}
elif not useIACA or iacaVersion == '2.1': elif not useIACA or iacaVersion == '2.1':
for i, instrNode in enumerate(instrNodeList): for i, instrNode in enumerate(instrNodeList):
#if not 'ADC' in instrNode.attrib['string']: continue #if not 'AES' in instrNode.attrib['string']: continue
print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')'
htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n'] htmlReports = ['<h1>' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '</h1>\n<hr>\n']
@@ -3056,7 +3098,8 @@ def main():
if useIACA and instrNode in latencyDict: if useIACA and instrNode in latencyDict:
resultNode.attrib['latency'] = str(latencyDict[instrNode]) resultNode.attrib['latency'] = str(latencyDict[instrNode])
resultNode.attrib['TP'+suffix] = "%.2f" % tpResult.TP resultNode.attrib['TP_unrolled'+suffix] = "%.2f" % tpResult.TP_noLoop
resultNode.attrib['TP_loop'+suffix] = "%.2f" % tpResult.TP_loop
if instrNode in tpDictNoInteriteration: if instrNode in tpDictNoInteriteration:
resultNode.attrib['TP_no_interiteration'] = "%.2f" % tpDictNoInteriteration[instrNode] resultNode.attrib['TP_no_interiteration'] = "%.2f" % tpDictNoInteriteration[instrNode]