diff --git a/kernel/nb_km.c b/kernel/nb_km.c index 13797a8..7104865 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -36,7 +36,7 @@ int (*set_memory_nx)(unsigned long, int) = 0; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andreas Abel"); -// __vmalloc has no langer the pgprot_t parameter so we need to hook __vmalloc_node_range directly +// __vmalloc has no longer the pgprot_t parameter, so we need to hook __vmalloc_node_range directly #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) void *(*kallsym__vmalloc_node_range)(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, @@ -44,7 +44,7 @@ void *(*kallsym__vmalloc_node_range)(unsigned long size, unsigned long align, const void *caller); #endif -// kallsyms_lookup_name is no logner supported we use a kprobes to get the address +// kallsyms_lookup_name is no longer supported; we use a kprobes to get the address #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) #include #include diff --git a/tools/CPUID/cpuid.py b/tools/CPUID/cpuid.py index 7d70ffd..d7e2cd0 100755 --- a/tools/CPUID/cpuid.py +++ b/tools/CPUID/cpuid.py @@ -218,6 +218,8 @@ def micro_arch(cpu): return 'CNL' if (vi.displ_family, vi.displ_model) in [(0x06, 0x7D), (0x06, 0x7E)]: return 'ICL' + if (vi.displ_family, vi.displ_model) in [(0x06, 0x8C), (0x06, 0x8D)]: + return 'TGL' if (vi.displ_family, vi.displ_model) in [(0x17, 0x01), (0x17, 0x11)]: return 'ZEN' if (vi.displ_family, vi.displ_model) in [(0x17, 0x08), (0x17, 0x18)]: diff --git a/tools/CacheAnalyzer/cacheLib.py b/tools/CacheAnalyzer/cacheLib.py index 4d6af0f..da368c7 100755 --- a/tools/CacheAnalyzer/cacheLib.py +++ b/tools/CacheAnalyzer/cacheLib.py @@ -23,26 +23,26 @@ def getEventConfig(event): if event == 'L1_HIT': if arch in ['Core', 'EnhancedCore']: return '40.0E ' + event # L1D_CACHE_LD.MES if arch in ['NHM', 'WSM']: return 'CB.01 ' + event - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.01 ' + event + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.01 ' + event if event == 'L1_MISS': if arch in ['Core', 'EnhancedCore']: return 'CB.01.CTR=0 ' + event - if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.08 ' + event + if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.08 ' + event if arch in ['ZEN+']: return '064.70 ' + event if event == 'L2_HIT': if arch in ['Core', 'EnhancedCore']: return '29.7E ' + event # L2_LD.THIS_CORE.ALL_INCL.MES if arch in ['NHM', 'WSM']: return 'CB.02 ' + event - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.02 ' + event + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.02 ' + event if arch in ['ZEN+']: return '064.70 ' + event if event == 'L2_MISS': if arch in ['Core', 'EnhancedCore']: return 'CB.04.CTR=0 ' + event - if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.10 ' + event + if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.10 ' + event if arch in ['ZEN+']: return '064.08 ' + event if event == 'L3_HIT': if arch in ['NHM', 'WSM']: return 'CB.04 ' + event - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.04 ' + event + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.04 ' + event if event == 'L3_MISS': if arch in ['NHM', 'WSM']: return 'CB.10 ' + event - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.20 ' + event + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'TGL']: return 'D1.20 ' + event return '' def getDefaultCacheConfig(): @@ -51,7 +51,7 @@ def getDefaultCacheConfig(): def getDefaultCacheMSRConfig(): if 'Intel' in getCPUVendor() and 'L3' in getCpuidCacheInfo() and getCpuidCacheInfo()['L3']['complex']: - if getArch() in ['CNL', 'ICL']: + if getArch() in ['CNL', 'ICL', 'TGL']: dist = 8 ctrOffset = 2 else: @@ -150,8 +150,8 @@ def getNCBoxUnits(): if not hasattr(getNCBoxUnits, 'nCBoxUnits'): try: subprocess.check_output(['modprobe', 'msr']) - cbo_config = subprocess.check_output(['rdmsr', '0x396']) - if getArch() in ['CNL', 'ICL']: + cbo_config = subprocess.check_output(['rdmsr', '0x396', '-f', '3:0']) + if getArch() in ['CNL', 'ICL', 'TGL']: getNCBoxUnits.nCBoxUnits = int(cbo_config) else: getNCBoxUnits.nCBoxUnits = int(cbo_config) - 1 diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index af39291..61caf31 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -205,7 +205,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, elif arch in ['NHM', 'WSM']: evt = 'UOPS_RETIRED.ANY' elif arch in ['SNB']: evt = 'UOPS_RETIRED.ALL' elif arch in ['HSW']: evt = 'UOPS_EXECUTED.CORE' - elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: evt = 'UOPS_EXECUTED.THREAD' + elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: evt = 'UOPS_EXECUTED.THREAD' localHtmlReports.append('
  • ' + evt + ': ' + str(value) + '
  • \n') localHtmlReports.append('\n') @@ -270,25 +270,25 @@ def getEventConfig(event): if arch in ['NHM', 'WSM', 'SNB' ]: return 'C2.01' # UOPS_RETIRED.ANY if arch in ['SNB']: return 'C2.01' # UOPS_RETIRED.ALL if arch in ['HSW']: return 'B1.02' # UOPS_EXECUTED.CORE; note: may undercount due to erratum HSD30 - if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'B1.01' # UOPS_EXECUTED.THREAD + if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'B1.01' # UOPS_EXECUTED.THREAD if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0C1.00' if event == 'RETIRE_SLOTS': - if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'C2.02' + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'C2.02' if event == 'UOPS_MITE': - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '79.04' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return '79.04' if event == 'UOPS_MITE>0': - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '79.04.CMSK=1' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return '79.04.CMSK=1' if event == 'UOPS_MS': if arch in ['NHM', 'WSM']: return 'D1.02' - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '79.30' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return '79.30' if event == 'UOPS_PORT0': if arch in ['CON', 'WOL']: return 'A1.01.CTR=0' if arch in ['NHM', 'WSM']: return 'B1.01' - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.01' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'A1.01' if event == 'UOPS_PORT1': if arch in ['CON', 'WOL']: return 'A1.02.CTR=0' if arch in ['NHM', 'WSM']: return 'B1.02' - if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.02' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'A1.02' if event == 'UOPS_PORT2': if arch in ['CON', 'WOL']: return 'A1.04.CTR=0' if arch in ['NHM', 'WSM']: return 'B1.04' @@ -308,23 +308,23 @@ def getEventConfig(event): if arch in ['CON', 'WOL']: return 'A1.20.CTR=0' if arch in ['NHM', 'WSM']: return 'B1.20' if arch in ['SNB', 'IVB']: return 'A1.80' - if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.20' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'A1.20' if event == 'UOPS_PORT6': - if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'A1.40' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return 'A1.40' if event == 'UOPS_PORT7': if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return 'A1.80' if event == 'UOPS_PORT23': - if arch in ['ICL']: return 'A1.04' + if arch in ['ICL', 'TGL']: return 'A1.04' if event == 'UOPS_PORT49': - if arch in ['ICL']: return 'A1.10' + if arch in ['ICL', 'TGL']: return 'A1.10' if event == 'UOPS_PORT78': - if arch in ['ICL']: return 'A1.80' + if arch in ['ICL', 'TGL']: return 'A1.80' if event == 'DIV_CYCLES': if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return '14.01.CMSK=1' # undocumented on HSW, but seems to work - if arch in ['ICL']: return '14.09.CMSK=1' + if arch in ['ICL', 'TGL']: return '14.09.CMSK=1' if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0D3.00' if event == 'ILD_STALL.LCP': - if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '87.01' + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL']: return '87.01' if event == 'INST_DECODED.DEC0': if arch in ['NHM', 'WSM']: return '18.01' if event == 'FpuPipeAssignment.Total0': @@ -335,6 +335,11 @@ def getEventConfig(event): if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '000.04' if event == 'FpuPipeAssignment.Total3': if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '000.08' + # the following two counters are undocumented so far, but seem to work + if event == 'FpuPipeAssignment.Total4': + if arch in ['ZEN3']: return '000.10' + if event == 'FpuPipeAssignment.Total5': + if arch in ['ZEN3']: return '000.20' return None @@ -566,7 +571,11 @@ def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstr else: events = ['UOPS_PORT'+str(p) for p in blockedPorts] else: - events = ['FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3'] + if arch in ['ZEN+', 'ZEN2']: + events = ['FpuPipeAssignment.Total'+str(p) for p in range(0,4)] + elif arch in ['ZEN3']: + events = ['FpuPipeAssignment.Total'+str(p) for p in range(0,6)] + configurePFCs(events) blockInstrAsm = ';'.join(islice(cycle(x.asm for x in blockInstrsList), blockInstrRep)) @@ -720,6 +729,19 @@ def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeI depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode) # instructions with multiple configs + if iclass == 'CPUID': + configs = [] + cpu = cpuid.CPUID() + for eax in (0x0, 0x80000000): + maxEax = cpu(eax)[0] + while eax <= maxEax + 1: + preInstrCode = 'mov EAX, {}; mov ECX, 0'.format(hex(eax)) + preInstrNodes = [instrNodeDict['MOV (R32, I32)'], instrNodeDict['MOV (R32, I32)']] + note = 'With EAX={}, and ECX=0'.format(hex(eax)) + configs.append(TPConfig(independentInstrs=independentInstrs, preInstrCode=preInstrCode, preInstrNodes=preInstrNodes, note=note)) + eax += 1 + return configs + if iclass in ['JB', 'JBE', 'JLE', 'JNB', 'JNBE', 'JNLE', 'JNO', 'JNP', 'JNS', 'JNZ', 'JO', 'JP', 'JS', 'JZ']: config0 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; popfq'], note='With all flags set to 0') config1 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; or qword ptr [RSP], 0x8D5; popfq'], note='With all flags set to 1') @@ -770,6 +792,9 @@ def getTPConfigs(instrNode, useDistinctRegs=True, useIndexedAddr=False, computeI if iform == 'LMSW_GPR16': config.init = list(set('SMSW ' + reg for i in independentInstrs for reg in i.readRegs)) if iform == 'LMSW_MEMw': config.init = list(['SMSW [R14+'+str(i*64)+']' for i in range(0,maxTPRep)]) + if iform == 'MOVDIR64B_GPRa_MEM': + config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RSI'})] + if iform == 'POPF': config.init = ['PUSHF; POP AX'] if iform == 'POPFQ': @@ -1077,7 +1102,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports if minTP_noLoop < sys.maxint and minTP_loop < sys.maxint and minTP_noLoop > 100 and minTP_loop > 100: break paddingTypes = [''] - if (repType != 'unrollOnly') and (uopsMITE is not None) and (math.ceil(32.0/instrLen) * uopsMITE > 18) and (not 'RIP' in config.preInstrCode): + if ((repType != 'unrollOnly') and (uopsMITE is not None) and (not uopsMS) and (math.ceil(32.0/instrLen) * uopsMITE > 18) + and (not 'RIP' in config.preInstrCode)): if (instrNode.attrib.get('vex', '') != '') or (instrNode.attrib.get('evex', '') != '') or (instrNode.attrib.get('high8', '') != ''): paddingTypes.append('long NOPs') else: @@ -1096,7 +1122,7 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports if repType == 'unrollOnly': unrollCount = int(round(500/ic+49, -2)) # should still fit in the icache - if instrNode.attrib['iclass'] in ['CPUID', 'RDRAND', 'RDSEED', 'WBINVD'] or instrNode.attrib['category'] in ['IO', 'IOSTRINGOP']: + if instrNode.attrib['iclass'] in ['RDRAND', 'RDSEED', 'WBINVD'] or instrNode.attrib['category'] in ['IO', 'IOSTRINGOP']: unrollCount = 10 loopCount = 0 else: @@ -1171,7 +1197,7 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports divCycles = int(result['DIV_CYCLES']+.2) if (not config.preInstrCode) and ((uopsMITE > 1) or (uopsMS > 0) or (result.get('INST_DECODED.DEC0', 0) > .05) or - ((result.get('UOPS_MITE>0', 1) > .95) and (not isBranchInstr(instrNode)))): # ToDo: preInstrs + ((result.get('UOPS_MITE>0', 0) > .95) and (not isBranchInstr(instrNode)))): # ToDo: preInstrs complexDec = True if complexDec and ('UOPS_MITE>0' in result): @@ -1250,6 +1276,12 @@ def getBasicLatencies(instrNodeList): basicLatency['SET' + flag[0]] = 1 basicLatency['TEST'] = 1 + testSetHigh8Result = runExperiment(None, 'TEST AH, AH; SET' + flag[0] + ' AH') + testSetHigh8Cycles = int(round(testSetHigh8Result['Core cycles'])) + if testSetHigh8Cycles == 2: + basicLatency['SET' + flag[0] + '_R8h'] = 1 + basicLatency['TEST_R8h_R8h'] = 1 + testCmovResult = runExperiment(None, 'TEST RAX, RAX; CMOV' + flag[0] + ' RAX, RAX') basicLatency['CMOV' + flag[0]] = int(round(testCmovResult['Core cycles'])) - 1 @@ -1865,8 +1897,6 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem if instrNode.attrib.get('mask', '') == '1' and (startNode == targetNode): return None return getDivLatConfigLists(instrNode, startNode, targetNode, cRep) - init = [] - startNodeIdx = int(startNode.attrib['idx']) targetNodeIdx = int(targetNode.attrib['idx']) @@ -1891,6 +1921,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem configList.append(LatConfig(getInstrInstanceFromNode(instrNode), chainInstrs=chainInstrs, chainLatency=chainLatency)) else: return None + elif instrNode.attrib['iclass'] == 'MOVDIR64B': + if (startNodeIdx == 1) and (targetNodeIdx == 3): + instrI = getInstrInstanceFromNode(instrNode, opRegDict={1: 'RSI'}) + chainInstrs = 'MOV RSI, [RSI]' + configList.isUpperBound = True + configList.append(LatConfig(getInstrInstanceFromNode(instrNode), chainInstrs=chainInstrs, chainLatency=1, init='MOV [R14], RSI')) + else: + return None elif instrNode.attrib['iclass'] == 'XGETBV': if startNode.text == 'ECX': chainInstrs = 'MOVSX ECX, {}; '.format(regTo16(targetNode.text)) @@ -2033,7 +2071,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem regSize = getRegSize(reg) if regSize == 8: chainInstr = 'SET{} {};'.format(flag[0], reg) - chainLatency = basicLatency['SET' + flag[0]] + if reg in High8Regs: + if 'SET' + flag[0] + '_R8h' in basicLatency: + chainLatency = basicLatency['SET' + flag[0] + '_R8h'] + else: + chainLatency = 1 + configList.isUpperBound = True + else: + chainLatency = basicLatency['SET' + flag[0]] else: chainInstr = 'CMOV{} {}, {};'.format(flag[0], regToSize('R15', regSize), regToSize('R15', regSize)) chainInstr += 'MOVSX {}, {};'.format(regTo64(reg), regToSize('R15', min(32, regSize))) @@ -2112,7 +2157,14 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem if reg in GPRegs: instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={targetNodeIdx:reg}) chainInstrs = 'TEST {0}, {0};'.format(reg) - chainLatency = basicLatency['TEST'] + if reg in High8Regs: + if 'TEST_R8h_R8h' in basicLatency: + chainLatency = basicLatency['TEST_R8h_R8h'] + else: + chainLatency = 1 + configList.isUpperBound = True + else: + chainLatency = basicLatency['TEST'] configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) if reg in High8Regs: @@ -2315,7 +2367,8 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports): return latency else: - if instrNode.attrib['iclass'] in ['CALL_NEAR', 'CALL_NEAR_MEMv', 'CLZERO', 'JMP', 'JMP_MEMv', 'RET_NEAR', 'RET_NEAR_IMMw', 'RDMSR', 'WRMSR', 'RDPMC', 'CPUID', 'POPF', 'POPFQ']: + if instrNode.attrib['iclass'] in ['CALL_NEAR', 'CALL_NEAR_MEMv', 'CLZERO', 'JMP', 'JMP_MEMv', 'MOVDIR64B', 'RET_NEAR', 'RET_NEAR_IMMw', 'RDMSR', 'WRMSR', + 'RDPMC', 'CPUID', 'POPF', 'POPFQ']: return None if 'XSAVE' in instrNode.attrib['iclass']: return None @@ -2839,7 +2892,8 @@ def main(): resetNanoBench() if arch in ['ZEN+', 'ZEN2', 'ZEN3']: - configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3', 'DIV_CYCLES']) + configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3', + 'FpuPipeAssignment.Total4', 'FpuPipeAssignment.Total5', 'DIV_CYCLES']) else: configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT0', 'UOPS_PORT1', 'UOPS_PORT2', 'UOPS_PORT3', 'UOPS_PORT4', 'UOPS_PORT5', 'UOPS_PORT6', 'UOPS_PORT7', 'UOPS_PORT23', 'UOPS_PORT49', 'UOPS_PORT78', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0', @@ -2862,10 +2916,10 @@ def main(): # move instructions that need a preInstr to the end, as their throughput can only be determined after the throughput of the instructions included in the # preInstr has been measured - instrRequiringPreInstr = [] - if not useIACA: - instrRequiringPreInstr = [x for x in instrNodeList if isDivOrSqrtInstr(x) or getPreInstr(x)[0]] - instrNodeList.sort(key=lambda x: (x in instrRequiringPreInstr, x.attrib['string'])) + #instrRequiringPreInstr = [] + #if not useIACA: + # instrRequiringPreInstr = [x for x in instrNodeList if isDivOrSqrtInstr(x) or getPreInstr(x)[0]] + instrNodeList.sort(key=lambda x: x.attrib['string']) condBrInstr = [i for i in instrNodeList if i.attrib['category'] == 'COND_BR' and i.attrib['isa-set'] == 'I86' and not 'LOOP' in i.attrib['iclass']] @@ -3009,13 +3063,18 @@ def main(): or '_AL_' in instr.attrib['iform'] or '_OrAX_' in instr.attrib['iform'] or tpDict[instr].TP_noDepBreaking_noLoop - .2 > max([uops for _, uops in tpDict[instr].unblocked_ports.items()] or [0]) or '512' in instr.attrib['isa-set']) # on SKX, some AVX-512 instructions can 'shut down' vector units on port 1 + if isAMDCPU(): disallowedBlockingInstrs |= set(instr for instr in instrNodeList for op in instr.findall('./operand[@type="mem"]')) # combining SHA instr. with other instr. leads to wrong port counts disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['extension'] == 'SHA') # combining FP with non-FP instr. can lead to wrong port counts - disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['category'] in ['LOGICAL_FP'] or - any(not 'f' in o.attrib.get('xtype','f') for o in instr.findall('./operand'))) + #disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['category'] in ['LOGICAL_FP'] or + # any(not 'f' in o.attrib.get('xtype','f') for o in instr.findall('./operand'))) + if arch in ['ZEN3']: + # we need one instruction with 1*FP45; + # their throughput is limited to 1 per cycle; thus, they are disallowed by the TP_noDepBreaking_noLoop check above + disallowedBlockingInstrs.remove(instrNodeDict['MOVD (R32, XMM)']) print 'disallowedBlockingInstrs' for instrNode in disallowedBlockingInstrs: @@ -3060,19 +3119,20 @@ def main(): #print str(blockingInstructionsDictNonAVX.items()) - # mov to mem has always two uops: store address and store data; there is no instruction that uses just one of them - movMemInstrNode = instrNodeDict['MOV (M64, R64)'] + if isIntelCPU(): + # mov to mem has always two uops: store address and store data; there is no instruction that uses just one of them + movMemInstrNode = instrNodeDict['MOV (M64, R64)'] - if arch in ['ICL']: - storeDataPort = 49 - else: - storeDataPort = 4 - blockingInstructionsDictNonAVX[frozenset({storeDataPort})] = movMemInstrNode - blockingInstructionsDictNonSSE[frozenset({storeDataPort})] = movMemInstrNode + if arch in ['ICL', 'TGL']: + storeDataPort = 49 + else: + storeDataPort = 4 + blockingInstructionsDictNonAVX[frozenset({storeDataPort})] = movMemInstrNode + blockingInstructionsDictNonSSE[frozenset({storeDataPort})] = movMemInstrNode - storeAddressPorts = frozenset({p for p, x in tpDict[movMemInstrNode].unblocked_ports.items() if x>=0.1 and not p == storeDataPort}) - if storeAddressPorts not in blockingInstructionsDictNonAVX: blockingInstructionsDictNonAVX[storeAddressPorts] = movMemInstrNode - if storeAddressPorts not in blockingInstructionsDictNonSSE: blockingInstructionsDictNonSSE[storeAddressPorts] = movMemInstrNode + storeAddressPorts = frozenset({p for p, x in tpDict[movMemInstrNode].unblocked_ports.items() if x>=0.1 and not p == storeDataPort}) + if storeAddressPorts not in blockingInstructionsDictNonAVX: blockingInstructionsDictNonAVX[storeAddressPorts] = movMemInstrNode + if storeAddressPorts not in blockingInstructionsDictNonSSE: blockingInstructionsDictNonSSE[storeAddressPorts] = movMemInstrNode print 'Non-AVX:' for k,v in blockingInstructionsDictNonAVX.items(): @@ -3085,12 +3145,8 @@ def main(): sortedPortCombinationsNonSSE = sorted(blockingInstructionsDictNonSSE.keys(), key=lambda x:(len(x), sorted(x))) print 'sortedPortCombinations: ' + str(sortedPortCombinationsNonAVX) - for i, instrNode in enumerate(instrNodeList): - if not instrNode in tpDict: - # don't iterate over the keys of tpDict directly because of the ordering - continue - - #if not 'LEA' in instrNode.attrib['string']: continue + for i, instrNode in enumerate(sorted(tpDict.keys(), key=lambda x: (tpDict[x].config.preInstrNodes, x.attrib['string']))): + #if not 'CVTPD2PI' in instrNode.attrib['string']: continue print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' @@ -3133,7 +3189,7 @@ def main(): # one uop instruction uopsCombinationList = [(frozenset(used_ports), 1)] htmlReports.append('
    Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports)) - elif rem_uops > 0 and not isAMDCPU(): + elif (rem_uops > 0) and (arch not in ['ZEN+', 'ZEN2']): for combination in sortedPortCombinations: if not combination.intersection(used_ports): continue @@ -3142,6 +3198,10 @@ def main(): if prev_combination.issubset(combination): prevUopsOnCombination += prev_uops + uopsOnCombinationUnblocked = sum(x for p, x in tpResult.unblocked_ports.items() if p in combination) + if uopsOnCombinationUnblocked - prevUopsOnCombination < .8: + continue + if not useIACA: if tpResult.config.preInstrNodes: for preInstrNode in tpResult.config.preInstrNodes: @@ -3164,7 +3224,7 @@ def main(): if uopsOnBlockedPorts <= 0: continue - if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1': + if isIntelCPU() and (combination == {storeDataPort}) and (instrNode.attrib.get('locked', '') == '1'): # for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which # seems to lead to replays of the store data uops uopsOnBlockedPorts = 1 @@ -3270,7 +3330,12 @@ def main(): portUsageWithDivList = list(portUsageList) if divCycles: portUsageWithDivList.append((frozenset(['div']), divCycles)) - resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageWithDivList) + + try: + resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageWithDivList) + except ValueError as err: + print 'Could not solve LP for ' + instrNode.attrib['string'] + ':' + print err with open(args.output, "w") as f: reparsed = XMLRoot diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index 7fbe763..c221005 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -211,11 +211,11 @@ int main(int argc, char **argv) { return 1; } - runtime_r14 = malloc(RUNTIME_R_SIZE); - runtime_rbp = malloc(RUNTIME_R_SIZE); - runtime_rdi = malloc(RUNTIME_R_SIZE); - runtime_rsi = malloc(RUNTIME_R_SIZE); - runtime_rsp = malloc(RUNTIME_R_SIZE); + posix_memalign((void**)&runtime_r14, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE); + posix_memalign((void**)&runtime_rbp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE); + posix_memalign((void**)&runtime_rdi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE); + posix_memalign((void**)&runtime_rsi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE); + posix_memalign((void**)&runtime_rsp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE); if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) { fprintf(stderr, "Error: Could not allocate memory for runtime_r*\n"); return 1;