From 428b37cd518bef2fdf50ea403e8b58e0d76da845 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Thu, 12 Nov 2020 23:24:26 +0100 Subject: [PATCH 1/4] Add ZEN3 uarch to CPUID tool. --- tools/CPUID/cpuid.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/CPUID/cpuid.py b/tools/CPUID/cpuid.py index 7db45eb..a33ef70 100755 --- a/tools/CPUID/cpuid.py +++ b/tools/CPUID/cpuid.py @@ -224,6 +224,8 @@ def micro_arch(cpu): return 'ZEN+' if (vi.displ_family, vi.displ_model) in [(0x17, 0x71)]: return 'ZEN2' + if (vi.displ_family, vi.displ_model) in [(0x19, 0x21)]: + return 'ZEN3' return 'unknown' From d87adcc19e299c614dcdfc74ec7026444662239c Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Thu, 12 Nov 2020 23:37:01 +0100 Subject: [PATCH 2/4] Add workaround for Zen 3 L3 cache associativity. --- tools/CPUID/cpuid.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/CPUID/cpuid.py b/tools/CPUID/cpuid.py index a33ef70..7d70ffd 100755 --- a/tools/CPUID/cpuid.py +++ b/tools/CPUID/cpuid.py @@ -505,6 +505,9 @@ def get_cache_info(cpu): elif d_15_12 == 0x4: L3Assoc = 4 elif d_15_12 == 0x6: L3Assoc = 8 elif d_15_12 == 0x8: L3Assoc = 16 + # Value 0x9, returned by Zen 3, is reserved according to AMD CPUID Specification document. + # The Software Optimization Guide for AMD Family 19h Processors specifies L3 cache to be 16-way associative and shared by 8 cores inside a CPU complex. + elif d_15_12 == 0x9: L3Assoc = 16 elif d_15_12 == 0xA: L3Assoc = 32 elif d_15_12 == 0xB: L3Assoc = 48 elif d_15_12 == 0xC: L3Assoc = 64 From 210e09b7ab33d211679dda3430b177a37a490274 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Thu, 12 Nov 2020 23:41:35 +0100 Subject: [PATCH 3/4] Add ZEN3 to cpuBench.py. 
--- tools/cpuBench/cpuBench.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index ed0af0c..bfb4120 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -54,7 +54,7 @@ serializingInstructions = {'INVD', 'INVEPT', 'INVLPG', 'INVVPID', 'LGDT', 'LIDT' 'CPUID', 'IRET', 'RSM', 'SFENCE', 'LFENCE', 'MFENCE'} def isAMDCPU(): - return arch in ['ZEN+', 'ZEN2'] + return arch in ['ZEN+', 'ZEN2', 'ZEN3'] def isIntelCPU(): return not isAMDCPU() @@ -247,7 +247,7 @@ def getEventConfig(event): if arch in ['CON', 'WOL']: return 'A0.00' # RS_UOPS_DISPATCHED if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW']: return 'C2.01' # UOPS_RETIRED.ALL if arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'B1.01' # UOPS_EXECUTED.THREAD - if arch in ['ZEN+', 'ZEN2']: return '0C1.00' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0C1.00' if event == 'RETIRE_SLOTS': if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return 'C2.02' if event == 'UOPS_MITE': @@ -296,19 +296,19 @@ def getEventConfig(event): if event == 'DIV_CYCLES': if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'CLX']: return '14.01.CMSK=1' # undocumented on HSW, but seems to work if arch in ['ICL']: return '14.09.CMSK=1' - if arch in ['ZEN+', 'ZEN2']: return '0D3.00' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0D3.00' if event == 'ILD_STALL.LCP': if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX']: return '87.01' if event == 'INST_DECODED.DEC0': if arch in ['NHM', 'WSM']: return '18.01' if event == 'FpuPipeAssignment.Total0': - if arch in ['ZEN+', 'ZEN2']: return '000.01' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '000.01' if event == 'FpuPipeAssignment.Total1': - if arch in ['ZEN+', 'ZEN2']: return '000.02' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: 
return '000.02' if event == 'FpuPipeAssignment.Total2': - if arch in ['ZEN+', 'ZEN2']: return '000.04' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '000.04' if event == 'FpuPipeAssignment.Total3': - if arch in ['ZEN+', 'ZEN2']: return '000.08' + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '000.08' return None @@ -1731,7 +1731,7 @@ def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType): # We use (V)SHUFPD instead of (V)MOV*PD because the latter is a 0-latency operation on some CPUs in some cases if cType == 'FP': if isAVXInstr(instrNode): - if arch in ['ZEN+', 'ZEN2']: + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: # on ZEN, all shuffles are integer operations chainInstrFP = 'VANDPD {0}, {1}, {1};'.format(targetReg, startReg) chainInstrFP += 'VANDPD {0}, {0}, {0};'.format(targetReg) * cRep @@ -1741,7 +1741,7 @@ def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType): chainInstrFP += 'VSHUFPD {0}, {0}, {0}, 0;'.format(targetReg) * cRep chainLatencyFP = basicLatency['VSHUFPD'] * (cRep+1) else: - if arch in ['ZEN+', 'ZEN2']: + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: # on ZEN, all shuffles are integer operations chainInstrFP = 'VANDPD {0}, {1}, {1};'.format(targetReg, startReg) chainInstrFP += 'VANDPD {0}, {0}, {0};'.format(targetReg) * cRep @@ -2690,7 +2690,7 @@ def main(): resetNanoBench() - if arch in ['ZEN+', 'ZEN2']: + if arch in ['ZEN+', 'ZEN2', 'ZEN3']: configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3', 'DIV_CYCLES']) else: configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_MITE', 'UOPS_MS', 'UOPS_PORT0', 'UOPS_PORT1', 'UOPS_PORT2', 'UOPS_PORT3', 'UOPS_PORT4', 'UOPS_PORT5', From 91cb312015bfb2bc12583b8d97c8faa78ec8e82f Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sun, 15 Nov 2020 17:26:53 +0100 Subject: [PATCH 4/4] Add missing checks for AVX512VL. There is one variant of VPCLMULQDQ which requires AVX512F instead. 
The exact variant has not been identified, so the AVX512VL check is applied to every AVX512 form of these instructions. --- tools/cpuBench/cpuBench.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index bfb4120..969cac0 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -2591,8 +2591,16 @@ def filterInstructions(XMLRoot): if extension == 'WAITPKG' and not cpuid.get_bit(ecx7, 5): instrSet.discard(XMLInstr) if isaSet.startswith('AVX512_VBMI2') and not cpuid.get_bit(ecx7, 6): instrSet.discard(XMLInstr) if category == 'GFNI' and not cpuid.get_bit(ecx7, 8): instrSet.discard(XMLInstr) - if 'VAES' in isaSet and not cpuid.get_bit(ecx7, 9): instrSet.discard(XMLInstr) - if 'VPCLMULQDQ' in isaSet and not cpuid.get_bit(ecx7, 10): instrSet.discard(XMLInstr) + if 'VAES' in isaSet: + if not cpuid.get_bit(ecx7, 9): + instrSet.discard(XMLInstr) + else: + if 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31): instrSet.discard(XMLInstr) + if 'VPCLMULQDQ' in isaSet: + if not cpuid.get_bit(ecx7, 10): + instrSet.discard(XMLInstr) + else: + if 'AVX512' in isaSet and not cpuid.get_bit(ebx7, 31): instrSet.discard(XMLInstr) if isaSet.startswith('AVX512_VNNI') and not cpuid.get_bit(ecx7, 11): instrSet.discard(XMLInstr) if isaSet.startswith('AVX512_BITALG') and not cpuid.get_bit(ecx7, 12): instrSet.discard(XMLInstr) if isaSet.startswith('AVX512_VPOPCNTDQ') and not cpuid.get_bit(ecx7, 14): instrSet.discard(XMLInstr)