mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-16 11:30:07 +01:00
Ice Lake support
This commit is contained in:
232
configs/cfg_IceLake_all.txt
Normal file
232
configs/cfg_IceLake_all.txt
Normal file
@@ -0,0 +1,232 @@
|
||||
# Performance monitoring events for processors based on the Ice Lake microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
|
||||
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
|
||||
|
||||
00.01 INST_RETIRED.ANY
|
||||
00.01 INST_RETIRED.PREC_DIST
|
||||
00.02 CPU_CLK_UNHALTED.THREAD
|
||||
00.03 CPU_CLK_UNHALTED.REF_TSC
|
||||
00.04 TOPDOWN.SLOTS
|
||||
03.02 LD_BLOCKS.STORE_FORWARD
|
||||
03.08 LD_BLOCKS.NO_SR
|
||||
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
|
||||
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
|
||||
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
|
||||
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
|
||||
08.10 DTLB_LOAD_MISSES.WALK_PENDING
|
||||
08.10 DTLB_LOAD_MISSES.WALK_ACTIVE
|
||||
08.20 DTLB_LOAD_MISSES.STLB_HIT
|
||||
0D.01 INT_MISC.RECOVERY_CYCLES
|
||||
0D.03 INT_MISC.ALL_RECOVERY_CYCLES
|
||||
0D.80 INT_MISC.CLEAR_RESTEER_CYCLES
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
0E.01 UOPS_ISSUED.STALL_CYCLES
|
||||
14.09 ARITH.DIVIDER_ACTIVE
|
||||
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
|
||||
24.22 L2_RQSTS.RFO_MISS
|
||||
24.24 L2_RQSTS.CODE_RD_MISS
|
||||
24.27 L2_RQSTS.ALL_DEMAND_MISS
|
||||
24.28 L2_RQSTS.SWPF_MISS
|
||||
24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
|
||||
24.C2 L2_RQSTS.RFO_HIT
|
||||
24.C4 L2_RQSTS.CODE_RD_HIT
|
||||
24.C8 L2_RQSTS.SWPF_HIT
|
||||
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
|
||||
24.E2 L2_RQSTS.ALL_RFO
|
||||
24.E4 L2_RQSTS.ALL_CODE_RD
|
||||
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
|
||||
28.07 CORE_POWER.LVL0_TURBO_LICENSE
|
||||
28.18 CORE_POWER.LVL1_TURBO_LICENSE
|
||||
28.20 CORE_POWER.LVL2_TURBO_LICENSE
|
||||
32.01 SW_PREFETCH_ACCESS.NTA
|
||||
32.02 SW_PREFETCH_ACCESS.T0
|
||||
32.04 SW_PREFETCH_ACCESS.T1_T2
|
||||
32.08 SW_PREFETCH_ACCESS.PREFETCHW
|
||||
3C.00 CPU_CLK_UNHALTED.THREAD_P
|
||||
3C.01 CPU_CLK_UNHALTED.REF_XCLK
|
||||
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
|
||||
48.01 L1D_PEND_MISS.PENDING
|
||||
48.01 L1D_PEND_MISS.PENDING_CYCLES
|
||||
48.02 L1D_PEND_MISS.FB_FULL
|
||||
48.02 L1D_PEND_MISS.FB_FULL_PERIODS
|
||||
48.04 L1D_PEND_MISS.L2_STALL
|
||||
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
|
||||
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
|
||||
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
|
||||
49.10 DTLB_STORE_MISSES.WALK_PENDING
|
||||
49.10 DTLB_STORE_MISSES.WALK_ACTIVE
|
||||
49.20 DTLB_STORE_MISSES.STLB_HIT
|
||||
4C.01 LOAD_HIT_PREFETCH.SWPF
|
||||
51.01 L1D.REPLACEMENT
|
||||
54.01 TX_MEM.ABORT_CONFLICT
|
||||
54.02 TX_MEM.ABORT_CAPACITY_WRITE
|
||||
54.04 TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK
|
||||
54.08 TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY
|
||||
54.10 TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH
|
||||
54.20 TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGN
|
||||
54.40 TX_MEM.HLE_ELISION_BUFFER_FULL
|
||||
5D.02 TX_EXEC.MISC2
|
||||
5D.04 TX_EXEC.MISC3
|
||||
5E.01 RS_EVENTS.EMPTY_CYCLES
|
||||
5E.01 RS_EVENTS.EMPTY_END
|
||||
60.04 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
|
||||
60.08 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
|
||||
79.04 IDQ.MITE_UOPS
|
||||
79.04 IDQ.MITE_CYCLES_OK
|
||||
79.04 IDQ.MITE_CYCLES_ANY
|
||||
79.08 IDQ.DSB_UOPS
|
||||
79.08 IDQ.DSB_CYCLES_OK
|
||||
79.08 IDQ.DSB_CYCLES_ANY
|
||||
79.30 IDQ.MS_SWITCHES
|
||||
79.30 IDQ.MS_UOPS
|
||||
79.30 IDQ.MS_CYCLES_ANY
|
||||
80.04 ICACHE_16B.IFDATA_STALL
|
||||
83.01 ICACHE_64B.IFTAG_HIT
|
||||
83.02 ICACHE_64B.IFTAG_MISS
|
||||
83.04 ICACHE_64B.IFTAG_STALL
|
||||
85.02 ITLB_MISSES.WALK_COMPLETED_4K
|
||||
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
|
||||
85.0E ITLB_MISSES.WALK_COMPLETED
|
||||
85.10 ITLB_MISSES.WALK_PENDING
|
||||
85.10 ITLB_MISSES.WALK_ACTIVE
|
||||
85.20 ITLB_MISSES.STLB_HIT
|
||||
87.01 ILD_STALL.LCP
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
|
||||
9C.01 IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
|
||||
A1.01 UOPS_DISPATCHED.PORT_0
|
||||
A1.02 UOPS_DISPATCHED.PORT_1
|
||||
A1.04 UOPS_DISPATCHED.PORT_2_3
|
||||
A1.10 UOPS_DISPATCHED.PORT_4_9
|
||||
A1.20 UOPS_DISPATCHED.PORT_5
|
||||
A1.40 UOPS_DISPATCHED.PORT_6
|
||||
A1.80 UOPS_DISPATCHED.PORT_7_8
|
||||
A2.02 RESOURCE_STALLS.SCOREBOARD
|
||||
A2.08 RESOURCE_STALLS.SB
|
||||
A3.01 CYCLE_ACTIVITY.CYCLES_L2_MISS
|
||||
A3.02 CYCLE_ACTIVITY.CYCLES_L3_MISS
|
||||
A3.04 CYCLE_ACTIVITY.STALLS_TOTAL
|
||||
A3.05 CYCLE_ACTIVITY.STALLS_L2_MISS
|
||||
A3.06 CYCLE_ACTIVITY.STALLS_L3_MISS
|
||||
A3.08 CYCLE_ACTIVITY.CYCLES_L1D_MISS
|
||||
A3.0C CYCLE_ACTIVITY.STALLS_L1D_MISS
|
||||
A3.10 CYCLE_ACTIVITY.CYCLES_MEM_ANY
|
||||
A3.14 CYCLE_ACTIVITY.STALLS_MEM_ANY
|
||||
A4.01 TOPDOWN.SLOTS_P
|
||||
A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
|
||||
A6.02 EXE_ACTIVITY.1_PORTS_UTIL
|
||||
A6.04 EXE_ACTIVITY.2_PORTS_UTIL
|
||||
A6.40 EXE_ACTIVITY.BOUND_ON_STORES
|
||||
A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
|
||||
A8.01 LSD.UOPS
|
||||
A8.01 LSD.CYCLES_ACTIVE
|
||||
A8.01 LSD.CYCLES_OK
|
||||
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
|
||||
AE.01 ITLB.ITLB_FLUSH
|
||||
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
|
||||
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
|
||||
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
|
||||
B0.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
|
||||
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
B1.01 UOPS_EXECUTED.STALL_CYCLES
|
||||
B1.01 UOPS_EXECUTED.CYCLES_GE_1
|
||||
B1.01 UOPS_EXECUTED.CYCLES_GE_2
|
||||
B1.01 UOPS_EXECUTED.CYCLES_GE_3
|
||||
B1.01 UOPS_EXECUTED.CYCLES_GE_4
|
||||
B1.02 UOPS_EXECUTED.CORE
|
||||
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_1
|
||||
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_2
|
||||
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_3
|
||||
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_4
|
||||
B1.10 UOPS_EXECUTED.X87
|
||||
BD.01 TLB_FLUSH.DTLB_THREAD
|
||||
BD.20 TLB_FLUSH.STLB_ANY
|
||||
C0.00 INST_RETIRED.ANY_P
|
||||
C1.02 ASSISTS.FP
|
||||
C1.07 ASSISTS.ANY
|
||||
C2.02 UOPS_RETIRED.TOTAL_CYCLES
|
||||
C2.02 UOPS_RETIRED.SLOTS
|
||||
C3.01 MACHINE_CLEARS.COUNT
|
||||
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
|
||||
C3.04 MACHINE_CLEARS.SMC
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C4.01 BR_INST_RETIRED.COND_TAKEN
|
||||
C4.02 BR_INST_RETIRED.NEAR_CALL
|
||||
C4.08 BR_INST_RETIRED.NEAR_RETURN
|
||||
C4.10 BR_INST_RETIRED.COND_NTAKEN
|
||||
C4.11 BR_INST_RETIRED.COND
|
||||
C4.20 BR_INST_RETIRED.NEAR_TAKEN
|
||||
C4.40 BR_INST_RETIRED.FAR_BRANCH
|
||||
C4.80 BR_INST_RETIRED.INDIRECT
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
C5.01 BR_MISP_RETIRED.COND_TAKEN
|
||||
C5.11 BR_MISP_RETIRED.COND
|
||||
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
|
||||
C5.80 BR_MISP_RETIRED.INDIRECT
|
||||
C6.01 FRONTEND_RETIRED.DSB_MISS
|
||||
C6.01 FRONTEND_RETIRED.L1I_MISS
|
||||
C6.01 FRONTEND_RETIRED.L2_MISS
|
||||
C6.01 FRONTEND_RETIRED.ITLB_MISS
|
||||
C6.01 FRONTEND_RETIRED.STLB_MISS
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_4
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_8
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_16
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_32
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_64
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_128
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_256
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_512
|
||||
C6.01 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
|
||||
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
|
||||
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
|
||||
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
|
||||
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
|
||||
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
|
||||
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
|
||||
C7.40 FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE
|
||||
C7.80 FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE
|
||||
C8.01 HLE_RETIRED.START
|
||||
C8.02 HLE_RETIRED.COMMIT
|
||||
C8.04 HLE_RETIRED.ABORTED
|
||||
C8.08 HLE_RETIRED.ABORTED_MEM
|
||||
C8.20 HLE_RETIRED.ABORTED_UNFRIENDLY
|
||||
C8.80 HLE_RETIRED.ABORTED_EVENTS
|
||||
C9.01 RTM_RETIRED.START
|
||||
C9.02 RTM_RETIRED.COMMIT
|
||||
C9.04 RTM_RETIRED.ABORTED
|
||||
C9.08 RTM_RETIRED.ABORTED_MEM
|
||||
C9.20 RTM_RETIRED.ABORTED_UNFRIENDLY
|
||||
C9.40 RTM_RETIRED.ABORTED_MEMTYPE
|
||||
C9.80 RTM_RETIRED.ABORTED_EVENTS
|
||||
CC.20 MISC_RETIRED.LBR_INSERTS
|
||||
CC.40 MISC_RETIRED.PAUSE_INST
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
|
||||
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
|
||||
D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
|
||||
D0.41 MEM_INST_RETIRED.SPLIT_LOADS
|
||||
D0.81 MEM_INST_RETIRED.ALL_LOADS
|
||||
D0.82 MEM_INST_RETIRED.ALL_STORES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
||||
D1.40 MEM_LOAD_RETIRED.FB_HIT
|
||||
D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
|
||||
D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT
|
||||
D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM
|
||||
D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
|
||||
E6.01 BACLEARS.ANY
|
||||
EC.02 CPU_CLK_UNHALTED.DISTRIBUTED
|
||||
F1.1F L2_LINES_IN.ALL
|
||||
F4.04 SQ_MISC.SQ_FULL
|
||||
21
configs/cfg_IceLake_common.txt
Normal file
21
configs/cfg_IceLake_common.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
# Performance monitoring events for processors based on the Ice Lake microarchitecture.
|
||||
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
|
||||
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
|
||||
|
||||
0E.01 UOPS_ISSUED.ANY
|
||||
B1.01 UOPS_EXECUTED.THREAD
|
||||
A1.01 UOPS_DISPATCHED.PORT_0
|
||||
A1.02 UOPS_DISPATCHED.PORT_1
|
||||
A1.04 UOPS_DISPATCHED.PORT_2_3
|
||||
A1.10 UOPS_DISPATCHED.PORT_4_9
|
||||
A1.20 UOPS_DISPATCHED.PORT_5
|
||||
A1.40 UOPS_DISPATCHED.PORT_6
|
||||
A1.80 UOPS_DISPATCHED.PORT_7_8
|
||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||
D1.10 MEM_LOAD_RETIRED.L2_MISS
|
||||
D1.04 MEM_LOAD_RETIRED.L3_HIT
|
||||
D1.20 MEM_LOAD_RETIRED.L3_MISS
|
||||
@@ -168,12 +168,14 @@ VersionInfo = collections.namedtuple('VersionInfo', 'displ_family displ_model st
|
||||
def version_info(cpu):
|
||||
a, _, _, _ = cpu(0x01)
|
||||
|
||||
displ_family = (a >> 8) & 0xF
|
||||
if (displ_family == 0x0F):
|
||||
family_ID = (a >> 8) & 0xF
|
||||
|
||||
displ_family = family_ID
|
||||
if (family_ID == 0x0F):
|
||||
displ_family += (a >> 20) & 0xFF
|
||||
|
||||
displ_model = (a >> 4) & 0xF
|
||||
if (displ_family == 0x06 or displ_family == 0x0F):
|
||||
if (family_ID == 0x06 or family_ID == 0x0F):
|
||||
displ_model += (a >> 12) & 0xF0
|
||||
|
||||
stepping = a & 0xF
|
||||
@@ -211,6 +213,8 @@ def micro_arch(cpu):
|
||||
return 'CFL'
|
||||
if (vi.displ_family, vi.displ_model) in [(0x06, 0x66)]:
|
||||
return 'CNL'
|
||||
if (vi.displ_family, vi.displ_model) in [(0x06, 0x7D), (0x06, 0x7E)]:
|
||||
return 'ICL'
|
||||
if (vi.displ_family, vi.displ_model) in [(0x17, 0x01), (0x17, 0x11)]:
|
||||
return 'ZEN'
|
||||
if (vi.displ_family, vi.displ_model) in [(0x17, 0x08), (0x17, 0x18)]:
|
||||
@@ -491,7 +495,11 @@ def get_cache_info(cpu):
|
||||
L3Size = int(get_bits(d, 18, 31)*512)
|
||||
L3Assoc = 0
|
||||
d_15_12 = get_bits(d, 12, 15)
|
||||
if d_15_12 == 0x8: L3Assoc = 16
|
||||
if d_15_12 == 0x1: L3Assoc = 1
|
||||
elif d_15_12 == 0x2: L3Assoc = 2
|
||||
elif d_15_12 == 0x4: L3Assoc = 4
|
||||
elif d_15_12 == 0x6: L3Assoc = 8
|
||||
elif d_15_12 == 0x8: L3Assoc = 16
|
||||
elif d_15_12 == 0xA: L3Assoc = 32
|
||||
elif d_15_12 == 0xB: L3Assoc = 48
|
||||
elif d_15_12 == 0xC: L3Assoc = 64
|
||||
@@ -81,13 +81,9 @@ If the replacement policy is a permutation policy (see [Measurement-based Modeli
|
||||
|
||||
Generates graphs that show the number of core cycles and the number of hits/misses (per access) when accessing memory areas of different sizes repeatedly using a given stride (which can be specified with the `-stride` option). An example can be seen [here](https://uops.info/cache/lat_CFL.html).
|
||||
|
||||
## cpuid.py
|
||||
|
||||
Obtains cache and TLB information using the `CPUID` instruction.
|
||||
|
||||
## cacheInfo.py
|
||||
|
||||
Combines information from `cpuid.py` with information on the number of slices of the L3 cache that is obtained through measurements.
|
||||
Combines information obtained by executing the `CPUID` instruction with information on the number of slices of the L3 cache that is obtained through measurements.
|
||||
|
||||
## setDueling.py
|
||||
|
||||
|
||||
@@ -3,15 +3,17 @@ from itertools import count
|
||||
from collections import namedtuple
|
||||
|
||||
import math
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import cpuid
|
||||
|
||||
sys.path.append('../..')
|
||||
from kernelNanoBench import *
|
||||
|
||||
sys.path.append('../CPUID')
|
||||
import cpuid
|
||||
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -21,26 +23,26 @@ def getEventConfig(event):
|
||||
if event == 'L1_HIT':
|
||||
if arch in ['Core', 'EnhancedCore']: return '40.0E ' + event # L1D_CACHE_LD.MES
|
||||
if arch in ['NHM', 'WSM']: return 'CB.01 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.01 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.01 ' + event
|
||||
if event == 'L1_MISS':
|
||||
if arch in ['Core', 'EnhancedCore']: return 'CB.01.CTR=0 ' + event
|
||||
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.08 ' + event
|
||||
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.08 ' + event
|
||||
if arch in ['ZEN+']: return '064.70 ' + event
|
||||
if event == 'L2_HIT':
|
||||
if arch in ['Core', 'EnhancedCore']: return '29.7E ' + event # L2_LD.THIS_CORE.ALL_INCL.MES
|
||||
if arch in ['NHM', 'WSM']: return 'CB.02 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.02 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.02 ' + event
|
||||
if arch in ['ZEN+']: return '064.70 ' + event
|
||||
if event == 'L2_MISS':
|
||||
if arch in ['Core', 'EnhancedCore']: return 'CB.04.CTR=0 ' + event
|
||||
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.10 ' + event
|
||||
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.10 ' + event
|
||||
if arch in ['ZEN+']: return '064.08 ' + event
|
||||
if event == 'L3_HIT':
|
||||
if arch in ['NHM', 'WSM']: return 'CB.04 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.04 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.04 ' + event
|
||||
if event == 'L3_MISS':
|
||||
if arch in ['NHM', 'WSM']: return 'CB.10 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.20 ' + event
|
||||
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.20 ' + event
|
||||
return ''
|
||||
|
||||
def getDefaultCacheConfig():
|
||||
@@ -49,7 +51,7 @@ def getDefaultCacheConfig():
|
||||
|
||||
def getDefaultCacheMSRConfig():
|
||||
if 'Intel' in getCPUVendor() and 'L3' in getCpuidCacheInfo() and getCpuidCacheInfo()['L3']['complex']:
|
||||
if getArch() in ['CNL']:
|
||||
if getArch() in ['CNL', 'ICL']:
|
||||
dist = 8
|
||||
ctrOffset = 2
|
||||
else:
|
||||
@@ -149,7 +151,7 @@ def getNCBoxUnits():
|
||||
try:
|
||||
subprocess.check_output(['modprobe', 'msr'])
|
||||
cbo_config = subprocess.check_output(['rdmsr', '0x396'])
|
||||
if getArch() in ['CNL']:
|
||||
if getArch() in ['CNL', 'ICL']:
|
||||
getNCBoxUnits.nCBoxUnits = int(cbo_config)
|
||||
else:
|
||||
getNCBoxUnits.nCBoxUnits = int(cbo_config) - 1
|
||||
@@ -207,6 +209,10 @@ def getPointerChasingInit(addresses):
|
||||
if tuple(addresses) in pointerChasingInits:
|
||||
return pointerChasingInits[tuple(addresses)]
|
||||
|
||||
#addresses_tail = addresses[1:]
|
||||
#random.shuffle(addresses_tail)
|
||||
#addresses = [addresses[0]] + addresses_tail
|
||||
|
||||
init = 'lea RAX, [R14+' + str(addresses[0]) + ']; '
|
||||
init += 'mov RBX, RAX; '
|
||||
|
||||
@@ -235,7 +241,7 @@ def getPointerChasingInit(addresses):
|
||||
|
||||
ExperimentCode = namedtuple('ExperimentCode', 'code init oneTimeInit')
|
||||
|
||||
def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
|
||||
def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False, afterEveryAcc=''):
|
||||
distinctAddrLists = set(tuple(l.addresses) for l in initAddressLists+codeAddressLists)
|
||||
if len(distinctAddrLists) > 1 and set.intersection(*list(set(l) for l in distinctAddrLists)):
|
||||
raise ValueError('same address in different lists')
|
||||
@@ -272,7 +278,7 @@ def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
|
||||
|
||||
if addressList.flush:
|
||||
for address in addresses:
|
||||
codeList.append('clflush [R14 + ' + str(address) + ']; ')
|
||||
codeList.append('clflush [R14 + ' + str(address) + ']; ' + afterEveryAcc)
|
||||
else:
|
||||
if len(addresses) == 1:
|
||||
codeList.append('mov RCX, [R14 + ' + str(addresses[0]) + ']; ')
|
||||
@@ -281,7 +287,7 @@ def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
|
||||
oneTimeInit.append(getPointerChasingInit(addresses))
|
||||
alreadyAddedOneTimeInits.add(tuple(addresses))
|
||||
|
||||
codeList.append('lea RCX, [R14+' + str(addresses[0]) + ']; 1: mov RCX, [RCX]; jrcxz 2f; jmp 1b; 2: ')
|
||||
codeList.append('lea RCX, [R14+' + str(addresses[0]) + ']; 1: mov RCX, [RCX]; ' + afterEveryAcc + 'jrcxz 2f; jmp 1b; 2: ')
|
||||
|
||||
if not isInit and not pfcEnabled:
|
||||
codeList.append(PFC_START_ASM + '; ')
|
||||
@@ -361,7 +367,7 @@ def getAddresses(level, wayID, cacheSetList, cBox=1, clearHL=True):
|
||||
if getCacheInfo(3).nSlices == getNCBoxUnits():
|
||||
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(getNewAddressesInCBox(1, cBox, L3Set, L3SetToWayIDMap[cBox][L3Set].values())))
|
||||
else:
|
||||
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(findCongruentL3Addresses(1, L3SetToWayIDMap[cBox][L3Set].values())))
|
||||
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(findCongruentL3Addresses(1, L3Set, cBox, L3SetToWayIDMap[cBox][L3Set].values())))
|
||||
addresses.append(L3SetToWayIDMap[cBox][L3Set][wayID])
|
||||
|
||||
return addresses
|
||||
@@ -447,14 +453,16 @@ def printNB(nb_result):
|
||||
def findMinimalL3EvictionSet(cacheSet, cBox):
|
||||
setNanoBenchParameters(config='\n'.join([getEventConfig('L3_HIT'), getEventConfig('L3_MISS')]), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=10,
|
||||
warmUpCount=None, initialWarmUpCount=None, aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
|
||||
|
||||
if not hasattr(findMinimalL3EvictionSet, 'evSetForCacheSet'):
|
||||
findMinimalL3EvictionSet.evSetForCacheSet = dict()
|
||||
evSetForCacheSet = findMinimalL3EvictionSet.evSetForCacheSet
|
||||
if not cBox in findMinimalL3EvictionSet.evSetForCacheSet:
|
||||
findMinimalL3EvictionSet.evSetForCacheSet[cBox] = dict()
|
||||
evSetForCacheSet = findMinimalL3EvictionSet.evSetForCacheSet[cBox]
|
||||
|
||||
if cacheSet in evSetForCacheSet:
|
||||
return evSetForCacheSet[cacheSet]
|
||||
|
||||
clearHLAddrList = AddressList(getClearHLAddresses(3, [cacheSet], cBox), True, False)
|
||||
addresses = []
|
||||
curAddress = cacheSet*getCacheInfo(3).lineSize
|
||||
|
||||
@@ -468,7 +476,7 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
|
||||
if not getCBoxOfAddress(curAddress) == cBox: continue
|
||||
|
||||
addresses += [curAddress]
|
||||
ec = getCodeForAddressLists([AddressList(addresses,False,False)])
|
||||
ec = getCodeForAddressLists([AddressList(addresses,False,False), clearHLAddrList])
|
||||
|
||||
setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig='', nMeasurements=10, unrollCount=1, loopCount=100,
|
||||
aggregateFunction='med', basicMode=True, noMem=True)
|
||||
@@ -480,7 +488,7 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
|
||||
for i in reversed(range(0, len(addresses))):
|
||||
tmpAddresses = addresses[:i] + addresses[(i+1):]
|
||||
|
||||
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False)])
|
||||
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False), clearHLAddrList])
|
||||
nb = runNanoBench(code=ec.code, oneTimeInit=ec.oneTimeInit)
|
||||
|
||||
if nb['L3_HIT'] < len(tmpAddresses) - 0.9:
|
||||
@@ -490,16 +498,20 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
|
||||
return addresses
|
||||
|
||||
|
||||
def findCongruentL3Addresses(n, L3EvictionSet):
|
||||
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=100,
|
||||
warmUpCount=None, initialWarmUpCount=None, aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
|
||||
def findCongruentL3Addresses(n, cacheSet, cBox, L3EvictionSet):
|
||||
clearHLAddrList = AddressList(getClearHLAddresses(3, [cacheSet], cBox), True, False)
|
||||
|
||||
congrAddresses = []
|
||||
L3WaySize = getCacheInfo(3).waySize
|
||||
|
||||
for newAddr in count(max(L3EvictionSet)+L3WaySize, L3WaySize):
|
||||
tmpAddresses = L3EvictionSet[:getCacheInfo(3).assoc] + [newAddr]
|
||||
if not getCBoxOfAddress(newAddr) == cBox: continue
|
||||
|
||||
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False)])
|
||||
tmpAddresses = L3EvictionSet[:getCacheInfo(3).assoc] + [newAddr]
|
||||
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False), clearHLAddrList])
|
||||
|
||||
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=100,
|
||||
aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
|
||||
nb = runNanoBench(code=ec.code, oneTimeInit=ec.oneTimeInit)
|
||||
|
||||
if nb['L3_HIT'] < len(tmpAddresses) - 0.9:
|
||||
@@ -511,9 +523,17 @@ def findCongruentL3Addresses(n, L3EvictionSet):
|
||||
|
||||
|
||||
def findMaximalNonEvictingL3SetInCBox(start, stride, L3Assoc, cBox):
|
||||
curAddress = start
|
||||
clearHLAddresses = []
|
||||
addresses = []
|
||||
|
||||
curAddress = start
|
||||
while len(clearHLAddresses) < 2*(getCacheInfo(1).assoc+getCacheInfo(2).assoc):
|
||||
if getCBoxOfAddress(curAddress) != cBox:
|
||||
clearHLAddresses.append(curAddress)
|
||||
curAddress += stride
|
||||
clearHLAddrList = AddressList(clearHLAddresses, True, False)
|
||||
|
||||
curAddress = start
|
||||
while len(addresses) < L3Assoc:
|
||||
if getCBoxOfAddress(curAddress) == cBox:
|
||||
addresses.append(curAddress)
|
||||
@@ -527,7 +547,7 @@ def findMaximalNonEvictingL3SetInCBox(start, stride, L3Assoc, cBox):
|
||||
continue
|
||||
|
||||
newAddresses = addresses + [curAddress]
|
||||
ec = getCodeForAddressLists([AddressList(newAddresses,False,False)])
|
||||
ec = getCodeForAddressLists([AddressList(newAddresses,False,False), clearHLAddrList])
|
||||
|
||||
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig='', nMeasurements=10, unrollCount=1, loopCount=10,
|
||||
aggregateFunction='med', basicMode=True, noMem=True)
|
||||
|
||||
@@ -109,6 +109,28 @@ AllRandPLRUVariants = {
|
||||
'PLRURand': PLRURandSim,
|
||||
}
|
||||
|
||||
class LRU_PLRU4Sim(ReplPolicySim):
    """Two-level replacement policy simulator: LRU over groups of 4-way PLRU.

    The cache set is modelled as ``assoc // 4`` independent ``PLRUSim(4)``
    groups. The groups themselves are kept in recency order (MRU first); an
    access is forwarded to exactly one group, which then becomes the MRU group.
    """

    def __init__(self, assoc):
        """Create ``assoc // 4`` 4-way PLRU groups.

        NOTE(review): the base-class ``__init__`` is not invoked here (unlike
        QLRUSim); confirm that no ReplPolicySim state is required.
        """
        # Floor division keeps the group count an int under both Python 2 and
        # Python 3 (plain ``/`` would hand ``range`` a float on Python 3).
        self.PLRUs = [PLRUSim(4, linearInit=True) for _ in range(0, assoc // 4)]
        self.PLRUOrdered = list(self.PLRUs)  # from MRU to LRU

    def acc(self, block):
        """Access ``block`` and return True if one of the groups already held it."""
        hit = False
        # Default target: the least recently used group.
        curPLRU = self.PLRUOrdered[-1]
        for plru in self.PLRUs:
            if block in plru.blocks:
                curPLRU = plru
                hit = True
                break
        else:
            # Miss in every group: prefer the first group whose ``blocks``
            # still contains ``None`` (presumably an unused way) over the
            # LRU group chosen above.
            for plru in self.PLRUs:
                if None in plru.blocks:
                    curPLRU = plru
                    break
        curPLRU.acc(block)
        # Move the accessed group to the MRU position, keeping the others in order.
        self.PLRUOrdered = [curPLRU] + [plru for plru in self.PLRUOrdered if plru != curPLRU]
        return hit
|
||||
|
||||
class QLRUSim(ReplPolicySim):
|
||||
def __init__(self, assoc, hitFunc, missFunc, replIdxFunc, updFunc, updOnMissOnly=False):
|
||||
super(QLRUSim, self).__init__(assoc)
|
||||
@@ -259,6 +281,7 @@ CommonPolicies = {
|
||||
'LRU': LRUSim,
|
||||
'PLRU': PLRUSim,
|
||||
'PLRUl': PLRUlSim,
|
||||
'LRU_PLRU4': LRU_PLRU4Sim,
|
||||
'MRU': MRUSim, # NHM
|
||||
'MRU_N': MRUNSim, # SNB
|
||||
'NRU': NRUSim,
|
||||
@@ -297,7 +320,7 @@ def getAges(blocks, seq, policySimClass, assoc):
|
||||
for block in blocks:
|
||||
for i in count(0):
|
||||
curSeq = seq + ' ' + ' '.join('N' + str(n) for n in range(0,i)) + ' ' + block + '?'
|
||||
if getHits(policySimClass(assoc), curSeq) == 0:
|
||||
if getHits(curSeq, policySimClass, assoc, 1) == 0:
|
||||
ages[block] = i
|
||||
break
|
||||
return ages
|
||||
@@ -321,7 +344,7 @@ def getGraph(blocks, seq, policySimClass, assoc, maxAge, nSets=1, nRep=1, agg="m
|
||||
return traces
|
||||
|
||||
|
||||
def getPermutations(policySimClass, assoc, maxAge=None):
|
||||
def getPermutations(policySimClass, assoc):
|
||||
# initial ages
|
||||
initBlocks = ['I' + str(i) for i in range(0, assoc)]
|
||||
seq = ' '.join(initBlocks)
|
||||
|
||||
@@ -14,6 +14,7 @@ import plotly.graph_objects as go
|
||||
|
||||
from cacheLib import *
|
||||
from cacheGraph import *
|
||||
import cacheSim
|
||||
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -83,21 +84,27 @@ def main():
|
||||
parser.add_argument("-noInit", help="Do not fill sets with associativity many elements first", action='store_true')
|
||||
parser.add_argument("-maxAge", help="Maximum age", type=int)
|
||||
parser.add_argument("-cBox", help="cBox (default: 1)", type=int, default=1)
|
||||
parser.add_argument("-sim", help="Simulate the given policy instead of running the experiment on the hardware")
|
||||
parser.add_argument("-simAssoc", help="Associativity of the simulated cache (default: 8)", type=int, default=8)
|
||||
parser.add_argument("-logLevel", help="Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", default='WARNING')
|
||||
parser.add_argument("-output", help="Output file name", default='permPolicy.html')
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, format='%(message)s', level=logging.getLevelName(args.logLevel))
|
||||
|
||||
title = cpuid.cpu_name(cpuid.CPUID()) + ', Level: ' + str(args.level)
|
||||
if not args.sim:
|
||||
title = cpuid.cpu_name(cpuid.CPUID()) + ', Level: ' + str(args.level)
|
||||
|
||||
html = ['<html>', '<head>', '<title>' + title + '</title>', '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>', '</head>', '<body>']
|
||||
html += ['<h3>' + title + '</h3>']
|
||||
getPermutations(args.level, html, cacheSets=args.sets, getInitialAges=(not args.noInit), maxAge=args.maxAge, cBox=args.cBox)
|
||||
html += ['</body>', '</html>']
|
||||
html = ['<html>', '<head>', '<title>' + title + '</title>', '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>', '</head>', '<body>']
|
||||
html += ['<h3>' + title + '</h3>']
|
||||
getPermutations(args.level, html, cacheSets=args.sets, getInitialAges=(not args.noInit), maxAge=args.maxAge, cBox=args.cBox)
|
||||
html += ['</body>', '</html>']
|
||||
|
||||
with open(args.output ,'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
with open(args.output ,'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
else:
|
||||
policyClass = cacheSim.AllPolicies[args.sim]
|
||||
cacheSim.getPermutations(policyClass, args.simAssoc)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -64,6 +64,7 @@ def main():
|
||||
parser.add_argument("-nMeasurements", help="Number of measurements", type=int, default=3)
|
||||
parser.add_argument("-findCtrEx", help="Tries to find a small counterexample for each policy (only available for deterministic policies)", action='store_true')
|
||||
parser.add_argument("-policies", help="Comma-separated list of policies to consider (Default: all deterministic policies)")
|
||||
parser.add_argument("-best", help="Find the best matching policy (Default: abort if no policy agrees with all results)", action='store_true')
|
||||
parser.add_argument("-randPolicies", help="Test randomized policies", action='store_true')
|
||||
parser.add_argument("-allQLRUVariants", help="Test all QLRU variants", action='store_true')
|
||||
parser.add_argument("-assoc", help="Override the associativity", type=int)
|
||||
@@ -126,6 +127,7 @@ def main():
|
||||
|
||||
if sim != actual:
|
||||
possiblePolicies.discard(p)
|
||||
dists[p] += 1
|
||||
color = 'red'
|
||||
if args.findCtrEx and not p in counterExamples:
|
||||
counterExamples[p] = findSmallCounterexample(p, ((args.initSeq + ' ') if args.initSeq else ''), args.level, args.sets, cBox, assoc, seq,
|
||||
@@ -145,7 +147,7 @@ def main():
|
||||
|
||||
html += ['</tr>']
|
||||
|
||||
if not args.randPolicies:
|
||||
if not args.randPolicies and not args.best:
|
||||
print 'Possible policies: ' + ', '.join(possiblePolicies)
|
||||
if not possiblePolicies: break
|
||||
|
||||
@@ -160,7 +162,7 @@ def main():
|
||||
with open(args.output ,'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
|
||||
if not args.randPolicies:
|
||||
if not args.randPolicies and not args.best:
|
||||
print 'Possible policies: ' + ', '.join(possiblePolicies)
|
||||
else:
|
||||
for p, d in reversed(sorted(dists.items(), key=lambda d: d[1])):
|
||||
|
||||
Reference in New Issue
Block a user