Ice Lake support

This commit is contained in:
Andreas Abel
2019-11-08 15:57:52 +01:00
parent 824e7d2a02
commit 6e63f0404b
8 changed files with 354 additions and 45 deletions

232
configs/cfg_IceLake_all.txt Normal file
View File

@@ -0,0 +1,232 @@
# Performance monitoring events for processors based on the Ice Lake microarchitecture.
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
00.01 INST_RETIRED.ANY
00.01 INST_RETIRED.PREC_DIST
00.02 CPU_CLK_UNHALTED.THREAD
00.03 CPU_CLK_UNHALTED.REF_TSC
00.04 TOPDOWN.SLOTS
03.02 LD_BLOCKS.STORE_FORWARD
03.08 LD_BLOCKS.NO_SR
07.01 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
08.0E DTLB_LOAD_MISSES.WALK_COMPLETED
08.10 DTLB_LOAD_MISSES.WALK_PENDING
08.10 DTLB_LOAD_MISSES.WALK_ACTIVE
08.20 DTLB_LOAD_MISSES.STLB_HIT
0D.01 INT_MISC.RECOVERY_CYCLES
0D.03 INT_MISC.ALL_RECOVERY_CYCLES
0D.80 INT_MISC.CLEAR_RESTEER_CYCLES
0E.01 UOPS_ISSUED.ANY
0E.01 UOPS_ISSUED.STALL_CYCLES
14.09 ARITH.DIVIDER_ACTIVE
24.21 L2_RQSTS.DEMAND_DATA_RD_MISS
24.22 L2_RQSTS.RFO_MISS
24.24 L2_RQSTS.CODE_RD_MISS
24.27 L2_RQSTS.ALL_DEMAND_MISS
24.28 L2_RQSTS.SWPF_MISS
24.C1 L2_RQSTS.DEMAND_DATA_RD_HIT
24.C2 L2_RQSTS.RFO_HIT
24.C4 L2_RQSTS.CODE_RD_HIT
24.C8 L2_RQSTS.SWPF_HIT
24.E1 L2_RQSTS.ALL_DEMAND_DATA_RD
24.E2 L2_RQSTS.ALL_RFO
24.E4 L2_RQSTS.ALL_CODE_RD
24.E7 L2_RQSTS.ALL_DEMAND_REFERENCES
28.07 CORE_POWER.LVL0_TURBO_LICENSE
28.18 CORE_POWER.LVL1_TURBO_LICENSE
28.20 CORE_POWER.LVL2_TURBO_LICENSE
32.01 SW_PREFETCH_ACCESS.NTA
32.02 SW_PREFETCH_ACCESS.T0
32.04 SW_PREFETCH_ACCESS.T1_T2
32.08 SW_PREFETCH_ACCESS.PREFETCHW
3C.00 CPU_CLK_UNHALTED.THREAD_P
3C.01 CPU_CLK_UNHALTED.REF_XCLK
3C.02 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
48.01 L1D_PEND_MISS.PENDING
48.01 L1D_PEND_MISS.PENDING_CYCLES
48.02 L1D_PEND_MISS.FB_FULL
48.02 L1D_PEND_MISS.FB_FULL_PERIODS
48.04 L1D_PEND_MISS.L2_STALL
49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
49.0E DTLB_STORE_MISSES.WALK_COMPLETED
49.10 DTLB_STORE_MISSES.WALK_PENDING
49.10 DTLB_STORE_MISSES.WALK_ACTIVE
49.20 DTLB_STORE_MISSES.STLB_HIT
4C.01 LOAD_HIT_PREFETCH.SWPF
51.01 L1D.REPLACEMENT
54.01 TX_MEM.ABORT_CONFLICT
54.02 TX_MEM.ABORT_CAPACITY_WRITE
54.04 TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK
54.08 TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY
54.10 TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH
54.20 TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGN
54.40 TX_MEM.HLE_ELISION_BUFFER_FULL
5D.02 TX_EXEC.MISC2
5D.04 TX_EXEC.MISC3
5E.01 RS_EVENTS.EMPTY_CYCLES
5E.01 RS_EVENTS.EMPTY_END
60.04 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO
60.08 OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD
60.08 OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD
79.04 IDQ.MITE_UOPS
79.04 IDQ.MITE_CYCLES_OK
79.04 IDQ.MITE_CYCLES_ANY
79.08 IDQ.DSB_UOPS
79.08 IDQ.DSB_CYCLES_OK
79.08 IDQ.DSB_CYCLES_ANY
79.30 IDQ.MS_SWITCHES
79.30 IDQ.MS_UOPS
79.30 IDQ.MS_CYCLES_ANY
80.04 ICACHE_16B.IFDATA_STALL
83.01 ICACHE_64B.IFTAG_HIT
83.02 ICACHE_64B.IFTAG_MISS
83.04 ICACHE_64B.IFTAG_STALL
85.02 ITLB_MISSES.WALK_COMPLETED_4K
85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
85.0E ITLB_MISSES.WALK_COMPLETED
85.10 ITLB_MISSES.WALK_PENDING
85.10 ITLB_MISSES.WALK_ACTIVE
85.20 ITLB_MISSES.STLB_HIT
87.01 ILD_STALL.LCP
9C.01 IDQ_UOPS_NOT_DELIVERED.CORE
9C.01 IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE
9C.01 IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK
A1.01 UOPS_DISPATCHED.PORT_0
A1.02 UOPS_DISPATCHED.PORT_1
A1.04 UOPS_DISPATCHED.PORT_2_3
A1.10 UOPS_DISPATCHED.PORT_4_9
A1.20 UOPS_DISPATCHED.PORT_5
A1.40 UOPS_DISPATCHED.PORT_6
A1.80 UOPS_DISPATCHED.PORT_7_8
A2.02 RESOURCE_STALLS.SCOREBOARD
A2.08 RESOURCE_STALLS.SB
A3.01 CYCLE_ACTIVITY.CYCLES_L2_MISS
A3.02 CYCLE_ACTIVITY.CYCLES_L3_MISS
A3.04 CYCLE_ACTIVITY.STALLS_TOTAL
A3.05 CYCLE_ACTIVITY.STALLS_L2_MISS
A3.06 CYCLE_ACTIVITY.STALLS_L3_MISS
A3.08 CYCLE_ACTIVITY.CYCLES_L1D_MISS
A3.0C CYCLE_ACTIVITY.STALLS_L1D_MISS
A3.10 CYCLE_ACTIVITY.CYCLES_MEM_ANY
A3.14 CYCLE_ACTIVITY.STALLS_MEM_ANY
A4.01 TOPDOWN.SLOTS_P
A4.02 TOPDOWN.BACKEND_BOUND_SLOTS
A6.02 EXE_ACTIVITY.1_PORTS_UTIL
A6.04 EXE_ACTIVITY.2_PORTS_UTIL
A6.40 EXE_ACTIVITY.BOUND_ON_STORES
A6.80 EXE_ACTIVITY.EXE_BOUND_0_PORTS
A8.01 LSD.UOPS
A8.01 LSD.CYCLES_ACTIVE
A8.01 LSD.CYCLES_OK
AB.02 DSB2MITE_SWITCHES.PENALTY_CYCLES
AE.01 ITLB.ITLB_FLUSH
B0.01 OFFCORE_REQUESTS.DEMAND_DATA_RD
B0.04 OFFCORE_REQUESTS.DEMAND_RFO
B0.08 OFFCORE_REQUESTS.ALL_DATA_RD
B0.10 OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
B0.80 OFFCORE_REQUESTS.ALL_REQUESTS
B1.01 UOPS_EXECUTED.THREAD
B1.01 UOPS_EXECUTED.STALL_CYCLES
B1.01 UOPS_EXECUTED.CYCLES_GE_1
B1.01 UOPS_EXECUTED.CYCLES_GE_2
B1.01 UOPS_EXECUTED.CYCLES_GE_3
B1.01 UOPS_EXECUTED.CYCLES_GE_4
B1.02 UOPS_EXECUTED.CORE
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_1
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_2
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_3
B1.02 UOPS_EXECUTED.CORE_CYCLES_GE_4
B1.10 UOPS_EXECUTED.X87
BD.01 TLB_FLUSH.DTLB_THREAD
BD.20 TLB_FLUSH.STLB_ANY
C0.00 INST_RETIRED.ANY_P
C1.02 ASSISTS.FP
C1.07 ASSISTS.ANY
C2.02 UOPS_RETIRED.TOTAL_CYCLES
C2.02 UOPS_RETIRED.SLOTS
C3.01 MACHINE_CLEARS.COUNT
C3.02 MACHINE_CLEARS.MEMORY_ORDERING
C3.04 MACHINE_CLEARS.SMC
C4.00 BR_INST_RETIRED.ALL_BRANCHES
C4.01 BR_INST_RETIRED.COND_TAKEN
C4.02 BR_INST_RETIRED.NEAR_CALL
C4.08 BR_INST_RETIRED.NEAR_RETURN
C4.10 BR_INST_RETIRED.COND_NTAKEN
C4.11 BR_INST_RETIRED.COND
C4.20 BR_INST_RETIRED.NEAR_TAKEN
C4.40 BR_INST_RETIRED.FAR_BRANCH
C4.80 BR_INST_RETIRED.INDIRECT
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
C5.01 BR_MISP_RETIRED.COND_TAKEN
C5.11 BR_MISP_RETIRED.COND
C5.20 BR_MISP_RETIRED.NEAR_TAKEN
C5.80 BR_MISP_RETIRED.INDIRECT
C6.01 FRONTEND_RETIRED.DSB_MISS
C6.01 FRONTEND_RETIRED.L1I_MISS
C6.01 FRONTEND_RETIRED.L2_MISS
C6.01 FRONTEND_RETIRED.ITLB_MISS
C6.01 FRONTEND_RETIRED.STLB_MISS
C6.01 FRONTEND_RETIRED.LATENCY_GE_2
C6.01 FRONTEND_RETIRED.LATENCY_GE_4
C6.01 FRONTEND_RETIRED.LATENCY_GE_8
C6.01 FRONTEND_RETIRED.LATENCY_GE_16
C6.01 FRONTEND_RETIRED.LATENCY_GE_32
C6.01 FRONTEND_RETIRED.LATENCY_GE_64
C6.01 FRONTEND_RETIRED.LATENCY_GE_128
C6.01 FRONTEND_RETIRED.LATENCY_GE_256
C6.01 FRONTEND_RETIRED.LATENCY_GE_512
C6.01 FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1
C7.01 FP_ARITH_INST_RETIRED.SCALAR_DOUBLE
C7.02 FP_ARITH_INST_RETIRED.SCALAR_SINGLE
C7.04 FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE
C7.08 FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE
C7.10 FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE
C7.20 FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE
C7.40 FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE
C7.80 FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE
C8.01 HLE_RETIRED.START
C8.02 HLE_RETIRED.COMMIT
C8.04 HLE_RETIRED.ABORTED
C8.08 HLE_RETIRED.ABORTED_MEM
C8.20 HLE_RETIRED.ABORTED_UNFRIENDLY
C8.80 HLE_RETIRED.ABORTED_EVENTS
C9.01 RTM_RETIRED.START
C9.02 RTM_RETIRED.COMMIT
C9.04 RTM_RETIRED.ABORTED
C9.08 RTM_RETIRED.ABORTED_MEM
C9.20 RTM_RETIRED.ABORTED_UNFRIENDLY
C9.40 RTM_RETIRED.ABORTED_MEMTYPE
C9.80 RTM_RETIRED.ABORTED_EVENTS
CC.20 MISC_RETIRED.LBR_INSERTS
CC.40 MISC_RETIRED.PAUSE_INST
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256
CD.01 MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512
D0.12 MEM_INST_RETIRED.STLB_MISS_STORES
D0.41 MEM_INST_RETIRED.SPLIT_LOADS
D0.81 MEM_INST_RETIRED.ALL_LOADS
D0.82 MEM_INST_RETIRED.ALL_STORES
D1.01 MEM_LOAD_RETIRED.L1_HIT
D1.02 MEM_LOAD_RETIRED.L2_HIT
D1.04 MEM_LOAD_RETIRED.L3_HIT
D1.08 MEM_LOAD_RETIRED.L1_MISS
D1.10 MEM_LOAD_RETIRED.L2_MISS
D1.20 MEM_LOAD_RETIRED.L3_MISS
D1.40 MEM_LOAD_RETIRED.FB_HIT
D2.01 MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS
D2.02 MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT
D2.04 MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM
D2.08 MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE
E6.01 BACLEARS.ANY
EC.02 CPU_CLK_UNHALTED.DISTRIBUTED
F1.1F L2_LINES_IN.ALL
F4.04 SQ_MISC.SQ_FULL

View File

@@ -0,0 +1,21 @@
# Performance monitoring events for processors based on the Ice Lake microarchitecture.
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
0E.01 UOPS_ISSUED.ANY
B1.01 UOPS_EXECUTED.THREAD
A1.01 UOPS_DISPATCHED.PORT_0
A1.02 UOPS_DISPATCHED.PORT_1
A1.04 UOPS_DISPATCHED.PORT_2_3
A1.10 UOPS_DISPATCHED.PORT_4_9
A1.20 UOPS_DISPATCHED.PORT_5
A1.40 UOPS_DISPATCHED.PORT_6
A1.80 UOPS_DISPATCHED.PORT_7_8
C4.00 BR_INST_RETIRED.ALL_BRANCHES
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
D1.01 MEM_LOAD_RETIRED.L1_HIT
D1.08 MEM_LOAD_RETIRED.L1_MISS
D1.02 MEM_LOAD_RETIRED.L2_HIT
D1.10 MEM_LOAD_RETIRED.L2_MISS
D1.04 MEM_LOAD_RETIRED.L3_HIT
D1.20 MEM_LOAD_RETIRED.L3_MISS

View File

@@ -168,12 +168,14 @@ VersionInfo = collections.namedtuple('VersionInfo', 'displ_family displ_model st
def version_info(cpu):
a, _, _, _ = cpu(0x01)
displ_family = (a >> 8) & 0xF
if (displ_family == 0x0F):
family_ID = (a >> 8) & 0xF
displ_family = family_ID
if (family_ID == 0x0F):
displ_family += (a >> 20) & 0xFF
displ_model = (a >> 4) & 0xF
if (displ_family == 0x06 or displ_family == 0x0F):
if (family_ID == 0x06 or family_ID == 0x0F):
displ_model += (a >> 12) & 0xF0
stepping = a & 0xF
@@ -211,6 +213,8 @@ def micro_arch(cpu):
return 'CFL'
if (vi.displ_family, vi.displ_model) in [(0x06, 0x66)]:
return 'CNL'
if (vi.displ_family, vi.displ_model) in [(0x06, 0x7D), (0x06, 0x7E)]:
return 'ICL'
if (vi.displ_family, vi.displ_model) in [(0x17, 0x01), (0x17, 0x11)]:
return 'ZEN'
if (vi.displ_family, vi.displ_model) in [(0x17, 0x08), (0x17, 0x18)]:
@@ -491,7 +495,11 @@ def get_cache_info(cpu):
L3Size = int(get_bits(d, 18, 31)*512)
L3Assoc = 0
d_15_12 = get_bits(d, 12, 15)
if d_15_12 == 0x8: L3Assoc = 16
if d_15_12 == 0x1: L3Assoc = 1
elif d_15_12 == 0x2: L3Assoc = 2
elif d_15_12 == 0x4: L3Assoc = 4
elif d_15_12 == 0x6: L3Assoc = 8
elif d_15_12 == 0x8: L3Assoc = 16
elif d_15_12 == 0xA: L3Assoc = 32
elif d_15_12 == 0xB: L3Assoc = 48
elif d_15_12 == 0xC: L3Assoc = 64

View File

@@ -81,13 +81,9 @@ If the replacement policy is a permutation policy (see [Measurement-based Modeli
Generates graphs that show the number of core cycles and the number of hits/misses (per access) when accessing memory areas of different sizes repeatedly using a given stride (which can be specified with the `-stride` option). An example can be seen [here](https://uops.info/cache/lat_CFL.html).
## cpuid.py
Obtains cache and TLB information using the `CPUID` instruction.
## cacheInfo.py
Combines information from `cpuid.py` with information on the number of slices of the L3 cache that is obtained through measurements.
Combines information obtained by executing the `CPUID` instruction with information on the number of slices of the L3 cache that is obtained through measurements.
## setDueling.py

View File

@@ -3,15 +3,17 @@ from itertools import count
from collections import namedtuple
import math
import random
import re
import subprocess
import sys
import cpuid
sys.path.append('../..')
from kernelNanoBench import *
sys.path.append('../CPUID')
import cpuid
import logging
log = logging.getLogger(__name__)
@@ -21,26 +23,26 @@ def getEventConfig(event):
if event == 'L1_HIT':
if arch in ['Core', 'EnhancedCore']: return '40.0E ' + event # L1D_CACHE_LD.MES
if arch in ['NHM', 'WSM']: return 'CB.01 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.01 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.01 ' + event
if event == 'L1_MISS':
if arch in ['Core', 'EnhancedCore']: return 'CB.01.CTR=0 ' + event
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.08 ' + event
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.08 ' + event
if arch in ['ZEN+']: return '064.70 ' + event
if event == 'L2_HIT':
if arch in ['Core', 'EnhancedCore']: return '29.7E ' + event # L2_LD.THIS_CORE.ALL_INCL.MES
if arch in ['NHM', 'WSM']: return 'CB.02 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.02 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.02 ' + event
if arch in ['ZEN+']: return '064.70 ' + event
if event == 'L2_MISS':
if arch in ['Core', 'EnhancedCore']: return 'CB.04.CTR=0 ' + event
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.10 ' + event
if arch in ['IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.10 ' + event
if arch in ['ZEN+']: return '064.08 ' + event
if event == 'L3_HIT':
if arch in ['NHM', 'WSM']: return 'CB.04 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.04 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.04 ' + event
if event == 'L3_MISS':
if arch in ['NHM', 'WSM']: return 'CB.10 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'D1.20 ' + event
if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'D1.20 ' + event
return ''
def getDefaultCacheConfig():
@@ -49,7 +51,7 @@ def getDefaultCacheConfig():
def getDefaultCacheMSRConfig():
if 'Intel' in getCPUVendor() and 'L3' in getCpuidCacheInfo() and getCpuidCacheInfo()['L3']['complex']:
if getArch() in ['CNL']:
if getArch() in ['CNL', 'ICL']:
dist = 8
ctrOffset = 2
else:
@@ -149,7 +151,7 @@ def getNCBoxUnits():
try:
subprocess.check_output(['modprobe', 'msr'])
cbo_config = subprocess.check_output(['rdmsr', '0x396'])
if getArch() in ['CNL']:
if getArch() in ['CNL', 'ICL']:
getNCBoxUnits.nCBoxUnits = int(cbo_config)
else:
getNCBoxUnits.nCBoxUnits = int(cbo_config) - 1
@@ -207,6 +209,10 @@ def getPointerChasingInit(addresses):
if tuple(addresses) in pointerChasingInits:
return pointerChasingInits[tuple(addresses)]
#addresses_tail = addresses[1:]
#random.shuffle(addresses_tail)
#addresses = [addresses[0]] + addresses_tail
init = 'lea RAX, [R14+' + str(addresses[0]) + ']; '
init += 'mov RBX, RAX; '
@@ -235,7 +241,7 @@ def getPointerChasingInit(addresses):
ExperimentCode = namedtuple('ExperimentCode', 'code init oneTimeInit')
def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False, afterEveryAcc=''):
distinctAddrLists = set(tuple(l.addresses) for l in initAddressLists+codeAddressLists)
if len(distinctAddrLists) > 1 and set.intersection(*list(set(l) for l in distinctAddrLists)):
raise ValueError('same address in different lists')
@@ -272,7 +278,7 @@ def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
if addressList.flush:
for address in addresses:
codeList.append('clflush [R14 + ' + str(address) + ']; ')
codeList.append('clflush [R14 + ' + str(address) + ']; ' + afterEveryAcc)
else:
if len(addresses) == 1:
codeList.append('mov RCX, [R14 + ' + str(addresses[0]) + ']; ')
@@ -281,7 +287,7 @@ def getCodeForAddressLists(codeAddressLists, initAddressLists=[], wbinvd=False):
oneTimeInit.append(getPointerChasingInit(addresses))
alreadyAddedOneTimeInits.add(tuple(addresses))
codeList.append('lea RCX, [R14+' + str(addresses[0]) + ']; 1: mov RCX, [RCX]; jrcxz 2f; jmp 1b; 2: ')
codeList.append('lea RCX, [R14+' + str(addresses[0]) + ']; 1: mov RCX, [RCX]; ' + afterEveryAcc + 'jrcxz 2f; jmp 1b; 2: ')
if not isInit and not pfcEnabled:
codeList.append(PFC_START_ASM + '; ')
@@ -361,7 +367,7 @@ def getAddresses(level, wayID, cacheSetList, cBox=1, clearHL=True):
if getCacheInfo(3).nSlices == getNCBoxUnits():
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(getNewAddressesInCBox(1, cBox, L3Set, L3SetToWayIDMap[cBox][L3Set].values())))
else:
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(findCongruentL3Addresses(1, L3SetToWayIDMap[cBox][L3Set].values())))
L3SetToWayIDMap[cBox][L3Set][wayID] = next(iter(findCongruentL3Addresses(1, L3Set, cBox, L3SetToWayIDMap[cBox][L3Set].values())))
addresses.append(L3SetToWayIDMap[cBox][L3Set][wayID])
return addresses
@@ -447,14 +453,16 @@ def printNB(nb_result):
def findMinimalL3EvictionSet(cacheSet, cBox):
setNanoBenchParameters(config='\n'.join([getEventConfig('L3_HIT'), getEventConfig('L3_MISS')]), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=10,
warmUpCount=None, initialWarmUpCount=None, aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
if not hasattr(findMinimalL3EvictionSet, 'evSetForCacheSet'):
findMinimalL3EvictionSet.evSetForCacheSet = dict()
evSetForCacheSet = findMinimalL3EvictionSet.evSetForCacheSet
if not cBox in findMinimalL3EvictionSet.evSetForCacheSet:
findMinimalL3EvictionSet.evSetForCacheSet[cBox] = dict()
evSetForCacheSet = findMinimalL3EvictionSet.evSetForCacheSet[cBox]
if cacheSet in evSetForCacheSet:
return evSetForCacheSet[cacheSet]
clearHLAddrList = AddressList(getClearHLAddresses(3, [cacheSet], cBox), True, False)
addresses = []
curAddress = cacheSet*getCacheInfo(3).lineSize
@@ -468,7 +476,7 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
if not getCBoxOfAddress(curAddress) == cBox: continue
addresses += [curAddress]
ec = getCodeForAddressLists([AddressList(addresses,False,False)])
ec = getCodeForAddressLists([AddressList(addresses,False,False), clearHLAddrList])
setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig='', nMeasurements=10, unrollCount=1, loopCount=100,
aggregateFunction='med', basicMode=True, noMem=True)
@@ -480,7 +488,7 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
for i in reversed(range(0, len(addresses))):
tmpAddresses = addresses[:i] + addresses[(i+1):]
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False)])
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False), clearHLAddrList])
nb = runNanoBench(code=ec.code, oneTimeInit=ec.oneTimeInit)
if nb['L3_HIT'] < len(tmpAddresses) - 0.9:
@@ -490,16 +498,20 @@ def findMinimalL3EvictionSet(cacheSet, cBox):
return addresses
def findCongruentL3Addresses(n, L3EvictionSet):
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=100,
warmUpCount=None, initialWarmUpCount=None, aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
def findCongruentL3Addresses(n, cacheSet, cBox, L3EvictionSet):
clearHLAddrList = AddressList(getClearHLAddresses(3, [cacheSet], cBox), True, False)
congrAddresses = []
L3WaySize = getCacheInfo(3).waySize
for newAddr in count(max(L3EvictionSet)+L3WaySize, L3WaySize):
tmpAddresses = L3EvictionSet[:getCacheInfo(3).assoc] + [newAddr]
if not getCBoxOfAddress(newAddr) == cBox: continue
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False)])
tmpAddresses = L3EvictionSet[:getCacheInfo(3).assoc] + [newAddr]
ec = getCodeForAddressLists([AddressList(tmpAddresses,False,False), clearHLAddrList])
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig=None, nMeasurements=10, unrollCount=1, loopCount=100,
aggregateFunction='med', basicMode=True, noMem=True, verbose=None)
nb = runNanoBench(code=ec.code, oneTimeInit=ec.oneTimeInit)
if nb['L3_HIT'] < len(tmpAddresses) - 0.9:
@@ -511,9 +523,17 @@ def findCongruentL3Addresses(n, L3EvictionSet):
def findMaximalNonEvictingL3SetInCBox(start, stride, L3Assoc, cBox):
curAddress = start
clearHLAddresses = []
addresses = []
curAddress = start
while len(clearHLAddresses) < 2*(getCacheInfo(1).assoc+getCacheInfo(2).assoc):
if getCBoxOfAddress(curAddress) != cBox:
clearHLAddresses.append(curAddress)
curAddress += stride
clearHLAddrList = AddressList(clearHLAddresses, True, False)
curAddress = start
while len(addresses) < L3Assoc:
if getCBoxOfAddress(curAddress) == cBox:
addresses.append(curAddress)
@@ -527,7 +547,7 @@ def findMaximalNonEvictingL3SetInCBox(start, stride, L3Assoc, cBox):
continue
newAddresses = addresses + [curAddress]
ec = getCodeForAddressLists([AddressList(newAddresses,False,False)])
ec = getCodeForAddressLists([AddressList(newAddresses,False,False), clearHLAddrList])
setNanoBenchParameters(config=getEventConfig('L3_HIT'), msrConfig='', nMeasurements=10, unrollCount=1, loopCount=10,
aggregateFunction='med', basicMode=True, noMem=True)

View File

@@ -109,6 +109,28 @@ AllRandPLRUVariants = {
'PLRURand': PLRURandSim,
}
class LRU_PLRU4Sim(ReplPolicySim):
    """Replacement-policy simulator built from several 4-way PLRU groups.

    The cache set is modelled as ``assoc // 4`` independent ``PLRUSim(4)``
    instances. In addition, the groups themselves are kept in a recency
    list (``PLRUOrdered``, most recently used group first), so that a miss
    that cannot be placed in a group with a free slot is handled by the
    least recently used group.
    """

    def __init__(self, assoc):
        """Create ``assoc // 4`` four-way PLRU groups.

        assoc -- associativity of the simulated set; any remainder of the
                 division by 4 is dropped.
        """
        # Floor division keeps the Python 2 result and also yields an int on
        # Python 3, where ``assoc/4`` would be a float and break range().
        self.PLRUs = [PLRUSim(4, linearInit=True) for _ in range(0, assoc // 4)]
        self.PLRUOrdered = list(self.PLRUs)  # from MRU to LRU

    def acc(self, block):
        """Simulate an access to ``block``; return True on a hit, else False."""
        hit = False
        # Fallback target: the least recently used group.
        curPLRU = self.PLRUOrdered[-1]
        for plru in self.PLRUs:
            if block in plru.blocks:
                curPLRU = plru
                hit = True
                break
        else:
            # Only reached when the loop above did not break, i.e. on a miss:
            # prefer the first group that still has a None entry in its blocks
            # (presumably an unused slot -- confirm against PLRUSim).
            for plru in self.PLRUs:
                if None in plru.blocks:
                    curPLRU = plru
                    break
        # Let the chosen group handle the access, then move it to the MRU
        # position of the group-level recency list.
        curPLRU.acc(block)
        self.PLRUOrdered = [curPLRU] + [plru for plru in self.PLRUOrdered if plru != curPLRU]
        return hit
class QLRUSim(ReplPolicySim):
def __init__(self, assoc, hitFunc, missFunc, replIdxFunc, updFunc, updOnMissOnly=False):
super(QLRUSim, self).__init__(assoc)
@@ -259,6 +281,7 @@ CommonPolicies = {
'LRU': LRUSim,
'PLRU': PLRUSim,
'PLRUl': PLRUlSim,
'LRU_PLRU4': LRU_PLRU4Sim,
'MRU': MRUSim, # NHM
'MRU_N': MRUNSim, # SNB
'NRU': NRUSim,
@@ -297,7 +320,7 @@ def getAges(blocks, seq, policySimClass, assoc):
for block in blocks:
for i in count(0):
curSeq = seq + ' ' + ' '.join('N' + str(n) for n in range(0,i)) + ' ' + block + '?'
if getHits(policySimClass(assoc), curSeq) == 0:
if getHits(curSeq, policySimClass, assoc, 1) == 0:
ages[block] = i
break
return ages
@@ -321,7 +344,7 @@ def getGraph(blocks, seq, policySimClass, assoc, maxAge, nSets=1, nRep=1, agg="m
return traces
def getPermutations(policySimClass, assoc, maxAge=None):
def getPermutations(policySimClass, assoc):
# initial ages
initBlocks = ['I' + str(i) for i in range(0, assoc)]
seq = ' '.join(initBlocks)

View File

@@ -14,6 +14,7 @@ import plotly.graph_objects as go
from cacheLib import *
from cacheGraph import *
import cacheSim
import logging
log = logging.getLogger(__name__)
@@ -83,21 +84,27 @@ def main():
parser.add_argument("-noInit", help="Do not fill sets with associativity many elements first", action='store_true')
parser.add_argument("-maxAge", help="Maximum age", type=int)
parser.add_argument("-cBox", help="cBox (default: 1)", type=int, default=1)
parser.add_argument("-sim", help="Simulate the given policy instead of running the experiment on the hardware")
parser.add_argument("-simAssoc", help="Associativity of the simulated cache (default: 8)", type=int, default=8)
parser.add_argument("-logLevel", help="Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", default='WARNING')
parser.add_argument("-output", help="Output file name", default='permPolicy.html')
args = parser.parse_args()
logging.basicConfig(stream=sys.stdout, format='%(message)s', level=logging.getLevelName(args.logLevel))
title = cpuid.cpu_name(cpuid.CPUID()) + ', Level: ' + str(args.level)
if not args.sim:
title = cpuid.cpu_name(cpuid.CPUID()) + ', Level: ' + str(args.level)
html = ['<html>', '<head>', '<title>' + title + '</title>', '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>', '</head>', '<body>']
html += ['<h3>' + title + '</h3>']
getPermutations(args.level, html, cacheSets=args.sets, getInitialAges=(not args.noInit), maxAge=args.maxAge, cBox=args.cBox)
html += ['</body>', '</html>']
html = ['<html>', '<head>', '<title>' + title + '</title>', '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>', '</head>', '<body>']
html += ['<h3>' + title + '</h3>']
getPermutations(args.level, html, cacheSets=args.sets, getInitialAges=(not args.noInit), maxAge=args.maxAge, cBox=args.cBox)
html += ['</body>', '</html>']
with open(args.output ,'w') as f:
f.write('\n'.join(html))
with open(args.output ,'w') as f:
f.write('\n'.join(html))
else:
policyClass = cacheSim.AllPolicies[args.sim]
cacheSim.getPermutations(policyClass, args.simAssoc)
if __name__ == "__main__":

View File

@@ -64,6 +64,7 @@ def main():
parser.add_argument("-nMeasurements", help="Number of measurements", type=int, default=3)
parser.add_argument("-findCtrEx", help="Tries to find a small counterexample for each policy (only available for deterministic policies)", action='store_true')
parser.add_argument("-policies", help="Comma-separated list of policies to consider (Default: all deterministic policies)")
parser.add_argument("-best", help="Find the best matching policy (Default: abort if no policy agrees with all results)", action='store_true')
parser.add_argument("-randPolicies", help="Test randomized policies", action='store_true')
parser.add_argument("-allQLRUVariants", help="Test all QLRU variants", action='store_true')
parser.add_argument("-assoc", help="Override the associativity", type=int)
@@ -126,6 +127,7 @@ def main():
if sim != actual:
possiblePolicies.discard(p)
dists[p] += 1
color = 'red'
if args.findCtrEx and not p in counterExamples:
counterExamples[p] = findSmallCounterexample(p, ((args.initSeq + ' ') if args.initSeq else ''), args.level, args.sets, cBox, assoc, seq,
@@ -145,7 +147,7 @@ def main():
html += ['</tr>']
if not args.randPolicies:
if not args.randPolicies and not args.best:
print 'Possible policies: ' + ', '.join(possiblePolicies)
if not possiblePolicies: break
@@ -160,7 +162,7 @@ def main():
with open(args.output ,'w') as f:
f.write('\n'.join(html))
if not args.randPolicies:
if not args.randPolicies and not args.best:
print 'Possible policies: ' + ', '.join(possiblePolicies)
else:
for p, d in reversed(sorted(dists.items(), key=lambda d: d[1])):