From 6e55106bf565ec973c9d81aff33d099b02c84c8a Mon Sep 17 00:00:00 2001
From: Andreas Abel <abel@cs.uni-saarland.de>
Date: Sat, 8 Jan 2022 00:53:37 +0100
Subject: [PATCH] support for Goldmont Plus

---
 configs/cfg_GoldmontPlus_all_core.txt    | 290 +++++++++++++++++++++++
 configs/cfg_GoldmontPlus_all_offcore.txt | 242 +++++++++++++++++++
 configs/cfg_GoldmontPlus_common.txt      |  15 ++
 kernelNanoBench.py                       |   2 -
 tools/cpuBench/cpuBench.py               |   8 +-
 5 files changed, 552 insertions(+), 5 deletions(-)
 create mode 100644 configs/cfg_GoldmontPlus_all_core.txt
 create mode 100644 configs/cfg_GoldmontPlus_all_offcore.txt
 create mode 100644 configs/cfg_GoldmontPlus_common.txt

diff --git a/configs/cfg_GoldmontPlus_all_core.txt b/configs/cfg_GoldmontPlus_all_core.txt
new file mode 100644
index 0000000..8fa6b8d
--- /dev/null
+++ b/configs/cfg_GoldmontPlus_all_core.txt
@@ -0,0 +1,290 @@
+# Based on https://download.01.org/perfmon/GLP/goldmontplus_core_v1.01.json
+# Applies to processors with family-model in {6-7A}
+
+# Loads blocked due to store data not ready (Precise event capable)
+03.01 LD_BLOCKS.DATA_UNKNOWN
+
+# Loads blocked due to store forward restriction (Precise event capable)
+03.02 LD_BLOCKS.STORE_FORWARD
+
+# Loads blocked because address has 4k partial address false dependence (Precise event capable)
+03.04 LD_BLOCKS.4K_ALIAS
+
+# Loads blocked because address is not in the UTLB (Precise event capable)
+03.08 LD_BLOCKS.UTLB_MISS
+
+# Loads blocked (Precise event capable)
+03.10 LD_BLOCKS.ALL_BLOCK
+
+# Page walk completed due to a demand load to a 4K page
+08.02 DTLB_LOAD_MISSES.WALK_COMPLETED_4K
+
+# Page walk completed due to a demand load to a 2M or 4M page
+08.04 DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walk completed due to a demand load to a 1GB page
+08.08 DTLB_LOAD_MISSES.WALK_COMPLETED_1GB
+
+# Page walks outstanding due to a demand load every cycle.
+08.10 DTLB_LOAD_MISSES.WALK_PENDING
+
+# Uops issued to the back end per cycle
+0E.00 UOPS_ISSUED.ANY
+
+# Load uops that split a page (Precise event capable)
+13.02 MISALIGN_MEM_REF.LOAD_PAGE_SPLIT
+
+# Store uops that split a page (Precise event capable)
+13.04 MISALIGN_MEM_REF.STORE_PAGE_SPLIT
+
+# L2 cache request misses
+2E.41 LONGEST_LAT_CACHE.MISS
+
+# L2 cache requests
+2E.4F LONGEST_LAT_CACHE.REFERENCE
+
+# Requests rejected by the XQ
+30.00 L2_REJECT_XQ.ALL
+
+# Requests rejected by the L2Q
+31.00 CORE_REJECT_L2Q.ALL
+
+# Core cycles when core is not halted
+3C.00 CPU_CLK_UNHALTED.CORE_P
+
+# Reference cycles when core is not halted
+3C.01 CPU_CLK_UNHALTED.REF
+
+# Page walk completed due to a demand data store to a 4K page
+49.02 DTLB_STORE_MISSES.WALK_COMPLETED_4K
+
+# Page walk completed due to a demand data store to a 2M or 4M page
+49.04 DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walk completed due to a demand data store to a 1GB page
+49.08 DTLB_STORE_MISSES.WALK_COMPLETED_1GB
+
+# Page walks outstanding due to a demand data store every cycle.
+49.10 DTLB_STORE_MISSES.WALK_PENDING
+
+# Page walks outstanding due to walking the EPT every cycle
+4F.10 EPT.WALK_PENDING
+
+# L1 Cache evictions for dirty data
+51.01 DL1.REPLACEMENT
+
+# References per ICache line that are available in the ICache (hit). This event counts differently than Intel processors based on Silvermont microarchitecture
+80.01 ICACHE.HIT
+
+# References per ICache line that are not available in the ICache (miss). This event counts differently than Intel processors based on Silvermont microarchitecture
+80.02 ICACHE.MISSES
+
+# References per ICache line. This event counts differently than Intel processors based on Silvermont microarchitecture
+80.03 ICACHE.ACCESSES
+
+# ITLB misses
+81.04 ITLB.MISS
+
+# Page walk completed due to an instruction fetch in a 4K page
+85.02 ITLB_MISSES.WALK_COMPLETED_4K
+
+# Page walk completed due to an instruction fetch in a 2M or 4M page
+85.04 ITLB_MISSES.WALK_COMPLETED_2M_4M
+
+# Page walk completed due to an instruction fetch in a 1GB page
+85.08 ITLB_MISSES.WALK_COMPLETED_1GB
+
+# Page walks outstanding due to an instruction fetch every cycle.
+85.10 ITLB_MISSES.WALK_PENDING
+
+# Cycles code-fetch stalled due to any reason.
+86.00 FETCH_STALL.ALL
+
+# Cycles the code-fetch stalls and an ITLB miss is outstanding.
+86.01 FETCH_STALL.ITLB_FILL_PENDING_CYCLES
+
+# Cycles code-fetch stalled due to an outstanding ICache miss.
+86.02 FETCH_STALL.ICACHE_FILL_PENDING_CYCLES
+
+# Uops requested but not-delivered to the back-end per cycle
+9C.00 UOPS_NOT_DELIVERED.ANY
+
+# STLB flushes
+BD.20 TLB_FLUSHES.STLB_ANY
+
+# Instructions retired (Precise event capable)
+C0.00 INST_RETIRED.ANY_P
+
+# Instructions retired - using Reduced Skid PEBS feature
+C0.00 INST_RETIRED.PREC_DIST
+
+# Uops retired (Precise event capable)
+C2.00 UOPS_RETIRED.ANY
+
+# MS uops retired (Precise event capable)
+C2.01 UOPS_RETIRED.MS
+
+# Floating point divide uops retired (Precise Event Capable)
+C2.08 UOPS_RETIRED.FPDIV
+
+# Integer divide uops retired (Precise Event Capable)
+C2.10 UOPS_RETIRED.IDIV
+
+# All machine clears
+C3.00 MACHINE_CLEARS.ALL
+
+# Self-Modifying Code detected
+C3.01 MACHINE_CLEARS.SMC
+
+# Machine clears due to memory ordering issue
+C3.02 MACHINE_CLEARS.MEMORY_ORDERING
+
+# Machine clears due to FP assists
+C3.04 MACHINE_CLEARS.FP_ASSIST
+
+# Machine clears due to memory disambiguation
+C3.08 MACHINE_CLEARS.DISAMBIGUATION
+
+# Machine clears due to a page fault
+C3.20 MACHINE_CLEARS.PAGE_FAULT
+
+# Retired branch instructions (Precise event capable)
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+
+# Retired conditional branch instructions (Precise event capable)
+C4.7E BR_INST_RETIRED.JCC
+
+# Retired taken branch instructions (Precise event capable)
+C4.80 BR_INST_RETIRED.ALL_TAKEN_BRANCHES
+
+# Retired far branch instructions (Precise event capable)
+C4.BF BR_INST_RETIRED.FAR_BRANCH
+
+# Retired instructions of near indirect Jmp or call (Precise event capable)
+C4.EB BR_INST_RETIRED.NON_RETURN_IND
+
+# Retired near return instructions (Precise event capable)
+C4.F7 BR_INST_RETIRED.RETURN
+
+# Retired near call instructions (Precise event capable)
+C4.F9 BR_INST_RETIRED.CALL
+
+# Retired near indirect call instructions (Precise event capable)
+C4.FB BR_INST_RETIRED.IND_CALL
+
+# Retired near relative call instructions (Precise event capable)
+C4.FD BR_INST_RETIRED.REL_CALL
+
+# Retired conditional branch instructions that were taken (Precise event capable)
+C4.FE BR_INST_RETIRED.TAKEN_JCC
+
+# Retired mispredicted branch instructions (Precise event capable)
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+
+# Retired mispredicted conditional branch instructions (Precise event capable)
+C5.7E BR_MISP_RETIRED.JCC
+
+# Retired mispredicted instructions of near indirect Jmp or near indirect call (Precise event capable)
+C5.EB BR_MISP_RETIRED.NON_RETURN_IND
+
+# Retired mispredicted near return instructions (Precise event capable)
+C5.F7 BR_MISP_RETIRED.RETURN
+
+# Retired mispredicted near indirect call instructions (Precise event capable)
+C5.FB BR_MISP_RETIRED.IND_CALL
+
+# Retired mispredicted conditional branch instructions that were taken (Precise event capable)
+C5.FE BR_MISP_RETIRED.TAKEN_JCC
+
+# Unfilled issue slots per cycle
+CA.00 ISSUE_SLOTS_NOT_CONSUMED.ANY
+
+# Unfilled issue slots per cycle because of a full resource in the backend
+CA.01 ISSUE_SLOTS_NOT_CONSUMED.RESOURCE_FULL
+
+# Unfilled issue slots per cycle to recover
+CA.02 ISSUE_SLOTS_NOT_CONSUMED.RECOVERY
+
+# Hardware interrupts received
+CB.01 HW_INTERRUPTS.RECEIVED
+
+# Cycles hardware interrupts are masked
+CB.02 HW_INTERRUPTS.MASKED
+
+# Cycles pending interrupts are masked
+CB.04 HW_INTERRUPTS.PENDING_AND_MASKED
+
+# Cycles a divider is busy
+CD.00 CYCLES_DIV_BUSY.ALL
+
+# Cycles the integer divide unit is busy
+CD.01 CYCLES_DIV_BUSY.IDIV
+
+# Cycles the FP divide unit is busy
+CD.02 CYCLES_DIV_BUSY.FPDIV
+
+# Load uops retired that missed the DTLB (Precise event capable)
+D0.11 MEM_UOPS_RETIRED.DTLB_MISS_LOADS
+
+# Store uops retired that missed the DTLB (Precise event capable)
+D0.12 MEM_UOPS_RETIRED.DTLB_MISS_STORES
+
+# Memory uops retired that missed the DTLB (Precise event capable)
+D0.13 MEM_UOPS_RETIRED.DTLB_MISS
+
+# Locked load uops retired (Precise event capable)
+D0.21 MEM_UOPS_RETIRED.LOCK_LOADS
+
+# Load uops retired that split a cache-line (Precise event capable)
+D0.41 MEM_UOPS_RETIRED.SPLIT_LOADS
+
+# Store uops retired that split a cache-line (Precise event capable)
+D0.42 MEM_UOPS_RETIRED.SPLIT_STORES
+
+# Memory uops retired that split a cache-line (Precise event capable)
+D0.43 MEM_UOPS_RETIRED.SPLIT
+
+# Load uops retired (Precise event capable)
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+
+# Store uops retired (Precise event capable)
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+
+# Memory uops retired (Precise event capable)
+D0.83 MEM_UOPS_RETIRED.ALL
+
+# Load uops retired that hit L1 data cache (Precise event capable)
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+
+# Load uops retired that hit L2 (Precise event capable)
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+
+# Load uops retired that missed L1 data cache (Precise event capable)
+D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
+
+# Load uops retired that missed L2 (Precise event capable)
+D1.10 MEM_LOAD_UOPS_RETIRED.L2_MISS
+
+# Memory uop retired where cross core or cross module HITM occurred (Precise event capable)
+D1.20 MEM_LOAD_UOPS_RETIRED.HITM
+
+# Loads retired that hit WCB (Precise event capable)
+D1.40 MEM_LOAD_UOPS_RETIRED.WCB_HIT
+
+# Loads retired that came from DRAM (Precise event capable)
+D1.80 MEM_LOAD_UOPS_RETIRED.DRAM_HIT
+
+# BACLEARs asserted for any branch type
+E6.01 BACLEARS.ALL
+
+# BACLEARs asserted for return branch
+E6.08 BACLEARS.RETURN
+
+# BACLEARs asserted for conditional branch
+E6.10 BACLEARS.COND
+
+# MS decode starts
+E7.01 MS_DECODED.MS_ENTRY
+
+# Decode restrictions due to predicting wrong instruction length
+E9.01 DECODE_RESTRICTION.PREDECODE_WRONG
diff --git a/configs/cfg_GoldmontPlus_all_offcore.txt b/configs/cfg_GoldmontPlus_all_offcore.txt
new file mode 100644
index 0000000..4690b67
--- /dev/null
+++ b/configs/cfg_GoldmontPlus_all_offcore.txt
@@ -0,0 +1,242 @@
+# Based on https://download.01.org/perfmon/GLP/goldmontplus_core_v1.01.json
+# Applies to processors with family-model in {6-7A}
+
+# Counts demand cacheable data reads of full cache lines have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010001.TakenAlone OFFCORE_RESPONSE.DEMAND_DATA_RD.ANY_RESPONSE
+
+# Counts demand reads for ownership (RFO) requests generated by a write to full data cache line have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010002.TakenAlone OFFCORE_RESPONSE.DEMAND_RFO.ANY_RESPONSE
+
+# Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010004.TakenAlone OFFCORE_RESPONSE.DEMAND_CODE_RD.ANY_RESPONSE
+
+# Counts the number of writeback transactions caused by L1 or L2 cache evictions have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010008.TakenAlone OFFCORE_RESPONSE.COREWB.ANY_RESPONSE
+
+# Counts data cacheline reads generated by hardware L2 cache prefetcher have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010010.TakenAlone OFFCORE_RESPONSE.PF_L2_DATA_RD.ANY_RESPONSE
+
+# Counts reads for ownership (RFO) requests generated by L2 prefetcher have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010020.TakenAlone OFFCORE_RESPONSE.PF_L2_RFO.ANY_RESPONSE
+
+# Counts reads for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010022.TakenAlone OFFCORE_RESPONSE.ANY_RFO.ANY_RESPONSE
+
+# Counts bus lock and split lock requests have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010400.TakenAlone OFFCORE_RESPONSE.BUS_LOCKS.ANY_RESPONSE
+
+# Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000010800.TakenAlone OFFCORE_RESPONSE.FULL_STREAMING_STORES.ANY_RESPONSE
+
+# Counts data cache lines requests by software prefetch instructions have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000011000.TakenAlone OFFCORE_RESPONSE.SW_PREFETCH.ANY_RESPONSE
+
+# Counts data cache line reads generated by hardware L1 data cache prefetcher have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000012000.TakenAlone OFFCORE_RESPONSE.PF_L1_DATA_RD.ANY_RESPONSE
+
+# Counts data reads generated by L1 or L2 prefetchers have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000013010.TakenAlone OFFCORE_RESPONSE.ANY_PF_DATA_RD.ANY_RESPONSE
+
+# Counts data reads (demand & prefetch) have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000013091.TakenAlone OFFCORE_RESPONSE.ANY_DATA_RD.ANY_RESPONSE
+
+# Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x00000132b7.TakenAlone OFFCORE_RESPONSE.ANY_READ.ANY_RESPONSE
+
+# Counts any data writes to uncacheable write combining (USWC) memory region have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000014800.TakenAlone OFFCORE_RESPONSE.STREAMING_STORES.ANY_RESPONSE
+
+# Counts requests to the uncore subsystem have any transaction responses from the uncore subsystem.
+B7.01.MSR_RSP0=0x0000018000.TakenAlone OFFCORE_RESPONSE.ANY_REQUEST.ANY_RESPONSE
+
+# Counts demand cacheable data reads of full cache lines hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040001.TakenAlone OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_HIT
+
+# Counts demand reads for ownership (RFO) requests generated by a write to full data cache line hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040002.TakenAlone OFFCORE_RESPONSE.DEMAND_RFO.L2_HIT
+
+# Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040004.TakenAlone OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_HIT
+
+# Counts the number of writeback transactions caused by L1 or L2 cache evictions hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040008.TakenAlone OFFCORE_RESPONSE.COREWB.L2_HIT
+
+# Counts data cacheline reads generated by hardware L2 cache prefetcher hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040010.TakenAlone OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_HIT
+
+# Counts reads for ownership (RFO) requests generated by L2 prefetcher hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040020.TakenAlone OFFCORE_RESPONSE.PF_L2_RFO.L2_HIT
+
+# Counts reads for ownership (RFO) requests (demand & prefetch) hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040022.TakenAlone OFFCORE_RESPONSE.ANY_RFO.L2_HIT
+
+# Counts bus lock and split lock requests hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040400.TakenAlone OFFCORE_RESPONSE.BUS_LOCKS.L2_HIT
+
+# Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes hit the L2 cache.
+B7.01.MSR_RSP0=0x0000040800.TakenAlone OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_HIT
+
+# Counts data cache lines requests by software prefetch instructions hit the L2 cache.
+B7.01.MSR_RSP0=0x0000041000.TakenAlone OFFCORE_RESPONSE.SW_PREFETCH.L2_HIT
+
+# Counts data cache line reads generated by hardware L1 data cache prefetcher hit the L2 cache.
+B7.01.MSR_RSP0=0x0000042000.TakenAlone OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_HIT
+
+# Counts data reads generated by L1 or L2 prefetchers hit the L2 cache.
+B7.01.MSR_RSP0=0x0000043010.TakenAlone OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_HIT
+
+# Counts data reads (demand & prefetch) hit the L2 cache.
+B7.01.MSR_RSP0=0x0000043091.TakenAlone OFFCORE_RESPONSE.ANY_DATA_RD.L2_HIT
+
+# Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) hit the L2 cache.
+B7.01.MSR_RSP0=0x00000432b7.TakenAlone OFFCORE_RESPONSE.ANY_READ.L2_HIT
+
+# Counts any data writes to uncacheable write combining (USWC) memory region hit the L2 cache.
+B7.01.MSR_RSP0=0x0000044800.TakenAlone OFFCORE_RESPONSE.STREAMING_STORES.L2_HIT
+
+# Counts requests to the uncore subsystem hit the L2 cache.
+B7.01.MSR_RSP0=0x0000048000.TakenAlone OFFCORE_RESPONSE.ANY_REQUEST.L2_HIT
+
+# Counts demand cacheable data reads of full cache lines true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000001.TakenAlone OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts demand reads for ownership (RFO) requests generated by a write to full data cache line true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000002.TakenAlone OFFCORE_RESPONSE.DEMAND_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000004.TakenAlone OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts the number of writeback transactions caused by L1 or L2 cache evictions true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000008.TakenAlone OFFCORE_RESPONSE.COREWB.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data cacheline reads generated by hardware L2 cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000010.TakenAlone OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts reads for ownership (RFO) requests generated by L2 prefetcher true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000020.TakenAlone OFFCORE_RESPONSE.PF_L2_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts reads for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000022.TakenAlone OFFCORE_RESPONSE.ANY_RFO.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts bus lock and split lock requests true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000400.TakenAlone OFFCORE_RESPONSE.BUS_LOCKS.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200000800.TakenAlone OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data cache lines requests by software prefetch instructions true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200001000.TakenAlone OFFCORE_RESPONSE.SW_PREFETCH.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data cache line reads generated by hardware L1 data cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200002000.TakenAlone OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data reads generated by L1 or L2 prefetchers true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200003010.TakenAlone OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data reads (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200003091.TakenAlone OFFCORE_RESPONSE.ANY_DATA_RD.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x02000032b7.TakenAlone OFFCORE_RESPONSE.ANY_READ.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts any data writes to uncacheable write combining (USWC) memory region true miss for the L2 cache with a snoop miss in the other processor module.
+B7.01.MSR_RSP0=0x0200004800.TakenAlone OFFCORE_RESPONSE.STREAMING_STORES.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts requests to the uncore subsystem true miss for the L2 cache with a snoop miss in the other processor module. 
+B7.01.MSR_RSP0=0x0200008000.TakenAlone OFFCORE_RESPONSE.ANY_REQUEST.L2_MISS.SNOOP_MISS_OR_NO_SNOOP_NEEDED
+
+# Counts demand cacheable data reads of full cache lines miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000001.TakenAlone OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts demand reads for ownership (RFO) requests generated by a write to full data cache line miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000002.TakenAlone OFFCORE_RESPONSE.DEMAND_RFO.L2_MISS.HITM_OTHER_CORE
+
+# Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000004.TakenAlone OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts the number of writeback transactions caused by L1 or L2 cache evictions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000008.TakenAlone OFFCORE_RESPONSE.COREWB.L2_MISS.HITM_OTHER_CORE
+
+# Counts data cacheline reads generated by hardware L2 cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000010.TakenAlone OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts reads for ownership (RFO) requests generated by L2 prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000020.TakenAlone OFFCORE_RESPONSE.PF_L2_RFO.L2_MISS.HITM_OTHER_CORE
+
+# Counts reads for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000022.TakenAlone OFFCORE_RESPONSE.ANY_RFO.L2_MISS.HITM_OTHER_CORE
+
+# Counts bus lock and split lock requests miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000400.TakenAlone OFFCORE_RESPONSE.BUS_LOCKS.L2_MISS.HITM_OTHER_CORE
+
+# Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000000800.TakenAlone OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_MISS.HITM_OTHER_CORE
+
+# Counts data cache lines requests by software prefetch instructions miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000001000.TakenAlone OFFCORE_RESPONSE.SW_PREFETCH.L2_MISS.HITM_OTHER_CORE
+
+# Counts data cache line reads generated by hardware L1 data cache prefetcher miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000002000.TakenAlone OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts data reads generated by L1 or L2 prefetchers miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000003010.TakenAlone OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts data reads (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000003091.TakenAlone OFFCORE_RESPONSE.ANY_DATA_RD.L2_MISS.HITM_OTHER_CORE
+
+# Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x10000032b7.TakenAlone OFFCORE_RESPONSE.ANY_READ.L2_MISS.HITM_OTHER_CORE
+
+# Counts any data writes to uncacheable write combining (USWC) memory region miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000004800.TakenAlone OFFCORE_RESPONSE.STREAMING_STORES.L2_MISS.HITM_OTHER_CORE
+
+# Counts requests to the uncore subsystem miss the L2 cache with a snoop hit in the other processor module, data forwarding is required.
+B7.01.MSR_RSP0=0x1000008000.TakenAlone OFFCORE_RESPONSE.ANY_REQUEST.L2_MISS.HITM_OTHER_CORE
+
+# Counts demand cacheable data reads of full cache lines outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000001.TakenAlone OFFCORE_RESPONSE.DEMAND_DATA_RD.OUTSTANDING
+
+# Counts demand reads for ownership (RFO) requests generated by a write to full data cache line outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000002.TakenAlone OFFCORE_RESPONSE.DEMAND_RFO.OUTSTANDING
+
+# Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000004.TakenAlone OFFCORE_RESPONSE.DEMAND_CODE_RD.OUTSTANDING
+
+# Counts the number of writeback transactions caused by L1 or L2 cache evictions outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000008.TakenAlone OFFCORE_RESPONSE.COREWB.OUTSTANDING
+
+# Counts data cacheline reads generated by hardware L2 cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000010.TakenAlone OFFCORE_RESPONSE.PF_L2_DATA_RD.OUTSTANDING
+
+# Counts reads for ownership (RFO) requests generated by L2 prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000020.TakenAlone OFFCORE_RESPONSE.PF_L2_RFO.OUTSTANDING
+
+# Counts reads for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000022.TakenAlone OFFCORE_RESPONSE.ANY_RFO.OUTSTANDING
+
+# Counts bus lock and split lock requests outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000400.TakenAlone OFFCORE_RESPONSE.BUS_LOCKS.OUTSTANDING
+
+# Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000000800.TakenAlone OFFCORE_RESPONSE.FULL_STREAMING_STORES.OUTSTANDING
+
+# Counts data cache lines requests by software prefetch instructions outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000001000.TakenAlone OFFCORE_RESPONSE.SW_PREFETCH.OUTSTANDING
+
+# Counts data cache line reads generated by hardware L1 data cache prefetcher outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000002000.TakenAlone OFFCORE_RESPONSE.PF_L1_DATA_RD.OUTSTANDING
+
+# Counts data reads generated by L1 or L2 prefetchers outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000003010.TakenAlone OFFCORE_RESPONSE.ANY_PF_DATA_RD.OUTSTANDING
+
+# Counts data reads (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000003091.TakenAlone OFFCORE_RESPONSE.ANY_DATA_RD.OUTSTANDING
+
+# Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x40000032b7.TakenAlone OFFCORE_RESPONSE.ANY_READ.OUTSTANDING
+
+# Counts any data writes to uncacheable write combining (USWC) memory region outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000004800.TakenAlone OFFCORE_RESPONSE.STREAMING_STORES.OUTSTANDING
+
+# Counts requests to the uncore subsystem outstanding, per cycle, from the time of the L2 miss to when any response is received.
+B7.01.MSR_RSP0=0x4000008000.TakenAlone OFFCORE_RESPONSE.ANY_REQUEST.OUTSTANDING
diff --git a/configs/cfg_GoldmontPlus_common.txt b/configs/cfg_GoldmontPlus_common.txt
new file mode 100644
index 0000000..95605c0
--- /dev/null
+++ b/configs/cfg_GoldmontPlus_common.txt
@@ -0,0 +1,15 @@
+# Based on https://download.01.org/perfmon/GLP/goldmontplus_core_v1.01.json
+# Applies to processors with family-model in {6-7A}
+
+3C.00 CORE_CYCLES
+C0.00 INST_RETIRED
+0E.00 UOPS_ISSUED.ANY
+C2.00 UOPS_RETIRED.ANY
+C2.01 UOPS_RETIRED.MS
+C4.00 BR_INST_RETIRED.ALL_BRANCHES
+C5.00 BR_MISP_RETIRED.ALL_BRANCHES
+D0.81 MEM_UOPS_RETIRED.ALL_LOADS
+D0.82 MEM_UOPS_RETIRED.ALL_STORES
+D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
+D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
+D1.80 MEM_LOAD_UOPS_RETIRED.DRAM_HIT
diff --git a/kernelNanoBench.py b/kernelNanoBench.py
index cc46cb7..7b36904 100755
--- a/kernelNanoBench.py
+++ b/kernelNanoBench.py
@@ -272,8 +272,6 @@ if readFile('/sys/devices/system/cpu/smt/active').startswith('1'):
    print('Note: Hyper-threading is enabled; it can be disabled with "sudo ./disable-HT.sh"', file=sys.stderr)
 
 prevNMIWatchdogState = readFile('/proc/sys/kernel/nmi_watchdog').strip()
-print(prevNMIWatchdogState)
-print(prevNMIWatchdogState != '0')
 if prevNMIWatchdogState != '0':
    writeFile('/proc/sys/kernel/nmi_watchdog', '0')
 
diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py
index be24d2a..1ecaa7a 100755
--- a/tools/cpuBench/cpuBench.py
+++ b/tools/cpuBench/cpuBench.py
@@ -208,7 +208,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
       if 'RDTSC' in evt: continue
       if evt == 'UOPS':
          if arch in ['CON', 'WOL']: evt = 'RS_UOPS_DISPATCHED'
-         elif arch in ['NHM', 'WSM']: evt = 'UOPS_RETIRED.ANY'
+         elif arch in ['NHM', 'WSM', 'GLP']: evt = 'UOPS_RETIRED.ANY'
          elif arch in ['SNB', 'ADL-E']: evt = 'UOPS_RETIRED.ALL'
          elif arch in ['HSW']: evt = 'UOPS_EXECUTED.CORE'
          elif arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: evt = 'UOPS_EXECUTED.THREAD'
@@ -275,7 +275,7 @@ def getEventConfig(event):
       if arch in ['CON', 'WOL']: return 'A0.00' # RS_UOPS_DISPATCHED
       if arch in ['NHM', 'WSM', 'SNB' ]: return 'C2.01' # UOPS_RETIRED.ANY
       if arch in ['SNB']: return 'C2.01' # UOPS_RETIRED.ALL
-      if arch in ['ADL-E']: return 'C2.00' # UOPS_RETIRED.ALL
+      if arch in ['GLP', 'ADL-E']: return 'C2.00' # UOPS_RETIRED.ALL
       if arch in ['HSW']: return 'B1.02' # UOPS_EXECUTED.CORE; note: may undercount due to erratum HSD30
       if arch in ['IVB', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL', 'ADL-P']: return 'B1.01' # UOPS_EXECUTED.THREAD
       if arch in ['ZEN+', 'ZEN2', 'ZEN3']: return '0C1.00'
@@ -289,7 +289,7 @@ def getEventConfig(event):
       if arch in ['NHM', 'WSM']: return 'D1.02'
       if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL', 'CLX', 'TGL', 'RKL']: return '79.30'
       if arch in ['ADL-P']: return '79.20'
-      if arch in ['ADL-E']: return 'C2.01'
+      if arch in ['GLP', 'ADL-E']: return 'C2.01'
    if event == 'UOPS_PORT_0':
       if arch in ['CON', 'WOL']: return 'A1.01.CTR=0'
       if arch in ['NHM', 'WSM']: return 'B1.01'
@@ -1197,6 +1197,8 @@ def getThroughputAndUops(instrNode, useDistinctRegs, useIndexedAddr, htmlReports
                         else:
                            # we test with a small loop body so that uops may be delivered from the loop stream detector (LSD)
                            # we also test with a larger loop body to minimize potential overhead from the loop itself
+                           if instrNode.attrib['iclass'] in ['RDRAND', 'RDSEED', 'WBINVD'] or instrNode.attrib['category'] in ['IO', 'IOSTRINGOP']:
+                              continue
                            unrollCount = max(1, int(round(10.0/ic)))
                            if repType == 'loopSmall':
                               loopCount = 1000