improved counter configuration

Andreas Abel
2022-01-12 17:11:46 +01:00
parent 73b5ac65e3
commit 3330672330
6 changed files with 113 additions and 43 deletions

View File

@@ -7,6 +7,8 @@ There are two variants of the tool: A user-space implementation and a kernel mod
*nanoBench* is used for running the microbenchmarks that obtain the latency, throughput, and port usage data available on [uops.info](http://www.uops.info).
More information about *nanoBench* can be found in the paper [nanoBench: A Low-Overhead Tool for Running Microbenchmarks on x86 Systems](https://arxiv.org/abs/1911.03282).
## Installation
### User-space Version

View File

@@ -329,36 +329,63 @@ void write_msr(unsigned int msr, uint64_t value) {
#endif
}
void configure_perf_ctrs_FF_Intel(bool usr, bool os) {
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
global_ctrl |= ((uint64_t)7 << 32) | 15;
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
// disable fixed counters
fixed_ctrl &= ~((1 << 12) - 1);
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
// clear
for (int i=0; i<3; i++) {
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
void clear_perf_counters() {
if (is_Intel_CPU) {
for (int i=0; i<3; i++) {
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
}
for (int i=0; i<n_programmable_counters; i++) {
write_msr(MSR_IA32_PMC0+i, 0);
}
} else {
for (int i=0; i<n_programmable_counters; i++) {
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
}
}
//enable fixed counters
}
void clear_perf_counter_configurations() {
if (is_Intel_CPU) {
write_msr(MSR_IA32_FIXED_CTR_CTRL, 0);
for (int i=0; i<n_programmable_counters; i++) {
write_msr(MSR_IA32_PERFEVTSEL0+i, 0);
}
} else {
for (int i=0; i<n_programmable_counters; i++) {
write_msr(CORE_X86_MSR_PERF_CTL + (2*i), 0);
}
}
}
void clear_overflow_status_bits() {
if (is_Intel_CPU) {
write_msr(IA32_PERF_GLOBAL_STATUS_RESET, read_msr(IA32_PERF_GLOBAL_STATUS));
}
}
void enable_perf_ctrs_globally() {
if (is_Intel_CPU) {
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, ((uint64_t)7 << 32) | ((1 << n_programmable_counters) - 1));
}
}
void disable_perf_ctrs_globally() {
if (is_Intel_CPU) {
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);
}
}
void configure_perf_ctrs_FF_Intel(bool usr, bool os) {
uint64_t fixed_ctrl = 0;
fixed_ctrl |= (os << 8) | (os << 4) | os;
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
}
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]) {
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, bool usr, bool os, int n_counters, int avoid_counters, char* descriptions[]) {
if (is_Intel_CPU) {
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
global_ctrl |= ((uint64_t)7 << 32) | 15;
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
bool evt_added = false;
for (int i=0; i<n_counters; i++) {
// clear
write_msr(MSR_IA32_PMC0+i, 0);
if (next_pfc_config >= n_pfc_configs) {
break;
}
@@ -368,6 +395,13 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters,
break;
}
if ((config.ctr != -1) && (config.ctr != i)) {
if (config.ctr >= n_counters) {
print_error("Counter %u is not available", config.ctr);
next_pfc_config++;
}
continue;
}
if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
continue;
}
next_pfc_config++;
@@ -405,15 +439,23 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters,
}
} else {
for (int i=0; i<n_counters; i++) {
// clear
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
if (next_pfc_config >= n_pfc_configs) {
write_msr(CORE_X86_MSR_PERF_CTL + (2*i), 0);
continue;
}
struct pfc_config config = pfc_configs[next_pfc_config];
if ((config.ctr != -1) && (config.ctr != i)) {
if (config.ctr >= n_counters) {
print_error("Counter %u is not available", config.ctr);
next_pfc_config++;
}
continue;
}
if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
print_error("avoiding %d", i);
continue;
}
next_pfc_config++;
descriptions[i] = config.description;
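
To make the new structure easier to follow: this commit splits the old monolithic configure_perf_ctrs_FF_Intel into small, single-purpose helpers. Below is a minimal sketch (not part of the commit; the wrapper name is made up for illustration) of how they are meant to compose, mirroring the call sequence that run_nanoBench() and main() use further down:

// Sketch only; assumes the nanoBench declarations above.
void setup_counters_sketch(bool usr, bool os) {
    clear_perf_counter_configurations(); // zero FIXED_CTR_CTRL, PERFEVTSEL*, PERF_CTL*
    clear_perf_counters();               // zero the counter registers themselves
    clear_overflow_status_bits();        // acknowledge pending overflows (Intel only)
    enable_perf_ctrs_globally();         // set the enable bits in IA32_PERF_GLOBAL_CTRL (Intel only)
    if (is_Intel_CPU) {
        configure_perf_ctrs_FF_Intel(usr, os); // enable the fixed-function counters
    }
}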

View File

@@ -41,35 +41,43 @@
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef MSR_IA32_PMC0
#define MSR_IA32_PMC0 0x0C1
#endif
#undef MSR_IA32_PMC0
#define MSR_IA32_PMC0 0x0C1
#ifndef MSR_IA32_PERFEVTSEL0
#define MSR_IA32_PERFEVTSEL0 0x186
#endif
#ifndef MSR_OFFCORE_RSP0
#define MSR_OFFCORE_RSP0 0x1A6
#endif
#ifndef MSR_OFFCORE_RSP1
#define MSR_OFFCORE_RSP1 0x1A7
#endif
#ifndef MSR_IA32_DEBUGCTL
#define MSR_IA32_DEBUGCTL 0x1D9
#endif
#ifndef MSR_IA32_FIXED_CTR0
#define MSR_IA32_FIXED_CTR0 0x309
#endif
#ifndef MSR_IA32_FIXED_CTR_CTRL
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
#endif
#ifndef IA32_PERF_GLOBAL_STATUS
#define IA32_PERF_GLOBAL_STATUS 0x38E
#endif
#ifndef MSR_IA32_PERF_GLOBAL_CTRL
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
#endif
#ifndef IA32_PERF_GLOBAL_STATUS_RESET
#define IA32_PERF_GLOBAL_STATUS_RESET 0x390
#endif
#ifndef MSR_PEBS_FRONTEND
#define MSR_PEBS_FRONTEND 0x3F7
#endif
#ifndef CORE_X86_MSR_PERF_CTL
#define CORE_X86_MSR_PERF_CTL 0xC0010200
#endif
#ifndef CORE_X86_MSR_PERF_CTR
#define CORE_X86_MSR_PERF_CTR 0xC0010201
#endif
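
One detail worth noting in the AMD constants: on AMD family 17h+ (Zen), the per-counter control and counter MSRs are interleaved, which is why the code in this commit addresses them with a stride of 2. A small illustration (helper names hypothetical):

// CTL0 = 0xC0010200, CTR0 = 0xC0010201, CTL1 = 0xC0010202, CTR1 = 0xC0010203, ...
static inline unsigned int amd_perf_ctl_msr(int i) { return CORE_X86_MSR_PERF_CTL + 2*i; }
static inline unsigned int amd_perf_ctr_msr(int i) { return CORE_X86_MSR_PERF_CTR + 2*i; }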
@@ -224,13 +232,21 @@ uint64_t read_value_from_cmd(char* cmd);
uint64_t read_msr(unsigned int msr);
void write_msr(unsigned int msr, uint64_t value);
// Enables and clears the fixed-function performance counters.
void clear_perf_counters(void);
void clear_perf_counter_configurations(void);
void clear_overflow_status_bits(void);
void enable_perf_ctrs_globally(void);
void disable_perf_ctrs_globally(void);
// Enables the fixed-function performance counters locally.
void configure_perf_ctrs_FF_Intel(bool usr, bool os);
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
// Writes the configurations of the programmable performance counters to the corresponding MSRs.
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters configurations from this array;
// it returns the index of the next unprocessed configuration and writes the descriptions of the applied configurations to the descriptions array.
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]);
// If the i-th bit in avoid_counters is set, then counter i is not used, except for events that can only be counted on counter i.
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, bool usr, bool os, int n_counters, int avoid_counters, char* descriptions[]);
void configure_MSRs(struct msr_config config);
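
The return-value convention of configure_perf_ctrs_programmable implies a simple driver loop: each call programs up to n_counters of the remaining configurations and returns the index of the first one it did not consume. A sketch of that loop, matching the pattern in run_nanoBench() and main() below:

size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) {
    char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
    next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, /*usr=*/true, /*os=*/true,
                                                       n_used_counters, /*avoid_counters=*/0,
                                                       pfc_descriptions);
    // ... run the experiment and read the counters for this batch of events ...
}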

View File

@@ -576,6 +576,11 @@ static int run_nanoBench(struct seq_file *m, void *v) {
kernel_fpu_begin();
disable_interrupts_preemption();
clear_perf_counter_configurations();
clear_perf_counters();
clear_overflow_status_bits();
enable_perf_ctrs_globally();
long base_unroll_count = (basic_mode?0:unroll_count);
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
long base_loop_count = (basic_mode?0:loop_count);
@@ -670,7 +675,7 @@ static int run_nanoBench(struct seq_file *m, void *v) {
size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) {
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, true, true, pfc_descriptions);
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, 0, pfc_descriptions);
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
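
For context on configure_perf_ctrs_FF_Intel and the Broadwell remark above: each fixed-function counter i owns a 4-bit field in IA32_FIXED_CTR_CTRL starting at bit 4*i, with bit 0 of the field enabling ring-0 (OS) counting and bit 1 enabling ring-3 (USR) counting. The shift expressions in the commit encode exactly this for the three fixed counters; a generalized sketch (function name hypothetical):

// Builds the IA32_FIXED_CTR_CTRL value the commit computes for three fixed
// counters: OS bits at positions 0, 4, 8 and USR bits at positions 1, 5, 9.
static uint64_t fixed_ctr_ctrl_value(bool usr, bool os, int n_fixed_ctrs) {
    uint64_t v = 0;
    for (int i = 0; i < n_fixed_ctrs; i++) {
        v |= ((uint64_t)os  << (4*i));     // enable counting in ring 0
        v |= ((uint64_t)usr << (4*i + 1)); // enable counting in ring 3
    }
    return v;
}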

View File

@@ -72,7 +72,7 @@ iTCO_wdt_prev_loaded=$?
iTCO_vendor_support_prev_loaded=$?
prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog)
echo 0 > /proc/sys/kernel/nmi_watchdog
[ $prev_nmi_watchdog != 0 ] && echo 0 > /proc/sys/kernel/nmi_watchdog
if [ "$debug" = true ]; then
gdb -ex=run --args user/nanoBench $@
@@ -84,7 +84,7 @@ fi
rm -f asm-*.bin
echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog
[ $prev_nmi_watchdog != 0 ] && echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog
if [ -d "/sys/bus/event_source/devices/cpu" ]; then
echo $prev_rdpmc > /sys/bus/event_source/devices/cpu/rdpmc

View File

@@ -237,6 +237,11 @@ int main(int argc, char **argv) {
}
}
clear_perf_counter_configurations();
clear_perf_counters();
clear_overflow_status_bits();
enable_perf_ctrs_globally();
long base_unroll_count = (basic_mode?0:unroll_count);
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
long base_loop_count = (basic_mode?0:loop_count);
@@ -329,7 +334,7 @@ int main(int argc, char **argv) {
size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) {
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, usr, os, pfc_descriptions);
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, usr, os, n_used_counters, 0, pfc_descriptions);
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);
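
A note on the base/main run pairs visible at both call sites: the base run executes base_unroll_count copies of the benchmark code, the main run twice as many (in the default, non-basic mode), so subtracting the two measurements isolates the cost of exactly unroll_count copies while canceling the shared measurement overhead. A simplified sketch of this differencing, assuming raw counter values; nanoBench's actual aggregation over multiple repetitions is more involved:

// (main_result - base_result) corresponds to unroll_count copies per loop iteration.
double per_instance_value(int64_t main_result, int64_t base_result,
                          long unroll_count, long loop_count) {
    long n_instances = unroll_count * (loop_count > 0 ? loop_count : 1);
    return (double)(main_result - base_result) / n_instances;
}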