mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-13 10:10:04 +01:00
improved counter configuration
This commit is contained in:
@@ -7,6 +7,8 @@ There are two variants of the tool: A user-space implementation and a kernel mod
|
||||
|
||||
*nanoBench* is used for running the microbenchmarks for obtaining the latency, throughput, and port usage data that is available on [uops.info](https://www.uops.info).
|
||||
|
||||
More information about *nanoBench* can be found in the paper [nanoBench: A Low-Overhead Tool for Running Microbenchmarks on x86 Systems](https://arxiv.org/abs/1911.03282).
|
||||
|
||||
## Installation
|
||||
|
||||
### User-space Version
|
||||
|
||||
@@ -329,36 +329,63 @@ void write_msr(unsigned int msr, uint64_t value) {
|
||||
#endif
|
||||
}
|
||||
|
||||
void configure_perf_ctrs_FF_Intel(bool usr, bool os) {
|
||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||
|
||||
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
|
||||
// disable fixed counters
|
||||
fixed_ctrl &= ~((1 << 12) - 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
// clear
|
||||
for (int i=0; i<3; i++) {
|
||||
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
||||
void clear_perf_counters() {
|
||||
if (is_Intel_CPU) {
|
||||
for (int i=0; i<3; i++) {
|
||||
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
||||
}
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
write_msr(MSR_IA32_PMC0+i, 0);
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
|
||||
}
|
||||
}
|
||||
//enable fixed counters
|
||||
}
|
||||
|
||||
void clear_perf_counter_configurations() {
|
||||
if (is_Intel_CPU) {
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, 0);
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
write_msr(MSR_IA32_PERFEVTSEL0+i, 0);
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
write_msr(CORE_X86_MSR_PERF_CTL + (2*i), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void clear_overflow_status_bits() {
|
||||
if (is_Intel_CPU) {
|
||||
write_msr(IA32_PERF_GLOBAL_STATUS_RESET, read_msr(IA32_PERF_GLOBAL_STATUS));
|
||||
}
|
||||
}
|
||||
|
||||
void enable_perf_ctrs_globally() {
|
||||
if (is_Intel_CPU) {
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, ((uint64_t)7 << 32) | ((1 << n_programmable_counters) - 1));
|
||||
}
|
||||
}
|
||||
|
||||
void disable_perf_ctrs_globally() {
|
||||
if (is_Intel_CPU) {
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void configure_perf_ctrs_FF_Intel(bool usr, bool os) {
|
||||
uint64_t fixed_ctrl = 0;
|
||||
fixed_ctrl |= (os << 8) | (os << 4) | os;
|
||||
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
}
|
||||
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]) {
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, bool usr, bool os, int n_counters, int avoid_counters, char* descriptions[]) {
|
||||
if (is_Intel_CPU) {
|
||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||
|
||||
bool evt_added = false;
|
||||
for (int i=0; i<n_counters; i++) {
|
||||
// clear
|
||||
write_msr(MSR_IA32_PMC0+i, 0);
|
||||
|
||||
if (next_pfc_config >= n_pfc_configs) {
|
||||
break;
|
||||
}
|
||||
@@ -368,6 +395,13 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters,
|
||||
break;
|
||||
}
|
||||
if ((config.ctr != -1) && (config.ctr != i)) {
|
||||
if (config.ctr >= n_counters) {
|
||||
print_error("Counter %u is not available", config.ctr);
|
||||
next_pfc_config++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
|
||||
continue;
|
||||
}
|
||||
next_pfc_config++;
|
||||
@@ -405,15 +439,23 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters,
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<n_counters; i++) {
|
||||
// clear
|
||||
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
|
||||
|
||||
if (next_pfc_config >= n_pfc_configs) {
|
||||
write_msr(CORE_X86_MSR_PERF_CTL + (2*i), 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
struct pfc_config config = pfc_configs[next_pfc_config];
|
||||
if ((config.ctr != -1) && (config.ctr != i)) {
|
||||
if (config.ctr >= n_counters) {
|
||||
print_error("Counter %u is not available", config.ctr);
|
||||
next_pfc_config++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
|
||||
print_error("avoiding %d", i);
|
||||
continue;
|
||||
}
|
||||
next_pfc_config++;
|
||||
|
||||
descriptions[i] = config.description;
|
||||
|
||||
@@ -41,35 +41,43 @@
|
||||
#define min(a,b) (((a) < (b)) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
#ifndef MSR_IA32_PMC0
|
||||
#define MSR_IA32_PMC0 0x0C1
|
||||
#endif
|
||||
#undef MSR_IA32_PMC0
|
||||
#define MSR_IA32_PMC0 0x0C1
|
||||
#ifndef MSR_IA32_PERFEVTSEL0
|
||||
#define MSR_IA32_PERFEVTSEL0 0x186
|
||||
#define MSR_IA32_PERFEVTSEL0 0x186
|
||||
#endif
|
||||
#ifndef MSR_OFFCORE_RSP0
|
||||
#define MSR_OFFCORE_RSP0 0x1A6
|
||||
#define MSR_OFFCORE_RSP0 0x1A6
|
||||
#endif
|
||||
#ifndef MSR_OFFCORE_RSP1
|
||||
#define MSR_OFFCORE_RSP1 0x1A7
|
||||
#define MSR_OFFCORE_RSP1 0x1A7
|
||||
#endif
|
||||
#ifndef MSR_IA32_DEBUGCTL
|
||||
#define MSR_IA32_DEBUGCTL 0x1D9
|
||||
#endif
|
||||
#ifndef MSR_IA32_FIXED_CTR0
|
||||
#define MSR_IA32_FIXED_CTR0 0x309
|
||||
#define MSR_IA32_FIXED_CTR0 0x309
|
||||
#endif
|
||||
#ifndef MSR_IA32_FIXED_CTR_CTRL
|
||||
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
|
||||
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
|
||||
#endif
|
||||
#ifndef IA32_PERF_GLOBAL_STATUS
|
||||
#define IA32_PERF_GLOBAL_STATUS 0x38E
|
||||
#endif
|
||||
#ifndef MSR_IA32_PERF_GLOBAL_CTRL
|
||||
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
|
||||
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
|
||||
#endif
|
||||
#ifndef IA32_PERF_GLOBAL_STATUS_RESET
|
||||
#define IA32_PERF_GLOBAL_STATUS_RESET 0x390
|
||||
#endif
|
||||
#ifndef MSR_PEBS_FRONTEND
|
||||
#define MSR_PEBS_FRONTEND 0x3F7
|
||||
#define MSR_PEBS_FRONTEND 0x3F7
|
||||
#endif
|
||||
#ifndef CORE_X86_MSR_PERF_CTL
|
||||
#define CORE_X86_MSR_PERF_CTL 0xC0010200
|
||||
#define CORE_X86_MSR_PERF_CTL 0xC0010200
|
||||
#endif
|
||||
#ifndef CORE_X86_MSR_PERF_CTR
|
||||
#define CORE_X86_MSR_PERF_CTR 0xC0010201
|
||||
#define CORE_X86_MSR_PERF_CTR 0xC0010201
|
||||
#endif
|
||||
|
||||
|
||||
@@ -224,13 +232,21 @@ uint64_t read_value_from_cmd(char* cmd);
|
||||
uint64_t read_msr(unsigned int msr);
|
||||
void write_msr(unsigned int msr, uint64_t value);
|
||||
|
||||
// Clears the fixed-function and programmable performance counters.
|
||||
void clear_perf_counters(void);
|
||||
void clear_perf_counter_configurations(void);
|
||||
void clear_overflow_status_bits(void);
|
||||
|
||||
void enable_perf_ctrs_globally(void);
|
||||
void disable_perf_ctrs_globally(void);
|
||||
|
||||
// Enables the fixed-function performance counters locally.
|
||||
void configure_perf_ctrs_FF_Intel(bool usr, bool os);
|
||||
|
||||
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
||||
// Writes the configurations of the programmable performance counters to the corresponding MSRs.
|
||||
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array;
|
||||
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]);
|
||||
// If the i-th bit in avoid_counters is set, then counter i is not used, except for events that can only be counted on counter i.
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, bool usr, bool os, int n_counters, int avoid_counters, char* descriptions[]);
|
||||
|
||||
void configure_MSRs(struct msr_config config);
|
||||
|
||||
|
||||
@@ -576,6 +576,11 @@ static int run_nanoBench(struct seq_file *m, void *v) {
|
||||
kernel_fpu_begin();
|
||||
disable_interrupts_preemption();
|
||||
|
||||
clear_perf_counter_configurations();
|
||||
clear_perf_counters();
|
||||
clear_overflow_status_bits();
|
||||
enable_perf_ctrs_globally();
|
||||
|
||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
||||
long base_loop_count = (basic_mode?0:loop_count);
|
||||
@@ -670,7 +675,7 @@ static int run_nanoBench(struct seq_file *m, void *v) {
|
||||
size_t next_pfc_config = 0;
|
||||
while (next_pfc_config < n_pfc_configs) {
|
||||
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, true, true, pfc_descriptions);
|
||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, 0, pfc_descriptions);
|
||||
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
|
||||
|
||||
@@ -72,7 +72,7 @@ iTCO_wdt_prev_loaded=$?
|
||||
iTCO_vendor_support_prev_loaded=$?
|
||||
|
||||
prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog)
|
||||
echo 0 > /proc/sys/kernel/nmi_watchdog
|
||||
[ $prev_nmi_watchdog != 0 ] && echo 0 > /proc/sys/kernel/nmi_watchdog
|
||||
|
||||
if [ "$debug" = true ]; then
|
||||
gdb -ex=run --args user/nanoBench $@
|
||||
@@ -84,7 +84,7 @@ fi
|
||||
|
||||
rm -f asm-*.bin
|
||||
|
||||
echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog
|
||||
[ $prev_nmi_watchdog != 0 ] && echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog
|
||||
|
||||
if [ -d "/sys/bus/event_source/devices/cpu" ]; then
|
||||
echo $prev_rdpmc > /sys/bus/event_source/devices/cpu/rdpmc
|
||||
|
||||
@@ -237,6 +237,11 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
clear_perf_counter_configurations();
|
||||
clear_perf_counters();
|
||||
clear_overflow_status_bits();
|
||||
enable_perf_ctrs_globally();
|
||||
|
||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
||||
long base_loop_count = (basic_mode?0:loop_count);
|
||||
@@ -329,7 +334,7 @@ int main(int argc, char **argv) {
|
||||
size_t next_pfc_config = 0;
|
||||
while (next_pfc_config < n_pfc_configs) {
|
||||
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, usr, os, pfc_descriptions);
|
||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, usr, os, n_used_counters, 0, pfc_descriptions);
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);
|
||||
|
||||
Reference in New Issue
Block a user