mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-16 11:30:07 +01:00
make fixed counters optional
This commit is contained in:
@@ -21,6 +21,7 @@ int drain_frontend = DRAIN_FRONTEND_DEFAULT;
|
||||
int no_mem = NO_MEM_DEFAULT;
|
||||
int no_normalization = NO_NORMALIZATION_DEFAULT;
|
||||
int basic_mode = BASIC_MODE_DEFAULT;
|
||||
int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
|
||||
int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
||||
int verbose = VERBOSE_DEFAULT;
|
||||
int debug = DEBUG_DEFAULT;
|
||||
@@ -113,19 +114,13 @@ int check_cpuid() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned int n_available_counters = ((eax >> 8) & 0xFF);
|
||||
print_user_verbose("Number of general-purpose performance counters: %u\n", n_available_counters);
|
||||
if (n_available_counters >= 4) {
|
||||
n_programmable_counters = 4;
|
||||
} else if (n_available_counters >= 2) {
|
||||
n_programmable_counters = 2;
|
||||
} else {
|
||||
print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_available_counters);
|
||||
n_programmable_counters = ((eax >> 8) & 0xFF);
|
||||
print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters);
|
||||
if (n_programmable_counters < 2) {
|
||||
print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_programmable_counters);
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF));
|
||||
|
||||
} else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) {
|
||||
is_AMD_CPU = 1;
|
||||
n_programmable_counters = 6;
|
||||
@@ -299,34 +294,32 @@ void write_msr(unsigned int msr, uint64_t value) {
|
||||
#endif
|
||||
}
|
||||
|
||||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os) {
|
||||
if (is_Intel_CPU) {
|
||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os) {
|
||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||
|
||||
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
|
||||
// disable fixed counters
|
||||
fixed_ctrl &= ~((1 << 12) - 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
// clear
|
||||
for (int i=0; i<3; i++) {
|
||||
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
||||
}
|
||||
//enable fixed counters
|
||||
fixed_ctrl |= (os << 8) | (os << 4) | os;
|
||||
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
|
||||
// disable fixed counters
|
||||
fixed_ctrl &= ~((1 << 12) - 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
// clear
|
||||
for (int i=0; i<3; i++) {
|
||||
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
||||
}
|
||||
//enable fixed counters
|
||||
fixed_ctrl |= (os << 8) | (os << 4) | os;
|
||||
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
|
||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||
}
|
||||
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr, unsigned int os, char* descriptions[]) {
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]) {
|
||||
if (is_Intel_CPU) {
|
||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
for (int i=0; i<n_counters; i++) {
|
||||
// clear
|
||||
write_msr(MSR_IA32_PMC0+i, 0);
|
||||
|
||||
@@ -367,7 +360,7 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<n_programmable_counters; i++) {
|
||||
for (int i=0; i<n_counters; i++) {
|
||||
// clear
|
||||
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
|
||||
|
||||
@@ -610,10 +603,10 @@ void create_and_run_one_time_init_code() {
|
||||
((void(*)(void))runtime_one_time_init_code)();
|
||||
}
|
||||
|
||||
void run_warmup_experiment(char* measurement_template) {
|
||||
void run_initial_warmup_experiment() {
|
||||
if (!initial_warm_up_count) return;
|
||||
|
||||
create_runtime_code(measurement_template, unroll_count, loop_count);
|
||||
create_runtime_code((char*)&initial_warm_up_template, unroll_count, loop_count);
|
||||
|
||||
for (int i=0; i<initial_warm_up_count; i++) {
|
||||
((void(*)(void))runtime_code)();
|
||||
@@ -735,8 +728,7 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) {
|
||||
|
||||
void measurement_template_Intel_2() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -784,8 +776,7 @@ void measurement_template_Intel_2() {
|
||||
|
||||
void measurement_template_Intel_4() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -851,8 +842,7 @@ void measurement_template_Intel_4() {
|
||||
|
||||
void measurement_template_Intel_noMem_2() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov r9, 0 \n"
|
||||
@@ -887,8 +877,7 @@ void measurement_template_Intel_noMem_2() {
|
||||
|
||||
void measurement_template_Intel_noMem_4() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov r9, 0 \n"
|
||||
@@ -943,8 +932,7 @@ void measurement_template_Intel_noMem_4() {
|
||||
|
||||
void measurement_template_AMD() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -1028,8 +1016,7 @@ void measurement_template_AMD() {
|
||||
|
||||
void measurement_template_AMD_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov r9, 0 \n"
|
||||
@@ -1104,8 +1091,7 @@ void measurement_template_AMD_noMem() {
|
||||
|
||||
void measurement_FF_template_Intel() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -1168,8 +1154,7 @@ void measurement_FF_template_Intel() {
|
||||
|
||||
void measurement_FF_template_Intel_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov r9, 0 \n"
|
||||
@@ -1222,8 +1207,7 @@ void measurement_FF_template_Intel_noMem() {
|
||||
|
||||
void measurement_FF_template_AMD() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -1278,8 +1262,7 @@ void measurement_FF_template_AMD() {
|
||||
|
||||
void measurement_FF_template_AMD_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov r9, 0 \n"
|
||||
@@ -1321,8 +1304,7 @@ void measurement_FF_template_AMD_noMem() {
|
||||
|
||||
void measurement_RDTSC_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -1356,8 +1338,7 @@ void measurement_RDTSC_template() {
|
||||
|
||||
void measurement_RDTSC_template_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||
@@ -1380,8 +1361,7 @@ void measurement_RDTSC_template_noMem() {
|
||||
|
||||
void measurement_RDMSR_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"push rax \n"
|
||||
"lahf \n"
|
||||
@@ -1419,8 +1399,7 @@ void measurement_RDMSR_template() {
|
||||
|
||||
void measurement_RDMSR_template_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
asm(".intel_syntax noprefix \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
"mov r8, 0 \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||
@@ -1448,4 +1427,14 @@ void one_time_init_template() {
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
RESTORE_REGS_FLAGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
}
|
||||
|
||||
// Template for the initial warm-up runs; run_initial_warmup_experiment() passes its address to create_runtime_code().
// Between SAVE_REGS_FLAGS() and RESTORE_REGS_FLAGS() it emits only the marker quads INIT, PFC_START, CODE and PFC_END
// (no counter-reading instructions of its own), followed by the TEMPLATE_END marker.
// NOTE(review): presumably create_runtime_code() locates these magic-byte markers and substitutes the actual code - confirm.
void initial_warm_up_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_CODE)" \n"
|
||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_END)" \n");
|
||||
RESTORE_REGS_FLAGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
|
||||
@@ -115,6 +115,10 @@ extern int no_normalization;
|
||||
extern int basic_mode;
|
||||
#define BASIC_MODE_DEFAULT 0 // default for basic_mode; no trailing ';' so the macro also works inside expressions
|
||||
|
||||
// If enabled, the result includes measurements using the fixed-function performance counters and the RDTSC instruction.
|
||||
extern int use_fixed_counters;
|
||||
#define USE_FIXED_COUNTERS_DEFAULT 0 // default for use_fixed_counters; no trailing ';' so the macro also works inside expressions
|
||||
|
||||
enum agg_enum {AVG_20_80, MIN, MAX, MED};
|
||||
extern int aggregate_function;
|
||||
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80 // default for aggregate_function; no trailing ';' so the macro also works inside expressions
|
||||
@@ -170,7 +174,7 @@ extern char* msr_config_file_content;
|
||||
extern int is_Intel_CPU;
|
||||
extern int is_AMD_CPU;
|
||||
|
||||
#define MAX_PROGRAMMABLE_COUNTERS 6
|
||||
#define MAX_PROGRAMMABLE_COUNTERS 8
|
||||
extern int n_programmable_counters;
|
||||
|
||||
// Pointers to memory regions that are writable and executable.
|
||||
@@ -212,19 +216,19 @@ uint64_t read_msr(unsigned int msr);
|
||||
void write_msr(unsigned int msr, uint64_t value);
|
||||
|
||||
// Enables and clears the fixed-function performance counters.
|
||||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
|
||||
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os);
|
||||
|
||||
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
||||
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_programmable_counters many configurations from this array;
|
||||
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array;
|
||||
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr, unsigned int os, char* descriptions[]);
|
||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]);
|
||||
|
||||
void configure_MSRs(struct msr_config config);
|
||||
|
||||
size_t get_required_runtime_code_length(void);
|
||||
|
||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
||||
void run_warmup_experiment(char* measurement_template);
|
||||
void run_initial_warmup_experiment(void);
|
||||
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
||||
void create_and_run_one_time_init_code(void);
|
||||
|
||||
@@ -236,22 +240,22 @@ long long ll_abs(long long val);
|
||||
void print_all_measurement_results(int64_t* results[], int n_counters);
|
||||
|
||||
|
||||
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
|
||||
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
|
||||
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_R14 0x40b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RBP 0x50b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RDI 0x60b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
|
||||
#define MAGIC_BYTES_MSR 0xA0b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0xB0b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC_START 0xC0b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC_END 0xD0b513b1C2813F04
|
||||
#define MAGIC_BYTES_INIT 0x10B513B1C2813F04
|
||||
#define MAGIC_BYTES_CODE 0x20B513B1C2813F04
|
||||
#define MAGIC_BYTES_RSP_ADDRESS 0x30B513B1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_R14 0x40B513B1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RBP 0x50B513B1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RDI 0x60B513B1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSI 0x70B513B1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSP 0x80B513B1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x90B513B1C2813F04
|
||||
#define MAGIC_BYTES_MSR 0xA0B513B1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0xB0B513B1C2813F04
|
||||
#define MAGIC_BYTES_PFC_START 0xC0B513B1C2813F04
|
||||
#define MAGIC_BYTES_PFC_END 0xD0B513B1C2813F04
|
||||
|
||||
#define MAGIC_BYTES_CODE_PFC_START 0xE0b513b1C2813F04
|
||||
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0b513b1C2813F04
|
||||
#define MAGIC_BYTES_CODE_PFC_START 0xE0B513B1C2813F04
|
||||
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0B513B1C2813F04
|
||||
|
||||
|
||||
#define STRINGIFY2(X) #X
|
||||
@@ -275,6 +279,7 @@ void measurement_RDTSC_template_noMem(void);
|
||||
void measurement_RDMSR_template(void);
|
||||
void measurement_RDMSR_template_noMem(void);
|
||||
void one_time_init_template(void);
|
||||
void initial_warm_up_template(void);
|
||||
|
||||
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
||||
#define SAVE_REGS_FLAGS() \
|
||||
|
||||
Reference in New Issue
Block a user