This commit is contained in:
Andreas Abel
2021-11-02 17:25:33 +01:00
parent 1752910a15
commit a69f6e5596
4 changed files with 45 additions and 47 deletions

View File

@@ -47,8 +47,8 @@ size_t n_msr_configs = 0;
char* msr_config_file_content = NULL; char* msr_config_file_content = NULL;
unsigned long cur_rdmsr = 0; unsigned long cur_rdmsr = 0;
int is_Intel_CPU = 0; bool is_Intel_CPU = false;
int is_AMD_CPU = 0; bool is_AMD_CPU = false;
int n_programmable_counters; int n_programmable_counters;
@@ -74,7 +74,7 @@ void build_cpuid_string(char* buf, unsigned int r0, unsigned int r1, unsigned in
memcpy(buf+12, (char*)&r3, 4); memcpy(buf+12, (char*)&r3, 4);
} }
int check_cpuid() { bool check_cpuid() {
unsigned int eax, ebx, ecx, edx; unsigned int eax, ebx, ecx, edx;
__cpuid(0, eax, ebx, ecx, edx); __cpuid(0, eax, ebx, ecx, edx);
@@ -104,32 +104,32 @@ int check_cpuid() {
print_user_verbose("Stepping ID: %u\n", (eax & 0xF)); print_user_verbose("Stepping ID: %u\n", (eax & 0xF));
if (strcmp(proc_vendor_string, "GenuineIntel") == 0) { if (strcmp(proc_vendor_string, "GenuineIntel") == 0) {
is_Intel_CPU = 1; is_Intel_CPU = true;
__cpuid(0x0A, eax, ebx, ecx, edx); __cpuid(0x0A, eax, ebx, ecx, edx);
unsigned int perf_mon_ver = (eax & 0xFF); unsigned int perf_mon_ver = (eax & 0xFF);
print_user_verbose("Performance monitoring version: %u\n", perf_mon_ver); print_user_verbose("Performance monitoring version: %u\n", perf_mon_ver);
if (perf_mon_ver < 2) { if (perf_mon_ver < 2) {
print_error("Error: performance monitoring version >= 2 required\n"); print_error("Error: performance monitoring version >= 2 required\n");
return 1; return true;
} }
n_programmable_counters = ((eax >> 8) & 0xFF); n_programmable_counters = ((eax >> 8) & 0xFF);
print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters); print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters);
if (n_programmable_counters < 2) { if (n_programmable_counters < 2) {
print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_programmable_counters); print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_programmable_counters);
return 1; return true;
} }
print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF)); print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF));
} else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) { } else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) {
is_AMD_CPU = 1; is_AMD_CPU = true;
n_programmable_counters = 6; n_programmable_counters = 6;
} else { } else {
print_error("Error: unsupported CPU found\n"); print_error("Error: unsupported CPU found\n");
return 1; return true;
} }
return 0; return false;
} }
void parse_counter_configs() { void parse_counter_configs() {
@@ -176,11 +176,11 @@ void parse_counter_configs() {
char* ce; char* ce;
while ((ce = strsep(&tok, ".")) != NULL) { while ((ce = strsep(&tok, ".")) != NULL) {
if (!strcmp(ce, "AnyT")) { if (!strcmp(ce, "AnyT")) {
pfc_configs[n_pfc_configs].any = 1; pfc_configs[n_pfc_configs].any = true;
} else if (!strcmp(ce, "EDG")) { } else if (!strcmp(ce, "EDG")) {
pfc_configs[n_pfc_configs].edge = 1; pfc_configs[n_pfc_configs].edge = true;
} else if (!strcmp(ce, "INV")) { } else if (!strcmp(ce, "INV")) {
pfc_configs[n_pfc_configs].inv = 1; pfc_configs[n_pfc_configs].inv = true;
} else if (!strncmp(ce, "CTR=", 4)) { } else if (!strncmp(ce, "CTR=", 4)) {
unsigned long counter; unsigned long counter;
nb_strtoul(ce+4, 0, &counter); nb_strtoul(ce+4, 0, &counter);
@@ -294,7 +294,7 @@ void write_msr(unsigned int msr, uint64_t value) {
#endif #endif
} }
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os) { void configure_perf_ctrs_FF_Intel(bool usr, bool os) {
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL); uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
global_ctrl |= ((uint64_t)7 << 32) | 15; global_ctrl |= ((uint64_t)7 << 32) | 15;
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl); write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
@@ -313,7 +313,7 @@ void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os) {
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl); write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
} }
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]) { size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]) {
if (is_Intel_CPU) { if (is_Intel_CPU) {
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL); uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
global_ctrl |= ((uint64_t)7 << 32) | 15; global_ctrl |= ((uint64_t)7 << 32) | 15;
@@ -722,7 +722,7 @@ void print_all_measurement_results(int64_t* results[], int n_counters) {
print_verbose("\n"); print_verbose("\n");
} }
int starts_with_magic_bytes(char* c, int64_t magic_bytes) { bool starts_with_magic_bytes(char* c, int64_t magic_bytes) {
return (*((int64_t*)c) == magic_bytes); return (*((int64_t*)c) == magic_bytes);
} }

View File

@@ -24,6 +24,7 @@
#include <string.h> #include <string.h>
#endif #endif
#include <stdbool.h>
#include <cpuid.h> #include <cpuid.h>
#ifdef __KERNEL__ #ifdef __KERNEL__
@@ -97,38 +98,38 @@ extern size_t alignment_offset;
// If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions. // If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions.
extern int drain_frontend; extern int drain_frontend;
#define DRAIN_FRONTEND_DEFAULT 0; #define DRAIN_FRONTEND_DEFAULT false;
// If enabled, the temporary performance counter values are stored in registers instead of in memory; // If enabled, the temporary performance counter values are stored in registers instead of in memory;
// the code to be measured must then not use registers R8-R13 // the code to be measured must then not use registers R8-R13
extern int no_mem; extern int no_mem;
#define NO_MEM_DEFAULT 0; #define NO_MEM_DEFAULT false;
// If enabled, the measurement results are not divided by the number of repetitions. // If enabled, the measurement results are not divided by the number of repetitions.
extern int no_normalization; extern int no_normalization;
#define NO_NORMALIZATION_DEFAULT 0; #define NO_NORMALIZATION_DEFAULT false;
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two // If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
// measurements. // measurements.
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference // If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference
// between the two measurements. // between the two measurements.
extern int basic_mode; extern int basic_mode;
#define BASIC_MODE_DEFAULT 0; #define BASIC_MODE_DEFAULT false;
// If enabled, the result includes measurements using the fixed-function performance counters and the RDTSC instruction. // If enabled, the result includes measurements using the fixed-function performance counters and the RDTSC instruction.
extern int use_fixed_counters; extern int use_fixed_counters;
#define USE_FIXED_COUNTERS_DEFAULT 0; #define USE_FIXED_COUNTERS_DEFAULT false;
enum agg_enum {AVG_20_80, MIN, MAX, MED}; enum agg_enum {AVG_20_80, MIN, MAX, MED};
extern int aggregate_function; extern int aggregate_function;
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80; #define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;
extern int verbose; extern int verbose;
#define VERBOSE_DEFAULT 0; #define VERBOSE_DEFAULT false;
// Whether to generate a breakpoint trap after executing the code to be benchmarked. // Whether to generate a breakpoint trap after executing the code to be benchmarked.
extern int debug; extern int debug;
#define DEBUG_DEFAULT 0; #define DEBUG_DEFAULT false;
extern char* code; extern char* code;
extern size_t code_length; extern size_t code_length;
@@ -146,9 +147,9 @@ struct pfc_config {
unsigned long evt_num; unsigned long evt_num;
unsigned long umask; unsigned long umask;
unsigned long cmask; unsigned long cmask;
unsigned int any; bool any;
unsigned int edge; bool edge;
unsigned int inv; bool inv;
unsigned long msr_3f6h; unsigned long msr_3f6h;
unsigned long msr_pf; unsigned long msr_pf;
unsigned long msr_rsp0; unsigned long msr_rsp0;
@@ -171,8 +172,8 @@ extern struct msr_config msr_configs[];
extern size_t n_msr_configs; extern size_t n_msr_configs;
extern char* msr_config_file_content; extern char* msr_config_file_content;
extern int is_Intel_CPU; extern bool is_Intel_CPU;
extern int is_AMD_CPU; extern bool is_AMD_CPU;
#define MAX_PROGRAMMABLE_COUNTERS 8 #define MAX_PROGRAMMABLE_COUNTERS 8
extern int n_programmable_counters; extern int n_programmable_counters;
@@ -205,7 +206,7 @@ extern int cpu;
// Checks whether we have an Intel or AMD CPU and determines the number of programmable counters. // Checks whether we have an Intel or AMD CPU and determines the number of programmable counters.
// Returns 0 if successful, 1 otherwise. // Returns 0 if successful, 1 otherwise.
int check_cpuid(void); bool check_cpuid(void);
void parse_counter_configs(void); void parse_counter_configs(void);
void parse_msr_configs(void); void parse_msr_configs(void);
@@ -216,12 +217,12 @@ uint64_t read_msr(unsigned int msr);
void write_msr(unsigned int msr, uint64_t value); void write_msr(unsigned int msr, uint64_t value);
// Enables and clears the fixed-function performance counters. // Enables and clears the fixed-function performance counters.
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os); void configure_perf_ctrs_FF_Intel(bool usr, bool os);
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs. // Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array; // next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array;
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array. // it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]); size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, bool usr, bool os, char* descriptions[]);
void configure_MSRs(struct msr_config config); void configure_MSRs(struct msr_config config);
@@ -257,11 +258,10 @@ void print_all_measurement_results(int64_t* results[], int n_counters);
#define MAGIC_BYTES_CODE_PFC_START 0xE0B513B1C2813F04 #define MAGIC_BYTES_CODE_PFC_START 0xE0B513B1C2813F04
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0B513B1C2813F04 #define MAGIC_BYTES_CODE_PFC_STOP 0xF0B513B1C2813F04
#define STRINGIFY2(X) #X #define STRINGIFY2(X) #X
#define STRINGIFY(X) STRINGIFY2(X) #define STRINGIFY(X) STRINGIFY2(X)
int starts_with_magic_bytes(char* c, int64_t magic_bytes); bool starts_with_magic_bytes(char* c, int64_t magic_bytes);
// The following functions must not use global variables (or anything that uses RIP-relative addressing) // The following functions must not use global variables (or anything that uses RIP-relative addressing)
void measurement_template_Intel_2(void); void measurement_template_Intel_2(void);
@@ -283,8 +283,7 @@ void initial_warm_up_template(void);
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions) // RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
#define SAVE_REGS_FLAGS() \ #define SAVE_REGS_FLAGS() \
asm volatile( \ asm(".intel_syntax noprefix\n" \
".intel_syntax noprefix\n" \
"push rbx\n" \ "push rbx\n" \
"push rbp\n" \ "push rbp\n" \
"push r12\n" \ "push r12\n" \
@@ -313,8 +312,7 @@ void initial_warm_up_template(void);
".att_syntax noprefix"); ".att_syntax noprefix");
#define RESTORE_REGS_FLAGS() \ #define RESTORE_REGS_FLAGS() \
asm volatile( \ asm(".intel_syntax noprefix\n" \
".intel_syntax noprefix\n" \
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \ "mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
"mov rsp, [r15]\n" \ "mov rsp, [r15]\n" \
"popfq\n" \ "popfq\n" \

View File

@@ -546,7 +546,7 @@ static int show(struct seq_file *m, void *v) {
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1)); seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2)); seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
} else { } else {
configure_perf_ctrs_FF_Intel(0, 1); configure_perf_ctrs_FF_Intel(false, true);
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count); run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count); run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
@@ -596,7 +596,7 @@ static int show(struct seq_file *m, void *v) {
size_t next_pfc_config = 0; size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) { while (next_pfc_config < n_pfc_configs) {
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0}; char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, 1, 1, pfc_descriptions); next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, true, true, pfc_descriptions);
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set // on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count); run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);

View File

@@ -67,8 +67,8 @@ int main(int argc, char **argv) {
* Parse command-line options * Parse command-line options
************************************/ ************************************/
char* config_file_name = NULL; char* config_file_name = NULL;
int usr = 1; bool usr = 1;
int os = 0; bool os = 0;
struct option long_opts[] = { struct option long_opts[] = {
{"code", required_argument, 0, 'c'}, {"code", required_argument, 0, 'c'},
@@ -76,26 +76,26 @@ int main(int argc, char **argv) {
{"code_late_init", required_argument, 0, 't'}, {"code_late_init", required_argument, 0, 't'},
{"code_one_time_init", required_argument, 0, 'o'}, {"code_one_time_init", required_argument, 0, 'o'},
{"config", required_argument, 0, 'f'}, {"config", required_argument, 0, 'f'},
{"fixed_counters", no_argument, &use_fixed_counters, 1}, {"fixed_counters", no_argument, &use_fixed_counters, true},
{"n_measurements", required_argument, 0, 'n'}, {"n_measurements", required_argument, 0, 'n'},
{"unroll_count", required_argument, 0, 'u'}, {"unroll_count", required_argument, 0, 'u'},
{"loop_count", required_argument, 0, 'l'}, {"loop_count", required_argument, 0, 'l'},
{"warm_up_count", required_argument, 0, 'w'}, {"warm_up_count", required_argument, 0, 'w'},
{"initial_warm_up_count", required_argument, 0, 'a'}, {"initial_warm_up_count", required_argument, 0, 'a'},
{"alignment_offset", required_argument, 0, 'm'}, {"alignment_offset", required_argument, 0, 'm'},
{"df", no_argument, &drain_frontend, 1}, {"df", no_argument, &drain_frontend, true},
{"avg", no_argument, &aggregate_function, AVG_20_80}, {"avg", no_argument, &aggregate_function, AVG_20_80},
{"median", no_argument, &aggregate_function, MED}, {"median", no_argument, &aggregate_function, MED},
{"min", no_argument, &aggregate_function, MIN}, {"min", no_argument, &aggregate_function, MIN},
{"max", no_argument, &aggregate_function, MAX}, {"max", no_argument, &aggregate_function, MAX},
{"basic_mode", no_argument, &basic_mode, 1}, {"basic_mode", no_argument, &basic_mode, true},
{"no_mem", no_argument, &no_mem, 1}, {"no_mem", no_argument, &no_mem, true},
{"no_normalization", no_argument, &no_normalization, 1}, {"no_normalization", no_argument, &no_normalization, true},
{"verbose", no_argument, &verbose, 1}, {"verbose", no_argument, &verbose, true},
{"cpu", required_argument, 0, 'p'}, {"cpu", required_argument, 0, 'p'},
{"usr", required_argument, 0, 'r'}, {"usr", required_argument, 0, 'r'},
{"os", required_argument, 0, 's'}, {"os", required_argument, 0, 's'},
{"debug", no_argument, &debug, 1}, {"debug", no_argument, &debug, true},
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{0, 0, 0, 0} {0, 0, 0, 0}
}; };