mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-16 11:30:07 +01:00
make fixed counters optional
This commit is contained in:
26
README.md
26
README.md
@@ -42,12 +42,11 @@ The following command will benchmark the assembler code sequence "ADD RAX, RBX;
|
|||||||
sudo ./nanoBench.sh -asm "ADD RAX, RBX; add RBX, RAX" -config configs/cfg_Skylake_common.txt
|
sudo ./nanoBench.sh -asm "ADD RAX, RBX; add RBX, RAX" -config configs/cfg_Skylake_common.txt
|
||||||
|
|
||||||
It will produce an output similar to the following.
|
It will produce an output similar to the following.
|
||||||
|
|
||||||
Instructions retired: 2.00
|
CORE_CYCLES: 2.00
|
||||||
Core cycles: 2.00
|
INST_RETIRED: 2.00
|
||||||
Reference cycles: 1.85
|
UOPS_ISSUED: 2.00
|
||||||
UOPS_ISSUED.ANY: 2.00
|
UOPS_EXECUTED: 2.00
|
||||||
UOPS_EXECUTED.THREAD: 2.00
|
|
||||||
UOPS_DISPATCHED_PORT.PORT_0: 0.49
|
UOPS_DISPATCHED_PORT.PORT_0: 0.49
|
||||||
UOPS_DISPATCHED_PORT.PORT_1: 0.50
|
UOPS_DISPATCHED_PORT.PORT_1: 0.50
|
||||||
UOPS_DISPATCHED_PORT.PORT_2: 0.00
|
UOPS_DISPATCHED_PORT.PORT_2: 0.00
|
||||||
@@ -96,7 +95,7 @@ We will now take a look behind the scenes at the code that *nanoBench* generates
|
|||||||
|
|
||||||
int run(code, code_init, local_unroll_count):
|
int run(code, code_init, local_unroll_count):
|
||||||
int measurements[n_measurements]
|
int measurements[n_measurements]
|
||||||
|
|
||||||
for i=-warm_up_count to n_measurements
|
for i=-warm_up_count to n_measurements
|
||||||
save_regs
|
save_regs
|
||||||
code_init
|
code_init
|
||||||
@@ -111,17 +110,17 @@ We will now take a look behind the scenes at the code that *nanoBench* generates
|
|||||||
restore_regs
|
restore_regs
|
||||||
if i >= 0: // ignore warm-up runs
|
if i >= 0: // ignore warm-up runs
|
||||||
measurements[i] = m2 - m1
|
measurements[i] = m2 - m1
|
||||||
|
|
||||||
return agg(measurements) // apply selected aggregate function
|
return agg(measurements) // apply selected aggregate function
|
||||||
|
|
||||||
`run(...)` is executed twice: The first time with `local_unroll_count = unroll_count`, and the second time with `local_unroll_count = 2 * unroll_count`. If the `-basic_mode` option is used, the first execution is with no instructions between `m1 = read_perf_ctrs` and `m2 = read_perf_ctrs`, and the second with `local_unroll_count = unroll_count`.
|
`run(...)` is executed twice: The first time with `local_unroll_count = unroll_count`, and the second time with `local_unroll_count = 2 * unroll_count`. If the `-basic_mode` option is used, the first execution is with no instructions between `m1 = read_perf_ctrs` and `m2 = read_perf_ctrs`, and the second with `local_unroll_count = unroll_count`.
|
||||||
|
|
||||||
|
|
||||||
The result that is finally reported by *nanoBench* is the difference between these two executions divided by `max(loop_count * unroll_count, unroll_count)`.
|
The result that is finally reported by *nanoBench* is the difference between these two executions divided by `max(loop_count * unroll_count, unroll_count)`.
|
||||||
|
|
||||||
Before the first execution of `run(...)`, the performance counters are configured according to the event specifications in the `-config` file. If this file contains more events than there are programmable performance counters available, `run(...)` is executed multiple times with different performance counter configurations.
|
Before the first execution of `run(...)`, the performance counters are configured according to the event specifications in the `-config` file. If this file contains more events than there are programmable performance counters available, `run(...)` is executed multiple times with different performance counter configurations.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Command-line Options
|
## Command-line Options
|
||||||
|
|
||||||
@@ -137,7 +136,8 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
|
|||||||
| `-code_init <filename>` | A binary file containing code to be executed once in the beginning of every benchmark run. *This option cannot be used together with `-asm_init`.* |
|
| `-code_init <filename>` | A binary file containing code to be executed once in the beginning of every benchmark run. *This option cannot be used together with `-asm_init`.* |
|
||||||
| `-code_late_init <filename>` | A binary file containing code to be executed once immediately before the code to be benchmarked. *This option cannot be used together with `-asm_late_init`.* |
|
| `-code_late_init <filename>` | A binary file containing code to be executed once immediately before the code to be benchmarked. *This option cannot be used together with `-asm_late_init`.* |
|
||||||
| `-code_one_time_init <filename>` | A binary file containing code to be executed once before the first benchmark run. *This option cannot be used together with `-asm_one_time_init`.* |
|
| `-code_one_time_init <filename>` | A binary file containing code to be executed once before the first benchmark run. *This option cannot be used together with `-asm_one_time_init`.* |
|
||||||
| `-config <file>` | File with performance counter event specifications. Details are described [below](#performance-counter-config-files). |
|
| `-config <file>` | File with performance counter event specifications. Details are described [below](#performance-counter-config-files). |
|
||||||
|
| `-fixed_counters` | Reads the fixed-function performance counters. |
|
||||||
| `-n_measurements <n>` | Number of times the measurements are repeated. `[Default: n=10]` |
|
| `-n_measurements <n>` | Number of times the measurements are repeated. `[Default: n=10]` |
|
||||||
| `-unroll_count <n>` | Number of copies of the benchmark code inside the inner loop. `[Default: n=1000]` |
|
| `-unroll_count <n>` | Number of copies of the benchmark code inside the inner loop. `[Default: n=1000]` |
|
||||||
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
|
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
|
||||||
@@ -196,7 +196,7 @@ can be used to count the number of last-level cache lookups in C-Box 0 on a Skyl
|
|||||||
|
|
||||||
## Pausing Performance Counting
|
## Pausing Performance Counting
|
||||||
|
|
||||||
If the `-no_mem` option is used, nanoBench provides a feature to temporarily pause performance counting. This is enabled by including the *magic* byte sequences `0xF0b513b1C2813F04` (for stopping the counters), and `0xE0b513b1C2813F04` (for restarting them) in the code of the microbenchmark.
|
If the `-no_mem` option is used, nanoBench provides a feature to temporarily pause performance counting. This is enabled by including the *magic* byte sequences `0xF0B513B1C2813F04` (for stopping the counters), and `0xE0B513B1C2813F04` (for restarting them) in the code of the microbenchmark.
|
||||||
|
|
||||||
Using this feature incurs a certain timing overhead that will be included in the measurement results. It is therefore mainly useful for microbenchmarks that do not measure time but rather, e.g., cache hits or misses, such as the microbenchmarks generated by the tools in [tools/CacheAnalyzer](tools/CacheAnalyzer).
|
Using this feature incurs a certain timing overhead that will be included in the measurement results. It is therefore mainly useful for microbenchmarks that do not measure time but rather, e.g., cache hits or misses, such as the microbenchmarks generated by the tools in [tools/CacheAnalyzer](tools/CacheAnalyzer).
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ int drain_frontend = DRAIN_FRONTEND_DEFAULT;
|
|||||||
int no_mem = NO_MEM_DEFAULT;
|
int no_mem = NO_MEM_DEFAULT;
|
||||||
int no_normalization = NO_NORMALIZATION_DEFAULT;
|
int no_normalization = NO_NORMALIZATION_DEFAULT;
|
||||||
int basic_mode = BASIC_MODE_DEFAULT;
|
int basic_mode = BASIC_MODE_DEFAULT;
|
||||||
|
int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
|
||||||
int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
||||||
int verbose = VERBOSE_DEFAULT;
|
int verbose = VERBOSE_DEFAULT;
|
||||||
int debug = DEBUG_DEFAULT;
|
int debug = DEBUG_DEFAULT;
|
||||||
@@ -113,19 +114,13 @@ int check_cpuid() {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int n_available_counters = ((eax >> 8) & 0xFF);
|
n_programmable_counters = ((eax >> 8) & 0xFF);
|
||||||
print_user_verbose("Number of general-purpose performance counters: %u\n", n_available_counters);
|
print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters);
|
||||||
if (n_available_counters >= 4) {
|
if (n_programmable_counters < 2) {
|
||||||
n_programmable_counters = 4;
|
print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_programmable_counters);
|
||||||
} else if (n_available_counters >= 2) {
|
|
||||||
n_programmable_counters = 2;
|
|
||||||
} else {
|
|
||||||
print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_available_counters);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF));
|
print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF));
|
||||||
|
|
||||||
} else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) {
|
} else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) {
|
||||||
is_AMD_CPU = 1;
|
is_AMD_CPU = 1;
|
||||||
n_programmable_counters = 6;
|
n_programmable_counters = 6;
|
||||||
@@ -299,34 +294,32 @@ void write_msr(unsigned int msr, uint64_t value) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os) {
|
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os) {
|
||||||
if (is_Intel_CPU) {
|
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
|
||||||
|
|
||||||
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
|
uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL);
|
||||||
// disable fixed counters
|
// disable fixed counters
|
||||||
fixed_ctrl &= ~((1 << 12) - 1);
|
fixed_ctrl &= ~((1 << 12) - 1);
|
||||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||||
// clear
|
// clear
|
||||||
for (int i=0; i<3; i++) {
|
for (int i=0; i<3; i++) {
|
||||||
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
write_msr(MSR_IA32_FIXED_CTR0+i, 0);
|
||||||
}
|
|
||||||
//enable fixed counters
|
|
||||||
fixed_ctrl |= (os << 8) | (os << 4) | os;
|
|
||||||
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
|
|
||||||
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
|
||||||
}
|
}
|
||||||
|
//enable fixed counters
|
||||||
|
fixed_ctrl |= (os << 8) | (os << 4) | os;
|
||||||
|
fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1);
|
||||||
|
write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr, unsigned int os, char* descriptions[]) {
|
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]) {
|
||||||
if (is_Intel_CPU) {
|
if (is_Intel_CPU) {
|
||||||
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL);
|
||||||
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
global_ctrl |= ((uint64_t)7 << 32) | 15;
|
||||||
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl);
|
||||||
|
|
||||||
for (int i=0; i<n_programmable_counters; i++) {
|
for (int i=0; i<n_counters; i++) {
|
||||||
// clear
|
// clear
|
||||||
write_msr(MSR_IA32_PMC0+i, 0);
|
write_msr(MSR_IA32_PMC0+i, 0);
|
||||||
|
|
||||||
@@ -367,7 +360,7 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i=0; i<n_programmable_counters; i++) {
|
for (int i=0; i<n_counters; i++) {
|
||||||
// clear
|
// clear
|
||||||
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
|
write_msr(CORE_X86_MSR_PERF_CTR+(2*i), 0);
|
||||||
|
|
||||||
@@ -610,10 +603,10 @@ void create_and_run_one_time_init_code() {
|
|||||||
((void(*)(void))runtime_one_time_init_code)();
|
((void(*)(void))runtime_one_time_init_code)();
|
||||||
}
|
}
|
||||||
|
|
||||||
void run_warmup_experiment(char* measurement_template) {
|
void run_initial_warmup_experiment() {
|
||||||
if (!initial_warm_up_count) return;
|
if (!initial_warm_up_count) return;
|
||||||
|
|
||||||
create_runtime_code(measurement_template, unroll_count, loop_count);
|
create_runtime_code((char*)&initial_warm_up_template, unroll_count, loop_count);
|
||||||
|
|
||||||
for (int i=0; i<initial_warm_up_count; i++) {
|
for (int i=0; i<initial_warm_up_count; i++) {
|
||||||
((void(*)(void))runtime_code)();
|
((void(*)(void))runtime_code)();
|
||||||
@@ -735,8 +728,7 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) {
|
|||||||
|
|
||||||
void measurement_template_Intel_2() {
|
void measurement_template_Intel_2() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -784,8 +776,7 @@ void measurement_template_Intel_2() {
|
|||||||
|
|
||||||
void measurement_template_Intel_4() {
|
void measurement_template_Intel_4() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -851,8 +842,7 @@ void measurement_template_Intel_4() {
|
|||||||
|
|
||||||
void measurement_template_Intel_noMem_2() {
|
void measurement_template_Intel_noMem_2() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
"mov r9, 0 \n"
|
"mov r9, 0 \n"
|
||||||
@@ -887,8 +877,7 @@ void measurement_template_Intel_noMem_2() {
|
|||||||
|
|
||||||
void measurement_template_Intel_noMem_4() {
|
void measurement_template_Intel_noMem_4() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
"mov r9, 0 \n"
|
"mov r9, 0 \n"
|
||||||
@@ -943,8 +932,7 @@ void measurement_template_Intel_noMem_4() {
|
|||||||
|
|
||||||
void measurement_template_AMD() {
|
void measurement_template_AMD() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -1028,8 +1016,7 @@ void measurement_template_AMD() {
|
|||||||
|
|
||||||
void measurement_template_AMD_noMem() {
|
void measurement_template_AMD_noMem() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
"mov r9, 0 \n"
|
"mov r9, 0 \n"
|
||||||
@@ -1104,8 +1091,7 @@ void measurement_template_AMD_noMem() {
|
|||||||
|
|
||||||
void measurement_FF_template_Intel() {
|
void measurement_FF_template_Intel() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -1168,8 +1154,7 @@ void measurement_FF_template_Intel() {
|
|||||||
|
|
||||||
void measurement_FF_template_Intel_noMem() {
|
void measurement_FF_template_Intel_noMem() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
"mov r9, 0 \n"
|
"mov r9, 0 \n"
|
||||||
@@ -1222,8 +1207,7 @@ void measurement_FF_template_Intel_noMem() {
|
|||||||
|
|
||||||
void measurement_FF_template_AMD() {
|
void measurement_FF_template_AMD() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -1278,8 +1262,7 @@ void measurement_FF_template_AMD() {
|
|||||||
|
|
||||||
void measurement_FF_template_AMD_noMem() {
|
void measurement_FF_template_AMD_noMem() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
"mov r9, 0 \n"
|
"mov r9, 0 \n"
|
||||||
@@ -1321,8 +1304,7 @@ void measurement_FF_template_AMD_noMem() {
|
|||||||
|
|
||||||
void measurement_RDTSC_template() {
|
void measurement_RDTSC_template() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -1356,8 +1338,7 @@ void measurement_RDTSC_template() {
|
|||||||
|
|
||||||
void measurement_RDTSC_template_noMem() {
|
void measurement_RDTSC_template_noMem() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||||
@@ -1380,8 +1361,7 @@ void measurement_RDTSC_template_noMem() {
|
|||||||
|
|
||||||
void measurement_RDMSR_template() {
|
void measurement_RDMSR_template() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"push rax \n"
|
"push rax \n"
|
||||||
"lahf \n"
|
"lahf \n"
|
||||||
@@ -1419,8 +1399,7 @@ void measurement_RDMSR_template() {
|
|||||||
|
|
||||||
void measurement_RDMSR_template_noMem() {
|
void measurement_RDMSR_template_noMem() {
|
||||||
SAVE_REGS_FLAGS();
|
SAVE_REGS_FLAGS();
|
||||||
asm volatile(
|
asm(".intel_syntax noprefix \n"
|
||||||
".intel_syntax noprefix \n"
|
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
"mov r8, 0 \n"
|
"mov r8, 0 \n"
|
||||||
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||||
@@ -1448,4 +1427,14 @@ void one_time_init_template() {
|
|||||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||||
RESTORE_REGS_FLAGS();
|
RESTORE_REGS_FLAGS();
|
||||||
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void initial_warm_up_template() {
|
||||||
|
SAVE_REGS_FLAGS();
|
||||||
|
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)" \n"
|
||||||
|
".quad "STRINGIFY(MAGIC_BYTES_PFC_START)"\n"
|
||||||
|
".quad "STRINGIFY(MAGIC_BYTES_CODE)" \n"
|
||||||
|
".quad "STRINGIFY(MAGIC_BYTES_PFC_END)" \n");
|
||||||
|
RESTORE_REGS_FLAGS();
|
||||||
|
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||||
|
}
|
||||||
|
|||||||
@@ -115,6 +115,10 @@ extern int no_normalization;
|
|||||||
extern int basic_mode;
|
extern int basic_mode;
|
||||||
#define BASIC_MODE_DEFAULT 0;
|
#define BASIC_MODE_DEFAULT 0;
|
||||||
|
|
||||||
|
// If enabled, the result includes measurements using the fixed-function performance counters and the RDTSC instruction.
|
||||||
|
extern int use_fixed_counters;
|
||||||
|
#define USE_FIXED_COUNTERS_DEFAULT 0;
|
||||||
|
|
||||||
enum agg_enum {AVG_20_80, MIN, MAX, MED};
|
enum agg_enum {AVG_20_80, MIN, MAX, MED};
|
||||||
extern int aggregate_function;
|
extern int aggregate_function;
|
||||||
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;
|
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;
|
||||||
@@ -170,7 +174,7 @@ extern char* msr_config_file_content;
|
|||||||
extern int is_Intel_CPU;
|
extern int is_Intel_CPU;
|
||||||
extern int is_AMD_CPU;
|
extern int is_AMD_CPU;
|
||||||
|
|
||||||
#define MAX_PROGRAMMABLE_COUNTERS 6
|
#define MAX_PROGRAMMABLE_COUNTERS 8
|
||||||
extern int n_programmable_counters;
|
extern int n_programmable_counters;
|
||||||
|
|
||||||
// Pointers to memory regions that are writable and executable.
|
// Pointers to memory regions that are writable and executable.
|
||||||
@@ -212,19 +216,19 @@ uint64_t read_msr(unsigned int msr);
|
|||||||
void write_msr(unsigned int msr, uint64_t value);
|
void write_msr(unsigned int msr, uint64_t value);
|
||||||
|
|
||||||
// Enables and clears the fixed-function performance counters.
|
// Enables and clears the fixed-function performance counters.
|
||||||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
|
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os);
|
||||||
|
|
||||||
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
||||||
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_programmable_counters many configurations from this array;
|
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array;
|
||||||
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
|
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
|
||||||
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr, unsigned int os, char* descriptions[]);
|
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]);
|
||||||
|
|
||||||
void configure_MSRs(struct msr_config config);
|
void configure_MSRs(struct msr_config config);
|
||||||
|
|
||||||
size_t get_required_runtime_code_length(void);
|
size_t get_required_runtime_code_length(void);
|
||||||
|
|
||||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
||||||
void run_warmup_experiment(char* measurement_template);
|
void run_initial_warmup_experiment(void);
|
||||||
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
||||||
void create_and_run_one_time_init_code(void);
|
void create_and_run_one_time_init_code(void);
|
||||||
|
|
||||||
@@ -236,22 +240,22 @@ long long ll_abs(long long val);
|
|||||||
void print_all_measurement_results(int64_t* results[], int n_counters);
|
void print_all_measurement_results(int64_t* results[], int n_counters);
|
||||||
|
|
||||||
|
|
||||||
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
|
#define MAGIC_BYTES_INIT 0x10B513B1C2813F04
|
||||||
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
|
#define MAGIC_BYTES_CODE 0x20B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
|
#define MAGIC_BYTES_RSP_ADDRESS 0x30B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RUNTIME_R14 0x40b513b1C2813F04
|
#define MAGIC_BYTES_RUNTIME_R14 0x40B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RUNTIME_RBP 0x50b513b1C2813F04
|
#define MAGIC_BYTES_RUNTIME_RBP 0x50B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RUNTIME_RDI 0x60b513b1C2813F04
|
#define MAGIC_BYTES_RUNTIME_RDI 0x60B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
|
#define MAGIC_BYTES_RUNTIME_RSI 0x70B513B1C2813F04
|
||||||
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
|
#define MAGIC_BYTES_RUNTIME_RSP 0x80B513B1C2813F04
|
||||||
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
|
#define MAGIC_BYTES_PFC 0x90B513B1C2813F04
|
||||||
#define MAGIC_BYTES_MSR 0xA0b513b1C2813F04
|
#define MAGIC_BYTES_MSR 0xA0B513B1C2813F04
|
||||||
#define MAGIC_BYTES_TEMPLATE_END 0xB0b513b1C2813F04
|
#define MAGIC_BYTES_TEMPLATE_END 0xB0B513B1C2813F04
|
||||||
#define MAGIC_BYTES_PFC_START 0xC0b513b1C2813F04
|
#define MAGIC_BYTES_PFC_START 0xC0B513B1C2813F04
|
||||||
#define MAGIC_BYTES_PFC_END 0xD0b513b1C2813F04
|
#define MAGIC_BYTES_PFC_END 0xD0B513B1C2813F04
|
||||||
|
|
||||||
#define MAGIC_BYTES_CODE_PFC_START 0xE0b513b1C2813F04
|
#define MAGIC_BYTES_CODE_PFC_START 0xE0B513B1C2813F04
|
||||||
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0b513b1C2813F04
|
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0B513B1C2813F04
|
||||||
|
|
||||||
|
|
||||||
#define STRINGIFY2(X) #X
|
#define STRINGIFY2(X) #X
|
||||||
@@ -275,6 +279,7 @@ void measurement_RDTSC_template_noMem(void);
|
|||||||
void measurement_RDMSR_template(void);
|
void measurement_RDMSR_template(void);
|
||||||
void measurement_RDMSR_template_noMem(void);
|
void measurement_RDMSR_template_noMem(void);
|
||||||
void one_time_init_template(void);
|
void one_time_init_template(void);
|
||||||
|
void initial_warm_up_template(void);
|
||||||
|
|
||||||
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
||||||
#define SAVE_REGS_FLAGS() \
|
#define SAVE_REGS_FLAGS() \
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3DH and 06_47H.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_3DH and 06_47H.
|
||||||
# See Table 19-8 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-8 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
C2.01 UOPS_RETIRED.ALL
|
C0.00 INST_RETIRED
|
||||||
B1.01 UOPS_EXECUTED.THREAD
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_EXECUTED
|
||||||
|
C2.01 UOPS_RETIRED
|
||||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||||
@@ -14,7 +16,7 @@ A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
|||||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||||
|
|||||||
@@ -157,7 +157,7 @@
|
|||||||
32.C0 L2_NO_REQ.ALL_CORES
|
32.C0 L2_NO_REQ.ALL_CORES
|
||||||
3A.00 EIST_TRANS
|
3A.00 EIST_TRANS
|
||||||
3B.C0 THERMAL_TRIP
|
3B.C0 THERMAL_TRIP
|
||||||
3C.00 CPU_CLK_UNHALTED.
|
3C.00 CPU_CLK_UNHALTED.CORE_P
|
||||||
3C.01 CPU_CLK_UNHALTED.BUS
|
3C.01 CPU_CLK_UNHALTED.BUS
|
||||||
3C.02 CPU_CLK_UNHALTED.NO
|
3C.02 CPU_CLK_UNHALTED.NO
|
||||||
40.08 L1D_CACHE_LD.M
|
40.08 L1D_CACHE_LD.M
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
# Performance monitoring events for processors based on the Core and the Enhanced Core microarchitectures.
|
# Performance monitoring events for processors based on the Core and the Enhanced Core microarchitectures.
|
||||||
# See Table 19-26 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-26 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
|
3C.00 CORE_CYCLES
|
||||||
|
C0.00 INST_RETIRED
|
||||||
C2.07 UOPS_RETIRED.FUSED
|
C2.07 UOPS_RETIRED.FUSED
|
||||||
C2.0F UOPS_RETIRED.ANY
|
C2.0F UOPS_RETIRED.ANY
|
||||||
A0.00 RS_UOPS_DISPATCHED
|
A0.00 RS_UOPS_DISPATCHED
|
||||||
@@ -28,5 +30,3 @@ CB.08.CTR=0 MEM_LOAD_RETIRED.L2_LINE_MISS
|
|||||||
29.72 L2_LD.THIS_CORE.ALL_INCL.S
|
29.72 L2_LD.THIS_CORE.ALL_INCL.S
|
||||||
29.71 L2_LD.THIS_CORE.ALL_INCL.I
|
29.71 L2_LD.THIS_CORE.ALL_INCL.I
|
||||||
29.7E L2_LD.THIS_CORE.ALL_INCL.MES
|
29.7E L2_LD.THIS_CORE.ALL_INCL.MES
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -2,8 +2,10 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3CH, 06_45H and 06_46H.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_3CH, 06_45H and 06_46H.
|
||||||
# See Table 19-10 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-10 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
C2.01 UOPS_RETIRED.ALL
|
C0.00 INST_RETIRED
|
||||||
|
0E.01 UOPS_ISSUED
|
||||||
|
C2.01 UOPS_RETIRED
|
||||||
A1.01 UOPS_EXECUTED_PORT.PORT_0
|
A1.01 UOPS_EXECUTED_PORT.PORT_0
|
||||||
A1.02 UOPS_EXECUTED_PORT.PORT_1
|
A1.02 UOPS_EXECUTED_PORT.PORT_1
|
||||||
A1.04 UOPS_EXECUTED_PORT.PORT_2
|
A1.04 UOPS_EXECUTED_PORT.PORT_2
|
||||||
|
|||||||
@@ -2,8 +2,10 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_7DH and 06_7EH.
|
||||||
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
|
# See Table 19-5 of Intel's "System Programming Guide" (May 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
B1.01 UOPS_EXECUTED.THREAD
|
C0.00 INST_RETIRED
|
||||||
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_EXECUTED
|
||||||
A1.01 UOPS_DISPATCHED.PORT_0
|
A1.01 UOPS_DISPATCHED.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED.PORT_1
|
A1.02 UOPS_DISPATCHED.PORT_1
|
||||||
A1.04 UOPS_DISPATCHED.PORT_2_3
|
A1.04 UOPS_DISPATCHED.PORT_2_3
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_3AH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_3AH.
|
||||||
# See Table 19-14 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-14 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
B1.01 UOPS_EXECUTED.THREAD
|
C0.00 INST_RETIRED
|
||||||
C2.01 UOPS_RETIRED.ALL
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_EXECUTED
|
||||||
|
C2.01 UOPS_RETIRED
|
||||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||||
@@ -12,7 +14,7 @@ A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
|||||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||||
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
D1.08 MEM_LOAD_UOPS_RETIRED.L1_MISS
|
||||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||||
|
|||||||
@@ -2,7 +2,9 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_57H and 06_85H.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_57H and 06_85H.
|
||||||
# See Table 19-7 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-7 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
C2.10 UOPS_RETIRED.ALL
|
3C.00 CORE_CYCLES
|
||||||
|
C0.00 INST_RETIRED
|
||||||
|
C2.10 UOPS_RETIRED
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
04.01 MEM_UOPS_RETIRED.L1_MISS_LOADS
|
04.01 MEM_UOPS_RETIRED.L1_MISS_LOADS
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_1AH, 06_1EH, 06_1FH, and 06_2EH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_1AH, 06_1EH, 06_1FH, and 06_2EH.
|
||||||
# See Table 19-20 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-20 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
|
3C.00 CORE_CYCLES
|
||||||
|
C0.00 INST_RETIRED
|
||||||
0E.01 UOPS_ISSUED.ANY
|
0E.01 UOPS_ISSUED.ANY
|
||||||
0E.02 UOPS_ISSUED.FUSED
|
0E.02 UOPS_ISSUED.FUSED
|
||||||
C2.01 UOPS_RETIRED.ANY
|
C2.01 UOPS_RETIRED
|
||||||
B1.01 UOPS_EXECUTED.PORT0
|
B1.01 UOPS_EXECUTED.PORT0
|
||||||
B1.02 UOPS_EXECUTED.PORT1
|
B1.02 UOPS_EXECUTED.PORT1
|
||||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_2AH and 06_2DH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_2AH and 06_2DH.
|
||||||
# See Table 19-16 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-16 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
B1.01 UOPS_DISPATCHED.THREAD
|
C0.00 INST_RETIRED
|
||||||
C2.01 UOPS_RETIRED.ALL
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_DISPATCHED
|
||||||
|
C2.01 UOPS_RETIRED
|
||||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||||
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
A1.0C UOPS_DISPATCHED_PORT.PORT_2
|
||||||
@@ -12,7 +14,7 @@ A1.30 UOPS_DISPATCHED_PORT.PORT_3
|
|||||||
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
A1.40 UOPS_DISPATCHED_PORT.PORT_4
|
||||||
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
A1.80 UOPS_DISPATCHED_PORT.PORT_5
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
D1.01 MEM_LOAD_UOPS_RETIRED.L1_HIT
|
||||||
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
D1.02 MEM_LOAD_UOPS_RETIRED.L2_HIT
|
||||||
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
D1.04 MEM_LOAD_UOPS_RETIRED.LLC_HIT
|
||||||
|
|||||||
@@ -2,8 +2,10 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_4EH, 06_5EH, 06_8EH, and 06_9EH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_4EH, 06_5EH, 06_8EH, and 06_9EH.
|
||||||
# See Table 19-5 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-5 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
B1.01 UOPS_EXECUTED.THREAD
|
C0.00 INST_RETIRED
|
||||||
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_EXECUTED
|
||||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||||
@@ -13,7 +15,7 @@ A1.20 UOPS_DISPATCHED_PORT.PORT_5
|
|||||||
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
A1.40 UOPS_DISPATCHED_PORT.PORT_6
|
||||||
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
A1.80 UOPS_DISPATCHED_PORT.PORT_7
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
D1.01 MEM_LOAD_RETIRED.L1_HIT
|
||||||
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
D1.08 MEM_LOAD_RETIRED.L1_MISS
|
||||||
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
D1.02 MEM_LOAD_RETIRED.L2_HIT
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_25H and 06_2CH.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_25H and 06_2CH.
|
||||||
# See Table 19-22 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-22 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
|
3C.00 CORE_CYCLES
|
||||||
|
C0.00 INST_RETIRED
|
||||||
0E.01 UOPS_ISSUED.ANY
|
0E.01 UOPS_ISSUED.ANY
|
||||||
0E.02 UOPS_ISSUED.FUSED
|
0E.02 UOPS_ISSUED.FUSED
|
||||||
C2.01 UOPS_RETIRED.ANY
|
C2.01 UOPS_RETIRED
|
||||||
B1.01 UOPS_EXECUTED.PORT0
|
B1.01 UOPS_EXECUTED.PORT0
|
||||||
B1.02 UOPS_EXECUTED.PORT1
|
B1.02 UOPS_EXECUTED.PORT1
|
||||||
B1.04 UOPS_EXECUTED.PORT2_CORE
|
B1.04 UOPS_EXECUTED.PORT2_CORE
|
||||||
@@ -12,7 +14,7 @@ B1.08 UOPS_EXECUTED.PORT3_CORE
|
|||||||
B1.10 UOPS_EXECUTED.PORT4_CORE
|
B1.10 UOPS_EXECUTED.PORT4_CORE
|
||||||
B1.20 UOPS_EXECUTED.PORT5
|
B1.20 UOPS_EXECUTED.PORT5
|
||||||
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
C4.00 BR_INST_RETIRED.ALL_BRANCHES
|
||||||
C5.04 BR_MISP_RETIRED.ALL_BRANCHES
|
C5.00 BR_MISP_RETIRED.ALL_BRANCHES
|
||||||
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
CB.01 MEM_LOAD_RETIRED.L1D_HIT
|
||||||
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
CB.02 MEM_LOAD_RETIRED.L2_HIT
|
||||||
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
CB.04 MEM_LOAD_RETIRED.L3_UNSHARED_HIT
|
||||||
|
|||||||
@@ -2,8 +2,10 @@
|
|||||||
# Applies to processors with DisplayFamily_DisplayModel of 06_55H.
|
# Applies to processors with DisplayFamily_DisplayModel of 06_55H.
|
||||||
# See Table 19-3 of Intel's "System Programming Guide" (Jan. 2019)
|
# See Table 19-3 of Intel's "System Programming Guide" (Jan. 2019)
|
||||||
|
|
||||||
0E.01 UOPS_ISSUED.ANY
|
3C.00 CORE_CYCLES
|
||||||
B1.01 UOPS_EXECUTED.THREAD
|
C0.00 INST_RETIRED
|
||||||
|
0E.01 UOPS_ISSUED
|
||||||
|
B1.01 UOPS_EXECUTED
|
||||||
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
A1.01 UOPS_DISPATCHED_PORT.PORT_0
|
||||||
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
A1.02 UOPS_DISPATCHED_PORT.PORT_1
|
||||||
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
A1.04 UOPS_DISPATCHED_PORT.PORT_2
|
||||||
|
|||||||
@@ -54,6 +54,9 @@ while [ "$1" ]; do
|
|||||||
elif [[ "$1" == -con* ]]; then
|
elif [[ "$1" == -con* ]]; then
|
||||||
echo -n "$2" > /sys/nb/config
|
echo -n "$2" > /sys/nb/config
|
||||||
shift 2
|
shift 2
|
||||||
|
elif [[ "$1" == -f* ]]; then
|
||||||
|
echo "1" > /sys/nb/fixed_counters
|
||||||
|
shift
|
||||||
elif [[ "$1" == -msr* ]]; then
|
elif [[ "$1" == -msr* ]]; then
|
||||||
echo -n "$2" > /sys/nb/msr_config
|
echo -n "$2" > /sys/nb/msr_config
|
||||||
shift 2
|
shift 2
|
||||||
@@ -112,6 +115,7 @@ while [ "$1" ]; do
|
|||||||
echo " -code_init <filename>: Binary file containing code to be executed once in the beginning."
|
echo " -code_init <filename>: Binary file containing code to be executed once in the beginning."
|
||||||
echo " -code_late_init <filename>: Binary file containing code to be executed once immediately before the code to be benchmarked."
|
echo " -code_late_init <filename>: Binary file containing code to be executed once immediately before the code to be benchmarked."
|
||||||
echo " -config <filename>: File with performance counter event specifications."
|
echo " -config <filename>: File with performance counter event specifications."
|
||||||
|
# Usage text for -fixed_counters. No trailing "\n": echo already appends a newline,
# and without -e bash would print the backslash sequence literally.
echo " -fixed_counters: Reads the fixed-function performance counters."
|
||||||
echo " -n_measurements <n>: Number of times the measurements are repeated."
|
echo " -n_measurements <n>: Number of times the measurements are repeated."
|
||||||
echo " -unroll_count <n>: Number of copies of the benchmark code inside the inner loop."
|
echo " -unroll_count <n>: Number of copies of the benchmark code inside the inner loop."
|
||||||
echo " -loop_count <n>: Number of iterations of the inner loop."
|
echo " -loop_count <n>: Number of iterations of the inner loop."
|
||||||
@@ -135,4 +139,9 @@ while [ "$1" ]; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog)
|
||||||
|
echo 0 > /proc/sys/kernel/nmi_watchdog
|
||||||
|
|
||||||
$taskset cat /proc/nanoBench
|
$taskset cat /proc/nanoBench
|
||||||
|
|
||||||
|
echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog
|
||||||
|
|||||||
129
kernel/nb_km.c
129
kernel/nb_km.c
@@ -219,6 +219,15 @@ static ssize_t msr_config_store(struct kobject *kobj, struct kobj_attribute *att
|
|||||||
}
|
}
|
||||||
static struct kobj_attribute msr_config_attribute =__ATTR(msr_config, 0660, msr_config_show, msr_config_store);
|
static struct kobj_attribute msr_config_attribute =__ATTR(msr_config, 0660, msr_config_show, msr_config_store);
|
||||||
|
|
||||||
|
static ssize_t fixed_counters_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||||
|
return sprintf(buf, "%u\n", use_fixed_counters);
|
||||||
|
}
|
||||||
|
static ssize_t fixed_counters_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||||
|
sscanf(buf, "%u", &use_fixed_counters);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
static struct kobj_attribute fixed_counters_attribute =__ATTR(fixed_counters, 0660, fixed_counters_show, fixed_counters_store);
|
||||||
|
|
||||||
static ssize_t unroll_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
static ssize_t unroll_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||||
return sprintf(buf, "%ld\n", unroll_count);
|
return sprintf(buf, "%ld\n", unroll_count);
|
||||||
}
|
}
|
||||||
@@ -455,6 +464,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
|
|||||||
no_mem = NO_MEM_DEFAULT;
|
no_mem = NO_MEM_DEFAULT;
|
||||||
no_normalization = NO_NORMALIZATION_DEFAULT;
|
no_normalization = NO_NORMALIZATION_DEFAULT;
|
||||||
basic_mode = BASIC_MODE_DEFAULT;
|
basic_mode = BASIC_MODE_DEFAULT;
|
||||||
|
use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT;
|
||||||
aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
||||||
verbose = VERBOSE_DEFAULT;
|
verbose = VERBOSE_DEFAULT;
|
||||||
alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
|
alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
|
||||||
@@ -500,61 +510,65 @@ static int show(struct seq_file *m, void *v) {
|
|||||||
char buf[100];
|
char buf[100];
|
||||||
char* measurement_template;
|
char* measurement_template;
|
||||||
|
|
||||||
|
create_and_run_one_time_init_code();
|
||||||
|
run_initial_warmup_experiment();
|
||||||
|
|
||||||
/*********************************
|
/*********************************
|
||||||
* Fixed-function counters.
|
* Fixed-function counters.
|
||||||
********************************/
|
********************************/
|
||||||
if (is_AMD_CPU) {
|
if (use_fixed_counters) {
|
||||||
if (no_mem) {
|
if (is_AMD_CPU) {
|
||||||
measurement_template = (char*)&measurement_FF_template_AMD_noMem;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_AMD_noMem;
|
||||||
|
} else {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_AMD;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_FF_template_AMD;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
||||||
|
} else {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_Intel;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
if (no_mem) {
|
if (is_AMD_CPU) {
|
||||||
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count);
|
||||||
|
run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
pr_debug("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results_base, 3);
|
||||||
|
pr_debug("RDTSC, MPERF, and and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_FF_template_Intel;
|
configure_perf_ctrs_FF_Intel(0, 1);
|
||||||
|
|
||||||
|
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
||||||
|
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
pr_debug("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results_base, 4);
|
||||||
|
pr_debug("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results, 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
||||||
|
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
configure_perf_ctrs_FF(0, 1);
|
|
||||||
create_and_run_one_time_init_code();
|
|
||||||
run_warmup_experiment(measurement_template);
|
|
||||||
|
|
||||||
if (is_AMD_CPU) {
|
|
||||||
run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count);
|
|
||||||
run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count);
|
|
||||||
|
|
||||||
if (verbose) {
|
|
||||||
pr_debug("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
|
||||||
print_all_measurement_results(measurement_results_base, 3);
|
|
||||||
pr_debug("RDTSC, MPERF, and and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
|
||||||
print_all_measurement_results(measurement_results, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
|
|
||||||
} else {
|
|
||||||
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
|
||||||
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
|
||||||
|
|
||||||
if (verbose) {
|
|
||||||
pr_debug("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
|
||||||
print_all_measurement_results(measurement_results_base, 4);
|
|
||||||
pr_debug("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
|
||||||
print_all_measurement_results(measurement_results, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
|
||||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*********************************
|
/*********************************
|
||||||
* Programmable counters.
|
* Programmable counters.
|
||||||
********************************/
|
********************************/
|
||||||
|
int n_used_counters = n_programmable_counters;
|
||||||
if (is_AMD_CPU) {
|
if (is_AMD_CPU) {
|
||||||
if (no_mem) {
|
if (no_mem) {
|
||||||
measurement_template = (char*)&measurement_template_AMD_noMem;
|
measurement_template = (char*)&measurement_template_AMD_noMem;
|
||||||
@@ -562,15 +576,17 @@ static int show(struct seq_file *m, void *v) {
|
|||||||
measurement_template = (char*)&measurement_template_AMD;
|
measurement_template = (char*)&measurement_template_AMD;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (no_mem) {
|
if (n_used_counters >= 4) {
|
||||||
if (n_programmable_counters >= 4) {
|
n_used_counters = 4;
|
||||||
measurement_template = (char*)&measurement_template_Intel_noMem_4;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_template_Intel_noMem_4;
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_template_Intel_noMem_2;
|
measurement_template = (char*)&measurement_template_Intel_4;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (n_programmable_counters >= 4) {
|
n_used_counters = 2;
|
||||||
measurement_template = (char*)&measurement_template_Intel_4;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_template_Intel_noMem_2;
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_template_Intel_2;
|
measurement_template = (char*)&measurement_template_Intel_2;
|
||||||
}
|
}
|
||||||
@@ -580,20 +596,20 @@ static int show(struct seq_file *m, void *v) {
|
|||||||
size_t next_pfc_config = 0;
|
size_t next_pfc_config = 0;
|
||||||
while (next_pfc_config < n_pfc_configs) {
|
while (next_pfc_config < n_pfc_configs) {
|
||||||
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
||||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, 1, 1, pfc_descriptions);
|
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, 1, 1, pfc_descriptions);
|
||||||
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
|
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
|
||||||
|
|
||||||
run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count);
|
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
|
||||||
run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count);
|
run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
if (verbose) {
|
if (verbose) {
|
||||||
pr_debug("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
pr_debug("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
print_all_measurement_results(measurement_results_base, n_programmable_counters);
|
print_all_measurement_results(measurement_results_base, n_used_counters);
|
||||||
pr_debug("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
pr_debug("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
print_all_measurement_results(measurement_results, n_programmable_counters);
|
print_all_measurement_results(measurement_results, n_used_counters);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t c=0; c < n_programmable_counters; c++) {
|
for (size_t c=0; c < n_used_counters; c++) {
|
||||||
if (pfc_descriptions[c]) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
|
if (pfc_descriptions[c]) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -715,6 +731,7 @@ static int __init nb_init(void) {
|
|||||||
error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr);
|
||||||
error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
|
||||||
error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr);
|
||||||
|
error |= sysfs_create_file(nb_kobject, &fixed_counters_attribute.attr);
|
||||||
error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
|
||||||
error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
|
||||||
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
|
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import collections
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
PFC_START_ASM = '.quad 0xE0b513b1C2813F04'
|
PFC_START_ASM = '.quad 0xE0B513B1C2813F04'
|
||||||
PFC_STOP_ASM = '.quad 0xF0b513b1C2813F04'
|
PFC_STOP_ASM = '.quad 0xF0B513B1C2813F04'
|
||||||
|
|
||||||
def writeFile(fileName, content):
|
def writeFile(fileName, content):
|
||||||
with open(fileName, 'w') as f:
|
with open(fileName, 'w') as f:
|
||||||
@@ -53,9 +53,9 @@ paramDict = dict()
|
|||||||
|
|
||||||
# Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
|
# Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
|
||||||
# Otherwise, reset() needs to be called first.
|
# Otherwise, reset() needs to be called first.
|
||||||
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None,
|
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, fixedCounters=None, nMeasurements=None, unrollCount=None,
|
||||||
warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, aggregateFunction=None,
|
loopCount=None, warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None,
|
||||||
basicMode=None, noMem=None, noNormalization=None, verbose=None):
|
aggregateFunction=None, basicMode=None, noMem=None, noNormalization=None, verbose=None):
|
||||||
if not ramdiskCreated: createRamdisk()
|
if not ramdiskCreated: createRamdisk()
|
||||||
|
|
||||||
if config is not None:
|
if config is not None:
|
||||||
@@ -74,6 +74,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
|
|||||||
if msrConfigFile is not None:
|
if msrConfigFile is not None:
|
||||||
writeFile('/sys/nb/msr_config', msrConfigFile)
|
writeFile('/sys/nb/msr_config', msrConfigFile)
|
||||||
|
|
||||||
|
if fixedCounters is not None:
|
||||||
|
if paramDict.get('fixedCounters', None) != fixedCounters:
|
||||||
|
writeFile('/sys/nb/fixed_counters', str(int(fixedCounters)))
|
||||||
|
paramDict['fixedCounters'] = fixedCounters
|
||||||
|
|
||||||
if nMeasurements is not None:
|
if nMeasurements is not None:
|
||||||
if paramDict.get('nMeasurements', None) != nMeasurements:
|
if paramDict.get('nMeasurements', None) != nMeasurements:
|
||||||
writeFile('/sys/nb/n_measurements', str(nMeasurements))
|
writeFile('/sys/nb/n_measurements', str(nMeasurements))
|
||||||
|
|||||||
@@ -479,8 +479,8 @@ def getCodeForCacheExperiment(level, seq, initSeq, cacheSetList, cBox, cSlice, c
|
|||||||
|
|
||||||
def runCacheExperimentCode(code, initCode, oneTimeInitCode, loop, warmUpCount, codeOffset, nMeasurements, agg):
|
def runCacheExperimentCode(code, initCode, oneTimeInitCode, loop, warmUpCount, codeOffset, nMeasurements, agg):
|
||||||
resetNanoBench()
|
resetNanoBench()
|
||||||
setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig=getDefaultCacheMSRConfig(), nMeasurements=nMeasurements, unrollCount=1, loopCount=loop,
|
setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig=getDefaultCacheMSRConfig(), fixedCounters=True, nMeasurements=nMeasurements, unrollCount=1,
|
||||||
warmUpCount=warmUpCount, aggregateFunction=agg, basicMode=True, noMem=True, codeOffset=codeOffset, verbose=None)
|
loopCount=loop, warmUpCount=warmUpCount, aggregateFunction=agg, basicMode=True, noMem=True, codeOffset=codeOffset, verbose=None)
|
||||||
return runNanoBench(code=code, init=initCode, oneTimeInit=oneTimeInitCode)
|
return runNanoBench(code=code, init=initCode, oneTimeInit=oneTimeInitCode)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,8 @@ def main():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
resetNanoBench()
|
resetNanoBench()
|
||||||
setNanoBenchParameters(config=getDefaultCacheConfig(), nMeasurements=1, warmUpCount=0, unrollCount=1, loopCount=args.loop, basicMode=False, noMem=True)
|
setNanoBenchParameters(config=getDefaultCacheConfig(), fixedCounters=True, nMeasurements=1, warmUpCount=0, unrollCount=1, loopCount=args.loop,
|
||||||
|
basicMode=False, noMem=True)
|
||||||
|
|
||||||
nbDicts = []
|
nbDicts = []
|
||||||
xValues = []
|
xValues = []
|
||||||
|
|||||||
@@ -350,7 +350,7 @@ def configurePFCs(events):
|
|||||||
cfg = getEventConfig(event)
|
cfg = getEventConfig(event)
|
||||||
if cfg is not None:
|
if cfg is not None:
|
||||||
content += cfg + ' ' + event + '\n'
|
content += cfg + ' ' + event + '\n'
|
||||||
setNanoBenchParameters(config=content)
|
setNanoBenchParameters(config=content, fixedCounters=True)
|
||||||
|
|
||||||
|
|
||||||
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
|
InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit'])
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ void print_usage() {
|
|||||||
printf(" -code_late_init <filename>: Binary file containing code to be executed once immediately before the code to be benchmarked.\n");
|
printf(" -code_late_init <filename>: Binary file containing code to be executed once immediately before the code to be benchmarked.\n");
|
||||||
printf(" -code_one_time_init <filename>: Binary file containing code to be executed once before the first measurement\n");
|
printf(" -code_one_time_init <filename>: Binary file containing code to be executed once before the first measurement\n");
|
||||||
printf(" -config <filename>: File with performance counter event specifications.\n");
|
printf(" -config <filename>: File with performance counter event specifications.\n");
|
||||||
|
printf(" -fixed_counters: Reads the fixed-function performance counters.\n");
|
||||||
printf(" -n_measurements <n>: Number of times the measurements are repeated.\n");
|
printf(" -n_measurements <n>: Number of times the measurements are repeated.\n");
|
||||||
printf(" -unroll_count <n>: Number of copies of the benchmark code inside the inner loop.\n");
|
printf(" -unroll_count <n>: Number of copies of the benchmark code inside the inner loop.\n");
|
||||||
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
|
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
|
||||||
@@ -75,6 +76,7 @@ int main(int argc, char **argv) {
|
|||||||
{"code_late_init", required_argument, 0, 't'},
|
{"code_late_init", required_argument, 0, 't'},
|
||||||
{"code_one_time_init", required_argument, 0, 'o'},
|
{"code_one_time_init", required_argument, 0, 'o'},
|
||||||
{"config", required_argument, 0, 'f'},
|
{"config", required_argument, 0, 'f'},
|
||||||
|
{"fixed_counters", no_argument, &use_fixed_counters, 1},
|
||||||
{"n_measurements", required_argument, 0, 'n'},
|
{"n_measurements", required_argument, 0, 'n'},
|
||||||
{"unroll_count", required_argument, 0, 'u'},
|
{"unroll_count", required_argument, 0, 'u'},
|
||||||
{"loop_count", required_argument, 0, 'l'},
|
{"loop_count", required_argument, 0, 'l'},
|
||||||
@@ -235,9 +237,6 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*************************************
|
|
||||||
* Fixed-function counters
|
|
||||||
************************************/
|
|
||||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||||
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
|
||||||
long base_loop_count = (basic_mode?0:loop_count);
|
long base_loop_count = (basic_mode?0:loop_count);
|
||||||
@@ -246,57 +245,63 @@ int main(int argc, char **argv) {
|
|||||||
char buf[100];
|
char buf[100];
|
||||||
char* measurement_template;
|
char* measurement_template;
|
||||||
|
|
||||||
if (is_AMD_CPU) {
|
|
||||||
if (no_mem) {
|
|
||||||
measurement_template = (char*)&measurement_RDTSC_template_noMem;
|
|
||||||
} else {
|
|
||||||
measurement_template = (char*)&measurement_RDTSC_template;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (no_mem) {
|
|
||||||
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
|
||||||
} else {
|
|
||||||
measurement_template = (char*)&measurement_FF_template_Intel;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
create_and_run_one_time_init_code();
|
create_and_run_one_time_init_code();
|
||||||
run_warmup_experiment(measurement_template);
|
run_initial_warmup_experiment();
|
||||||
|
|
||||||
if (is_AMD_CPU) {
|
/*************************************
|
||||||
run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count);
|
* Fixed-function counters
|
||||||
run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count);
|
************************************/
|
||||||
|
if (use_fixed_counters) {
|
||||||
if (verbose) {
|
if (is_AMD_CPU) {
|
||||||
printf("\nRDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
if (no_mem) {
|
||||||
print_all_measurement_results(measurement_results_base, 1);
|
measurement_template = (char*)&measurement_RDTSC_template_noMem;
|
||||||
printf("RDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
} else {
|
||||||
print_all_measurement_results(measurement_results, 1);
|
measurement_template = (char*)&measurement_RDTSC_template;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
|
||||||
|
} else {
|
||||||
|
measurement_template = (char*)&measurement_FF_template_Intel;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
if (is_AMD_CPU) {
|
||||||
} else {
|
run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count);
|
||||||
configure_perf_ctrs_FF(usr, os);
|
run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
if (verbose) {
|
||||||
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
printf("\nRDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results_base, 1);
|
||||||
|
printf("RDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results, 1);
|
||||||
|
}
|
||||||
|
|
||||||
if (verbose) {
|
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||||
printf("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
} else {
|
||||||
print_all_measurement_results(measurement_results_base, 4);
|
configure_perf_ctrs_FF_Intel(usr, os);
|
||||||
printf("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
|
||||||
print_all_measurement_results(measurement_results, 4);
|
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
||||||
|
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
printf("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results_base, 4);
|
||||||
|
printf("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
|
print_all_measurement_results(measurement_results, 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||||
|
printf("%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
||||||
|
printf("%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
||||||
|
printf("%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
|
||||||
printf("%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
|
||||||
printf("%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
|
||||||
printf("%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*************************************
|
/*************************************
|
||||||
* Programmable counters
|
* Programmable counters
|
||||||
************************************/
|
************************************/
|
||||||
|
int n_used_counters = n_programmable_counters;
|
||||||
if (is_AMD_CPU) {
|
if (is_AMD_CPU) {
|
||||||
if (no_mem) {
|
if (no_mem) {
|
||||||
measurement_template = (char*)&measurement_template_AMD_noMem;
|
measurement_template = (char*)&measurement_template_AMD_noMem;
|
||||||
@@ -304,15 +309,17 @@ int main(int argc, char **argv) {
|
|||||||
measurement_template = (char*)&measurement_template_AMD;
|
measurement_template = (char*)&measurement_template_AMD;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (no_mem) {
|
if (n_used_counters >= 4) {
|
||||||
if (n_programmable_counters >= 4) {
|
n_used_counters = 4;
|
||||||
measurement_template = (char*)&measurement_template_Intel_noMem_4;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_template_Intel_noMem_4;
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_template_Intel_noMem_2;
|
measurement_template = (char*)&measurement_template_Intel_4;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (n_programmable_counters >= 4) {
|
n_used_counters = 2;
|
||||||
measurement_template = (char*)&measurement_template_Intel_4;
|
if (no_mem) {
|
||||||
|
measurement_template = (char*)&measurement_template_Intel_noMem_2;
|
||||||
} else {
|
} else {
|
||||||
measurement_template = (char*)&measurement_template_Intel_2;
|
measurement_template = (char*)&measurement_template_Intel_2;
|
||||||
}
|
}
|
||||||
@@ -322,19 +329,19 @@ int main(int argc, char **argv) {
|
|||||||
size_t next_pfc_config = 0;
|
size_t next_pfc_config = 0;
|
||||||
while (next_pfc_config < n_pfc_configs) {
|
while (next_pfc_config < n_pfc_configs) {
|
||||||
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
|
||||||
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, usr, os, pfc_descriptions);
|
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, usr, os, pfc_descriptions);
|
||||||
|
|
||||||
run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count);
|
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
|
||||||
run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count);
|
run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);
|
||||||
|
|
||||||
if (verbose) {
|
if (verbose) {
|
||||||
printf("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
printf("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||||
print_all_measurement_results(measurement_results_base, n_programmable_counters);
|
print_all_measurement_results(measurement_results_base, n_used_counters);
|
||||||
printf("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
printf("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||||
print_all_measurement_results(measurement_results, n_programmable_counters);
|
print_all_measurement_results(measurement_results, n_used_counters);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t c=0; c < n_programmable_counters; c++) {
|
for (size_t c=0; c < n_used_counters; c++) {
|
||||||
if (pfc_descriptions[c]) printf("%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
|
if (pfc_descriptions[c]) printf("%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user