From 04bfd842a4a787d52d7104585f8ee15e46b95a87 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Fri, 29 Oct 2021 17:32:59 +0200 Subject: [PATCH] make fixed counters optional --- README.md | 26 +++--- common/nanoBench.c | 111 ++++++++++------------ common/nanoBench.h | 45 +++++---- configs/cfg_Broadwell_common.txt | 10 +- configs/cfg_Core_all.txt | 2 +- configs/cfg_Core_common.txt | 4 +- configs/cfg_Haswell_common.txt | 6 +- configs/cfg_IceLake_common.txt | 6 +- configs/cfg_IvyBridge_common.txt | 10 +- configs/cfg_KnightsLanding_common.txt | 4 +- configs/cfg_Nehalem_common.txt | 4 +- configs/cfg_SandyBridge_common.txt | 10 +- configs/cfg_Skylake_common.txt | 8 +- configs/cfg_Westmere_common.txt | 6 +- configs/cfg_XeonScalable_common.txt | 6 +- kernel-nanoBench.sh | 9 ++ kernel/nb_km.c | 129 +++++++++++++++----------- kernelNanoBench.py | 15 ++- tools/CacheAnalyzer/cacheLib.py | 4 +- tools/CacheAnalyzer/strideGraph.py | 3 +- tools/cpuBench/cpuBench.py | 2 +- user/nanoBench_main.c | 115 ++++++++++++----------- 22 files changed, 294 insertions(+), 241 deletions(-) diff --git a/README.md b/README.md index fd6f07b..01b0ca7 100644 --- a/README.md +++ b/README.md @@ -42,12 +42,11 @@ The following command will benchmark the assembler code sequence "ADD RAX, RBX; sudo ./nanoBench.sh -asm "ADD RAX, RBX; add RBX, RAX" -config configs/cfg_Skylake_common.txt It will produce an output similar to the following. 
- - Instructions retired: 2.00 - Core cycles: 2.00 - Reference cycles: 1.85 - UOPS_ISSUED.ANY: 2.00 - UOPS_EXECUTED.THREAD: 2.00 + + CORE_CYCLES: 2.00 + INST_RETIRED: 2.00 + UOPS_ISSUED: 2.00 + UOPS_EXECUTED: 2.00 UOPS_DISPATCHED_PORT.PORT_0: 0.49 UOPS_DISPATCHED_PORT.PORT_1: 0.50 UOPS_DISPATCHED_PORT.PORT_2: 0.00 @@ -96,7 +95,7 @@ We will now take a look behind the scenes at the code that *nanoBench* generates int run(code, code_init, local_unroll_count): int measurements[n_measurements] - + for i=-warm_up_count to n_measurements save_regs code_init @@ -111,17 +110,17 @@ We will now take a look behind the scenes at the code that *nanoBench* generates restore_regs if i >= 0: // ignore warm-up runs measurements[i] = m2 - m1 - + return agg(measurements) // apply selected aggregate function -`run(...)` is executed twice: The first time with `local_unroll_count = unroll_count`, and the second time with `local_unroll_count = 2 * unroll_count`. If the `-basic_mode` options is used, the first execution is with no instructions between `m1 = read_perf_ctrs` and `m2 = read_perf_ctrs`, and the second with `local_unroll_count = unroll_count`. +`run(...)` is executed twice: The first time with `local_unroll_count = unroll_count`, and the second time with `local_unroll_count = 2 * unroll_count`. If the `-basic_mode` option is used, the first execution is with no instructions between `m1 = read_perf_ctrs` and `m2 = read_perf_ctrs`, and the second with `local_unroll_count = unroll_count`. -The result that is finally reported by *nanoBench* is the difference between these two executions divided by `max(loop_count * unroll_count, unroll_count)`. +The result that is finally reported by *nanoBench* is the difference between these two executions divided by `max(loop_count * unroll_count, unroll_count)`. Before the first execution of `run(...)`, the performance counters are configured according to the event specifications in the `-config` file. 
If this file contains more events than there are programmable performance counters available, `run(...)` is executed multiple times with different performance counter configurations. - + ## Command-line Options @@ -137,7 +136,8 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line | `-code_init ` | A binary file containing code to be executed once in the beginning of every benchmark run. *This option cannot be used together with `-asm_init`.* | | `-code_late_init ` | A binary file containing code to be executed once immediately before the code to be benchmarked. *This option cannot be used together with `-asm_late_init`.* | | `-code_one_time_init ` | A binary file containing code to be executed once before the first benchmark run. *This option cannot be used together with `-asm_one_time_init`.*| -| `-config ` | File with performance counter event specifications. Details are described [below](#performance-counter-config-files). | +| `-config ` | File with performance counter event specifications. Details are described [below](#performance-counter-config-files). | +| `-fixed_counters` | Reads the fixed-function performance counters. | | `-n_measurements ` | Number of times the measurements are repeated. `[Default: n=10]` | | `-unroll_count ` | Number of copies of the benchmark code inside the inner loop. `[Default: n=1000]` | | `-loop_count ` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` | @@ -196,7 +196,7 @@ can be used to count the number of last-level cache lookups in C-Box 0 on a Skyl ## Pausing Performance Counting -If the `-no_mem` option is used, nanoBench provides a feature to temporarily pause performance counting. 
This is enabled by including the *magic* byte sequences `0xF0b513b1C2813F04` (for stopping the counters), and `0xE0b513b1C2813F04` (for restarting them) in the code of the microbenchmark. +If the `-no_mem` option is used, nanoBench provides a feature to temporarily pause performance counting. This is enabled by including the *magic* byte sequences `0xF0B513B1C2813F04` (for stopping the counters), and `0xE0B513B1C2813F04` (for restarting them) in the code of the microbenchmark. Using this feature incurs a certain timing overhead that will be included in the measurement results. It is therefore, in particular, useful for microbenchmarks that do not measure the time, but e.g., cache hits or misses, such as the microbenchmarks generated by the tools in [tools/CacheAnalyzer](tools/CacheAnalyzer). diff --git a/common/nanoBench.c b/common/nanoBench.c index 1856b1b..c7d2585 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -21,6 +21,7 @@ int drain_frontend = DRAIN_FRONTEND_DEFAULT; int no_mem = NO_MEM_DEFAULT; int no_normalization = NO_NORMALIZATION_DEFAULT; int basic_mode = BASIC_MODE_DEFAULT; +int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT; int aggregate_function = AGGREGATE_FUNCTION_DEFAULT; int verbose = VERBOSE_DEFAULT; int debug = DEBUG_DEFAULT; @@ -113,19 +114,13 @@ int check_cpuid() { return 1; } - unsigned int n_available_counters = ((eax >> 8) & 0xFF); - print_user_verbose("Number of general-purpose performance counters: %u\n", n_available_counters); - if (n_available_counters >= 4) { - n_programmable_counters = 4; - } else if (n_available_counters >= 2) { - n_programmable_counters = 2; - } else { - print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_available_counters); + n_programmable_counters = ((eax >> 8) & 0xFF); + print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters); + if (n_programmable_counters < 2) { + print_error("Error: only %u programmable 
counters available; nanoBench requires at least 2\n", n_programmable_counters); return 1; } - print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF)); - } else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) { is_AMD_CPU = 1; n_programmable_counters = 6; @@ -299,34 +294,32 @@ void write_msr(unsigned int msr, uint64_t value) { #endif } -void configure_perf_ctrs_FF(unsigned int usr, unsigned int os) { - if (is_Intel_CPU) { - uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL); - global_ctrl |= ((uint64_t)7 << 32) | 15; - write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl); +void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os) { + uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL); + global_ctrl |= ((uint64_t)7 << 32) | 15; + write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl); - uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL); - // disable fixed counters - fixed_ctrl &= ~((1 << 12) - 1); - write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl); - // clear - for (int i=0; i<3; i++) { - write_msr(MSR_IA32_FIXED_CTR0+i, 0); - } - //enable fixed counters - fixed_ctrl |= (os << 8) | (os << 4) | os; - fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1); - write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl); + uint64_t fixed_ctrl = read_msr(MSR_IA32_FIXED_CTR_CTRL); + // disable fixed counters + fixed_ctrl &= ~((1 << 12) - 1); + write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl); + // clear + for (int i=0; i<3; i++) { + write_msr(MSR_IA32_FIXED_CTR0+i, 0); } + //enable fixed counters + fixed_ctrl |= (os << 8) | (os << 4) | os; + fixed_ctrl |= (usr << 9) | (usr << 5) | (usr << 1); + write_msr(MSR_IA32_FIXED_CTR_CTRL, fixed_ctrl); } -size_t configure_perf_ctrs_programmable(size_t next_pfc_config, unsigned int usr, unsigned int os, char* descriptions[]) { +size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]) { if (is_Intel_CPU) 
{ uint64_t global_ctrl = read_msr(MSR_IA32_PERF_GLOBAL_CTRL); global_ctrl |= ((uint64_t)7 << 32) | 15; write_msr(MSR_IA32_PERF_GLOBAL_CTRL, global_ctrl); - for (int i=0; i /sys/nb/config shift 2 + elif [[ "$1" == -f* ]]; then + echo "1" > /sys/nb/fixed_counters + shift elif [[ "$1" == -msr* ]]; then echo -n "$2" > /sys/nb/msr_config shift 2 @@ -112,6 +115,7 @@ while [ "$1" ]; do echo " -code_init : Binary file containing code to be executed once in the beginning." echo " -code_late_init : Binary file containing code to be executed once immediately before the code to be benchmarked." echo " -config : File with performance counter event specifications." + echo " -fixed_counters: Reads the fixed-function performance counters.\n" echo " -n_measurements : Number of times the measurements are repeated." echo " -unroll_count : Number of copies of the benchmark code inside the inner loop." echo " -loop_count : Number of iterations of the inner loop." @@ -135,4 +139,9 @@ while [ "$1" ]; do fi done +prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog) +echo 0 > /proc/sys/kernel/nmi_watchdog + $taskset cat /proc/nanoBench + +echo $prev_nmi_watchdog > /proc/sys/kernel/nmi_watchdog diff --git a/kernel/nb_km.c b/kernel/nb_km.c index ef12259..f1e500c 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -219,6 +219,15 @@ static ssize_t msr_config_store(struct kobject *kobj, struct kobj_attribute *att } static struct kobj_attribute msr_config_attribute =__ATTR(msr_config, 0660, msr_config_show, msr_config_store); +static ssize_t fixed_counters_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + return sprintf(buf, "%u\n", use_fixed_counters); +} +static ssize_t fixed_counters_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + sscanf(buf, "%u", &use_fixed_counters); + return count; +} +static struct kobj_attribute fixed_counters_attribute =__ATTR(fixed_counters, 0660, fixed_counters_show, fixed_counters_store); + static 
ssize_t unroll_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%ld\n", unroll_count); } @@ -455,6 +464,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha no_mem = NO_MEM_DEFAULT; no_normalization = NO_NORMALIZATION_DEFAULT; basic_mode = BASIC_MODE_DEFAULT; + use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT; aggregate_function = AGGREGATE_FUNCTION_DEFAULT; verbose = VERBOSE_DEFAULT; alignment_offset = ALIGNMENT_OFFSET_DEFAULT; @@ -500,61 +510,65 @@ static int show(struct seq_file *m, void *v) { char buf[100]; char* measurement_template; + create_and_run_one_time_init_code(); + run_initial_warmup_experiment(); + /********************************* * Fixed-function counters. - ********************************/ - if (is_AMD_CPU) { - if (no_mem) { - measurement_template = (char*)&measurement_FF_template_AMD_noMem; + ********************************/ + if (use_fixed_counters) { + if (is_AMD_CPU) { + if (no_mem) { + measurement_template = (char*)&measurement_FF_template_AMD_noMem; + } else { + measurement_template = (char*)&measurement_FF_template_AMD; + } } else { - measurement_template = (char*)&measurement_FF_template_AMD; + if (no_mem) { + measurement_template = (char*)&measurement_FF_template_Intel_noMem; + } else { + measurement_template = (char*)&measurement_FF_template_Intel; + } } - } else { - if (no_mem) { - measurement_template = (char*)&measurement_FF_template_Intel_noMem; + + if (is_AMD_CPU) { + run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count); + + if (verbose) { + pr_debug("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); + print_all_measurement_results(measurement_results_base, 3); + pr_debug("RDTSC, MPERF, and and APERF results (unroll_count=%ld, 
loop_count=%ld):\n\n", main_unroll_count, main_loop_count); + print_all_measurement_results(measurement_results, 3); + } + + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1)); + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2)); } else { - measurement_template = (char*)&measurement_FF_template_Intel; + configure_perf_ctrs_FF_Intel(0, 1); + + run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count); + + if (verbose) { + pr_debug("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); + print_all_measurement_results(measurement_results_base, 4); + pr_debug("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); + print_all_measurement_results(measurement_results, 4); + } + + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1)); + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2)); + seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3)); } } - configure_perf_ctrs_FF(0, 1); - create_and_run_one_time_init_code(); - run_warmup_experiment(measurement_template); - - if (is_AMD_CPU) { - run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count); - - if (verbose) { - pr_debug("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); - print_all_measurement_results(measurement_results_base, 3); - pr_debug("RDTSC, MPERF, and and APERF results 
(unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - print_all_measurement_results(measurement_results, 3); - } - - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1)); - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2)); - } else { - run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count); - - if (verbose) { - pr_debug("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); - print_all_measurement_results(measurement_results_base, 4); - pr_debug("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - print_all_measurement_results(measurement_results, 4); - } - - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1)); - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2)); - seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3)); - } - /********************************* * Programmable counters. 
********************************/ + int n_used_counters = n_programmable_counters; if (is_AMD_CPU) { if (no_mem) { measurement_template = (char*)&measurement_template_AMD_noMem; @@ -562,15 +576,17 @@ static int show(struct seq_file *m, void *v) { measurement_template = (char*)&measurement_template_AMD; } } else { - if (no_mem) { - if (n_programmable_counters >= 4) { - measurement_template = (char*)&measurement_template_Intel_noMem_4; + if (n_used_counters >= 4) { + n_used_counters = 4; + if (no_mem) { + measurement_template = (char*)&measurement_template_Intel_noMem_4; } else { - measurement_template = (char*)&measurement_template_Intel_noMem_2; + measurement_template = (char*)&measurement_template_Intel_4; } } else { - if (n_programmable_counters >= 4) { - measurement_template = (char*)&measurement_template_Intel_4; + n_used_counters = 2; + if (no_mem) { + measurement_template = (char*)&measurement_template_Intel_noMem_2; } else { measurement_template = (char*)&measurement_template_Intel_2; } @@ -580,20 +596,20 @@ static int show(struct seq_file *m, void *v) { size_t next_pfc_config = 0; while (next_pfc_config < n_pfc_configs) { char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0}; - next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, 1, 1, pfc_descriptions); + next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, 1, 1, pfc_descriptions); // on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set - run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count); + run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, n_used_counters, 
main_unroll_count, main_loop_count); if (verbose) { pr_debug("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); - print_all_measurement_results(measurement_results_base, n_programmable_counters); + print_all_measurement_results(measurement_results_base, n_used_counters); pr_debug("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - print_all_measurement_results(measurement_results, n_programmable_counters); + print_all_measurement_results(measurement_results, n_used_counters); } - for (size_t c=0; c < n_programmable_counters; c++) { + for (size_t c=0; c < n_used_counters; c++) { if (pfc_descriptions[c]) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c)); } } @@ -715,6 +731,7 @@ static int __init nb_init(void) { error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr); error |= sysfs_create_file(nb_kobject, &config_attribute.attr); error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr); + error |= sysfs_create_file(nb_kobject, &fixed_counters_attribute.attr); error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr); error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr); error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr); diff --git a/kernelNanoBench.py b/kernelNanoBench.py index 39a5424..1ff70cd 100644 --- a/kernelNanoBench.py +++ b/kernelNanoBench.py @@ -3,8 +3,8 @@ import collections import subprocess import sys -PFC_START_ASM = '.quad 0xE0b513b1C2813F04' -PFC_STOP_ASM = '.quad 0xF0b513b1C2813F04' +PFC_START_ASM = '.quad 0xE0B513B1C2813F04' +PFC_STOP_ASM = '.quad 0xF0B513B1C2813F04' def writeFile(fileName, content): with open(fileName, 'w') as f: @@ -53,9 +53,9 @@ paramDict = dict() # Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters(). 
# Otherwise, reset() needs to be called first. -def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None, - warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, aggregateFunction=None, - basicMode=None, noMem=None, noNormalization=None, verbose=None): +def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, fixedCounters=None, nMeasurements=None, unrollCount=None, + loopCount=None, warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, + aggregateFunction=None, basicMode=None, noMem=None, noNormalization=None, verbose=None): if not ramdiskCreated: createRamdisk() if config is not None: @@ -74,6 +74,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf if msrConfigFile is not None: writeFile('/sys/nb/msr_config', msrConfigFile) + if fixedCounters is not None: + if paramDict.get('fixedCounters', None) != fixedCounters: + writeFile('/sys/nb/fixed_counters', str(int(fixedCounters))) + paramDict['fixedCounters'] = fixedCounters + if nMeasurements is not None: if paramDict.get('nMeasurements', None) != nMeasurements: writeFile('/sys/nb/n_measurements', str(nMeasurements)) diff --git a/tools/CacheAnalyzer/cacheLib.py b/tools/CacheAnalyzer/cacheLib.py index 9ac573d..56d7185 100755 --- a/tools/CacheAnalyzer/cacheLib.py +++ b/tools/CacheAnalyzer/cacheLib.py @@ -479,8 +479,8 @@ def getCodeForCacheExperiment(level, seq, initSeq, cacheSetList, cBox, cSlice, c def runCacheExperimentCode(code, initCode, oneTimeInitCode, loop, warmUpCount, codeOffset, nMeasurements, agg): resetNanoBench() - setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig=getDefaultCacheMSRConfig(), nMeasurements=nMeasurements, unrollCount=1, loopCount=loop, - warmUpCount=warmUpCount, aggregateFunction=agg, basicMode=True, noMem=True, 
codeOffset=codeOffset, verbose=None) + setNanoBenchParameters(config=getDefaultCacheConfig(), msrConfig=getDefaultCacheMSRConfig(), fixedCounters=True, nMeasurements=nMeasurements, unrollCount=1, + loopCount=loop, warmUpCount=warmUpCount, aggregateFunction=agg, basicMode=True, noMem=True, codeOffset=codeOffset, verbose=None) return runNanoBench(code=code, init=initCode, oneTimeInit=oneTimeInitCode) diff --git a/tools/CacheAnalyzer/strideGraph.py b/tools/CacheAnalyzer/strideGraph.py index d6cc569..ae80f86 100755 --- a/tools/CacheAnalyzer/strideGraph.py +++ b/tools/CacheAnalyzer/strideGraph.py @@ -17,7 +17,8 @@ def main(): args = parser.parse_args() resetNanoBench() - setNanoBenchParameters(config=getDefaultCacheConfig(), nMeasurements=1, warmUpCount=0, unrollCount=1, loopCount=args.loop, basicMode=False, noMem=True) + setNanoBenchParameters(config=getDefaultCacheConfig(), fixedCounters=True, nMeasurements=1, warmUpCount=0, unrollCount=1, loopCount=args.loop, + basicMode=False, noMem=True) nbDicts = [] xValues = [] diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index cf7200c..8553e29 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -350,7 +350,7 @@ def configurePFCs(events): cfg = getEventConfig(event) if cfg is not None: content += cfg + ' ' + event + '\n' - setNanoBenchParameters(config=content) + setNanoBenchParameters(config=content, fixedCounters=True) InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit']) diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index cafaea6..ddac6ad 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -29,6 +29,7 @@ void print_usage() { printf(" -code_late_init : Binary file containing code to be executed once immediately before the code to be benchmarked.\n"); printf(" -code_one_time_init : Binary file containing code to be executed once before the first measurement\n"); printf(" -config : File with 
performance counter event specifications.\n"); + printf(" -fixed_counters: Reads the fixed-function performance counters.\n"); printf(" -n_measurements : Number of times the measurements are repeated.\n"); printf(" -unroll_count : Number of copies of the benchmark code inside the inner loop.\n"); printf(" -loop_count : Number of iterations of the inner loop.\n"); @@ -75,6 +76,7 @@ int main(int argc, char **argv) { {"code_late_init", required_argument, 0, 't'}, {"code_one_time_init", required_argument, 0, 'o'}, {"config", required_argument, 0, 'f'}, + {"fixed_counters", no_argument, &use_fixed_counters, 1}, {"n_measurements", required_argument, 0, 'n'}, {"unroll_count", required_argument, 0, 'u'}, {"loop_count", required_argument, 0, 'l'}, @@ -235,9 +237,6 @@ int main(int argc, char **argv) { } } - /************************************* - * Fixed-function counters - ************************************/ long base_unroll_count = (basic_mode?0:unroll_count); long main_unroll_count = (basic_mode?unroll_count:2*unroll_count); long base_loop_count = (basic_mode?0:loop_count); @@ -246,57 +245,63 @@ int main(int argc, char **argv) { char buf[100]; char* measurement_template; - if (is_AMD_CPU) { - if (no_mem) { - measurement_template = (char*)&measurement_RDTSC_template_noMem; - } else { - measurement_template = (char*)&measurement_RDTSC_template; - } - } else { - if (no_mem) { - measurement_template = (char*)&measurement_FF_template_Intel_noMem; - } else { - measurement_template = (char*)&measurement_FF_template_Intel; - } - } - create_and_run_one_time_init_code(); - run_warmup_experiment(measurement_template); + run_initial_warmup_experiment(); - if (is_AMD_CPU) { - run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count); - - if (verbose) { - printf("\nRDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, 
base_loop_count); - print_all_measurement_results(measurement_results_base, 1); - printf("RDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - print_all_measurement_results(measurement_results, 1); + /************************************* + * Fixed-function counters + ************************************/ + if (use_fixed_counters) { + if (is_AMD_CPU) { + if (no_mem) { + measurement_template = (char*)&measurement_RDTSC_template_noMem; + } else { + measurement_template = (char*)&measurement_RDTSC_template; + } + } else { + if (no_mem) { + measurement_template = (char*)&measurement_FF_template_Intel_noMem; + } else { + measurement_template = (char*)&measurement_FF_template_Intel; + } } - printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); - } else { - configure_perf_ctrs_FF(usr, os); + if (is_AMD_CPU) { + run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count); - run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count); + if (verbose) { + printf("\nRDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); + print_all_measurement_results(measurement_results_base, 1); + printf("RDTSC results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); + print_all_measurement_results(measurement_results, 1); + } - if (verbose) { - printf("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); - print_all_measurement_results(measurement_results_base, 4); - printf("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - 
print_all_measurement_results(measurement_results, 4); + printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); + } else { + configure_perf_ctrs_FF_Intel(usr, os); + + run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count); + + if (verbose) { + printf("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); + print_all_measurement_results(measurement_results_base, 4); + printf("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); + print_all_measurement_results(measurement_results, 4); + } + + printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); + printf("%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1)); + printf("%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2)); + printf("%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3)); } - - printf("%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0)); - printf("%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1)); - printf("%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2)); - printf("%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3)); } /************************************* * Programmable counters ************************************/ + int n_used_counters = n_programmable_counters; if (is_AMD_CPU) { if (no_mem) { measurement_template = (char*)&measurement_template_AMD_noMem; @@ -304,15 +309,17 @@ int main(int argc, char **argv) { measurement_template = (char*)&measurement_template_AMD; } } else { - if (no_mem) { - if (n_programmable_counters >= 4) { - measurement_template = (char*)&measurement_template_Intel_noMem_4; + if (n_used_counters >= 4) { + n_used_counters = 4; + if (no_mem) { + measurement_template = 
(char*)&measurement_template_Intel_noMem_4; } else { - measurement_template = (char*)&measurement_template_Intel_noMem_2; + measurement_template = (char*)&measurement_template_Intel_4; } } else { - if (n_programmable_counters >= 4) { - measurement_template = (char*)&measurement_template_Intel_4; + n_used_counters = 2; + if (no_mem) { + measurement_template = (char*)&measurement_template_Intel_noMem_2; } else { measurement_template = (char*)&measurement_template_Intel_2; } @@ -322,19 +329,19 @@ int main(int argc, char **argv) { size_t next_pfc_config = 0; while (next_pfc_config < n_pfc_configs) { char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0}; - next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, usr, os, pfc_descriptions); + next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, n_used_counters, usr, os, pfc_descriptions); - run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count); - run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count); + run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count); + run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count); if (verbose) { printf("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count); - print_all_measurement_results(measurement_results_base, n_programmable_counters); + print_all_measurement_results(measurement_results_base, n_used_counters); printf("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count); - print_all_measurement_results(measurement_results, n_programmable_counters); + print_all_measurement_results(measurement_results, n_used_counters); } - for (size_t c=0; c < n_programmable_counters; c++) { + for (size_t c=0; c < 
n_used_counters; c++) { if (pfc_descriptions[c]) printf("%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c)); } }