diff --git a/README.md b/README.md index a36a371..407cead 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line | `-loop_count ` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` | | `-warm_up_count ` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded . This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` | | `-initial_warm_up_count ` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` | -| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` | +| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` | | `-median` | Selects the median as the aggregate function. | | `-min` | Selects the minimum as the aggregate function. | | `-basic_mode` | The effect of this option is described in the [Generated Code](#generated-code) section. | @@ -152,6 +152,7 @@ The following parameters are only supported by `nanoBench.sh`. | `-cpu ` | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` | | `-usr ` | If n=1, performance events are counted when the processor is operating at a privilege level greater than 0. 
`[Default: n=1]` | | `-os ` | If n=1, performance events are counted when the processor is operating at privilege level 0. `[Default: n=0]` | +| `-debug` | Enables debug mode (see [Debug Mode](#debug-mode)). | ## Performance Counter Config Files @@ -164,6 +165,10 @@ The format of the entries in the configuration files is You can find details on the meanings of the different parts of the entries in chapters 18 and 19 of [Intel's System Programming Guide](https://software.intel.com/sites/default/files/managed/a4/60/325384-sdm-vol-3abcd.pdf). +## Debug Mode + +If debug mode is enabled, the [generated code](#generated-code) contains a breakpoint right before the line `m2 = read_perf_ctrs`, and *nanoBench* is run using *gdb*. This makes it possible to analyze the effect of the code to be benchmarked on registers and memory. The command `info all-registers` can, for example, be used to display the current values of all registers. + ## Supported Platforms *nanoBench* should work with all Intel processors supporting architectural performance monitoring version ≥ 3, as well as with AMD Family 17h processors. 
diff --git a/common/nanoBench.c b/common/nanoBench.c index 7b6e089..5b3101f 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -21,6 +21,7 @@ int no_mem = NO_MEM_DEFAULT; int basic_mode = BASIC_MODE_DEFAULT; int aggregate_function = AGGREGATE_FUNCTION_DEFAULT; int verbose = VERBOSE_DEFAULT; +int debug = DEBUG_DEFAULT; char* code = NULL; size_t code_length = 0; @@ -361,10 +362,14 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo rci += code_length; } - runtime_code[rci++] = '\x49'; runtime_code[rci++] = '\xFF'; runtime_code[rci++] = '\xCF'; //dec R15 + runtime_code[rci++] = '\x49'; runtime_code[rci++] = '\xFF'; runtime_code[rci++] = '\xCF'; // dec R15 runtime_code[rci++] = '\x0F'; runtime_code[rci++] = '\x85'; *(int32_t*)(&runtime_code[rci]) = (int32_t)(rci_loop_start-rci-4); rci += 4; // jnz loop_start } + + if (debug) { + runtime_code[rci++] = '\xCC'; // INT3 + } } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) { *(void**)(&runtime_code[rci]) = pfc_mem; templateI += 8; @@ -385,7 +390,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo templateI += 8; do { runtime_code[rci++] = measurement_template[templateI++]; - } while (measurement_template[templateI-1] != '\xc3'); // 0xc3 = ret + } while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret } void run_warmup_experiment(char* measurement_template) { diff --git a/common/nanoBench.h b/common/nanoBench.h index b66ff90..45b103c 100644 --- a/common/nanoBench.h +++ b/common/nanoBench.h @@ -110,6 +110,10 @@ extern int aggregate_function; extern int verbose; #define VERBOSE_DEFAULT 0; +// Whether to generate a breakpoint trap after executing the code to be benchmarked. 
+extern int debug; +#define DEBUG_DEFAULT 0; + extern char* code; extern size_t code_length; @@ -262,6 +266,7 @@ void measurement_RDTSC_template_noMem(void); "mov r11, 0\n" \ "mov r12, 0\n" \ "mov r13, 0\n" \ + "mov r15, 0\n" \ "mov r14, rsp\n" \ "add r14, 0x1000\n" \ "mov rdi, rsp\n" \ diff --git a/nanoBench.sh b/nanoBench.sh index d463fe4..3923f4a 100755 --- a/nanoBench.sh +++ b/nanoBench.sh @@ -12,6 +12,13 @@ if ! command -v rdmsr &>/dev/null; then exit 1 fi +debug=false +for p in "$@"; do + if [[ "$p" == -d* ]]; then + debug=true + fi +done + args='' while [ "$2" ]; do if [ "$1" == '-asm' ]; then @@ -27,7 +34,7 @@ while [ "$2" ]; do as asm-init.s -o asm-init.o || exit objcopy asm-init.o -O binary asm-init.bin args="$args -code_init asm-init.bin" - shift 2 + shift 2 else args="$args $1" shift @@ -52,7 +59,11 @@ iTCO_vendor_support_prev_loaded=$? prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog) echo 0 > /proc/sys/kernel/nmi_watchdog -user/nanoBench $@ +if [ "$debug" = true ]; then + gdb -ex=run --args user/nanoBench $@ +else + user/nanoBench $@ +fi rm -f asm-code.* rm -f asm-init.* diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index f4fd13d..89e2da1 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -41,6 +41,7 @@ void print_usage() { printf(" -cpu : Pins the measurement thread to CPU n. 
\n"); printf(" -usr : If 1, counts events at a privilege level greater than 0.\n"); printf(" -os : If 1, counts events at a privilege level 0.\n"); + printf(" -debug: Generate a breakpoint trap after running the code to be benchmarked.\n"); } size_t mmap_file(char* filename, char** content) { @@ -77,10 +78,11 @@ int main(int argc, char **argv) { {"min", no_argument, &aggregate_function, MIN}, {"basic_mode", no_argument, &basic_mode, 1}, {"no_mem", no_argument, &no_mem, 1}, - {"verbose", no_argument, &verbose, 1}, + {"verbose", no_argument, &verbose, 1}, {"cpu", required_argument, 0, 'p'}, {"usr", required_argument, 0, 'r'}, {"os", required_argument, 0, 's'}, + {"debug", no_argument, &debug, 1}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} };