mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2026-01-04 19:40:08 +01:00
added debug mode
This commit is contained in:
@@ -138,7 +138,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
|
||||
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
|
||||
| `-warm_up_count <n>` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded. This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` |
|
||||
| `-initial_warm_up_count <n>` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` |
|
||||
| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` |
|
||||
| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` |
|
||||
| `-median` | Selects the median as the aggregate function. |
|
||||
| `-min` | Selects the minimum as the aggregate function. |
|
||||
| `-basic_mode` | The effect of this option is described in the [Generated Code](#generated-code) section. |
|
||||
@@ -152,6 +152,7 @@ The following parameters are only supported by `nanoBench.sh`.
|
||||
| `-cpu <n>` | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` |
|
||||
| `-usr <n>` | If n=1, performance events are counted when the processor is operating at a privilege level greater than 0. `[Default: n=1]` |
|
||||
| `-os <n>` | If n=1, performance events are counted when the processor is operating at privilege level 0. `[Default: n=0]` |
|
||||
| `-debug` | Enables the debug mode (see [below](#debug-mode)). |
|
||||
|
||||
|
||||
## Performance Counter Config Files
|
||||
@@ -164,6 +165,10 @@ The format of the entries in the configuration files is
|
||||
|
||||
You can find details on the meanings of the different parts of the entries in chapters 18 and 19 of [Intel's System Programming Guide](https://software.intel.com/sites/default/files/managed/a4/60/325384-sdm-vol-3abcd.pdf).
|
||||
|
||||
## Debug Mode
|
||||
|
||||
If debug mode is enabled, the [generated code](#generated-code) contains a breakpoint right before the line `m2 = read_perf_ctrs`, and *nanoBench* is run under *gdb*. This makes it possible to analyze the effect of the code to be benchmarked on registers and on memory. For example, the command `info all-registers` can be used to display the current values of all registers.
|
||||
|
||||
## Supported Platforms
|
||||
|
||||
*nanoBench* should work with all Intel processors supporting architectural performance monitoring version ≥ 3, as well as with AMD Family 17h processors.
|
||||
|
||||
@@ -21,6 +21,7 @@ int no_mem = NO_MEM_DEFAULT;
|
||||
int basic_mode = BASIC_MODE_DEFAULT;
|
||||
int aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
|
||||
int verbose = VERBOSE_DEFAULT;
|
||||
int debug = DEBUG_DEFAULT;
|
||||
|
||||
char* code = NULL;
|
||||
size_t code_length = 0;
|
||||
@@ -361,10 +362,14 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
rci += code_length;
|
||||
}
|
||||
|
||||
runtime_code[rci++] = '\x49'; runtime_code[rci++] = '\xFF'; runtime_code[rci++] = '\xCF'; //dec R15
|
||||
runtime_code[rci++] = '\x49'; runtime_code[rci++] = '\xFF'; runtime_code[rci++] = '\xCF'; // dec R15
|
||||
runtime_code[rci++] = '\x0F'; runtime_code[rci++] = '\x85';
|
||||
*(int32_t*)(&runtime_code[rci]) = (int32_t)(rci_loop_start-rci-4); rci += 4; // jnz loop_start
|
||||
}
|
||||
|
||||
if (debug) {
|
||||
runtime_code[rci++] = '\xCC'; // INT3
|
||||
}
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) {
|
||||
*(void**)(&runtime_code[rci]) = pfc_mem;
|
||||
templateI += 8;
|
||||
@@ -385,7 +390,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
templateI += 8;
|
||||
do {
|
||||
runtime_code[rci++] = measurement_template[templateI++];
|
||||
} while (measurement_template[templateI-1] != '\xc3'); // 0xc3 = ret
|
||||
} while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret
|
||||
}
|
||||
|
||||
void run_warmup_experiment(char* measurement_template) {
|
||||
|
||||
@@ -110,6 +110,10 @@ extern int aggregate_function;
|
||||
extern int verbose;
|
||||
#define VERBOSE_DEFAULT 0;
|
||||
|
||||
// Whether to generate a breakpoint trap after executing the code to be benchmarked.
|
||||
extern int debug;
|
||||
#define DEBUG_DEFAULT 0;
|
||||
|
||||
extern char* code;
|
||||
extern size_t code_length;
|
||||
|
||||
@@ -262,6 +266,7 @@ void measurement_RDTSC_template_noMem(void);
|
||||
"mov r11, 0\n" \
|
||||
"mov r12, 0\n" \
|
||||
"mov r13, 0\n" \
|
||||
"mov r15, 0\n" \
|
||||
"mov r14, rsp\n" \
|
||||
"add r14, 0x1000\n" \
|
||||
"mov rdi, rsp\n" \
|
||||
|
||||
15
nanoBench.sh
15
nanoBench.sh
@@ -12,6 +12,13 @@ if ! command -v rdmsr &>/dev/null; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
debug=false
|
||||
for p in "$@"; do
|
||||
if [[ "$p" == -d* ]]; then
|
||||
debug=true
|
||||
fi
|
||||
done
|
||||
|
||||
args=''
|
||||
while [ "$2" ]; do
|
||||
if [ "$1" == '-asm' ]; then
|
||||
@@ -27,7 +34,7 @@ while [ "$2" ]; do
|
||||
as asm-init.s -o asm-init.o || exit
|
||||
objcopy asm-init.o -O binary asm-init.bin
|
||||
args="$args -code_init asm-init.bin"
|
||||
shift 2
|
||||
shift 2
|
||||
else
|
||||
args="$args $1"
|
||||
shift
|
||||
@@ -52,7 +59,11 @@ iTCO_vendor_support_prev_loaded=$?
|
||||
prev_nmi_watchdog=$(cat /proc/sys/kernel/nmi_watchdog)
|
||||
echo 0 > /proc/sys/kernel/nmi_watchdog
|
||||
|
||||
user/nanoBench $@
|
||||
if [ "$debug" = true ]; then
|
||||
gdb -ex=run --args user/nanoBench $@
|
||||
else
|
||||
user/nanoBench $@
|
||||
fi
|
||||
|
||||
rm -f asm-code.*
|
||||
rm -f asm-init.*
|
||||
|
||||
@@ -41,6 +41,7 @@ void print_usage() {
|
||||
printf(" -cpu <n>: Pins the measurement thread to CPU n. \n");
|
||||
printf(" -usr <n>: If 1, counts events at a privilege level greater than 0.\n");
|
||||
printf(" -os <n>: If 1, counts events at a privilege level 0.\n");
|
||||
printf(" -debug: Generate a breakpoint trap after running the code to be benchmarked.\n");
|
||||
}
|
||||
|
||||
size_t mmap_file(char* filename, char** content) {
|
||||
@@ -77,10 +78,11 @@ int main(int argc, char **argv) {
|
||||
{"min", no_argument, &aggregate_function, MIN},
|
||||
{"basic_mode", no_argument, &basic_mode, 1},
|
||||
{"no_mem", no_argument, &no_mem, 1},
|
||||
{"verbose", no_argument, &verbose, 1},
|
||||
{"verbose", no_argument, &verbose, 1},
|
||||
{"cpu", required_argument, 0, 'p'},
|
||||
{"usr", required_argument, 0, 'r'},
|
||||
{"os", required_argument, 0, 's'},
|
||||
{"debug", no_argument, &debug, 1},
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user