diff --git a/README.md b/README.md index 597ffef..dbe4780 100644 --- a/README.md +++ b/README.md @@ -62,9 +62,9 @@ The tool will *unroll* the assembler code multiple times, i.e., it will create m The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `config` folder. When using the kernel-module, the config file must not be larger than 4 kB. -The assembler code sequence may use and modify any general-purpose or vector registers, including the stack pointer. There is no need to restore the registers to their original values at the end (unless the `-loop` or `-no_mem` options are used). +The assembler code sequence may use and modify any general-purpose or vector registers (unless the `-loop` or `-no_mem` options are used), including the stack pointer. There is no need to restore the registers to their original values at the end. -R14, RDI, RSI, RSP, and RBP are initialized with addresses in a dedicated memory area (of about 2 MB), that can be freely modified by the assembler code. The addresses in R14, RDI, RSI, RSP, and RBP are at least 4 kB apart from each other. +R14, RDI, RSI, RSP, and RBP are initialized with addresses in the middle of dedicated memory areas (of about 1 MB each), that can be freely modified by the assembler code. All other registers have initially undefined values. They can, however, be initialized as shown in the following example. diff --git a/common/nanoBench.c b/common/nanoBench.c index c355bd5..ff5aa47 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -40,7 +40,11 @@ int is_AMD_CPU = 0; int n_programmable_counters; char* runtime_code; -void* runtime_mem; +void* runtime_r14; +void* runtime_rbp; +void* runtime_rdi; +void* runtime_rsi; +void* runtime_rsp; int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS]; void* RSP_mem; @@ -378,8 +382,24 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo *(void**)(&runtime_code[rci]) = &RSP_mem; templateI += 8; rci += 8; - } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_MEM)) { - *(void**)(&runtime_code[rci]) = runtime_mem; + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) { + *(void**)(&runtime_code[rci]) = runtime_r14 + RUNTIME_R_SIZE/2; + templateI += 8; + rci += 8; + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) { + *(void**)(&runtime_code[rci]) = runtime_rbp + RUNTIME_R_SIZE/2; + templateI += 8; + rci += 8; + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) { + *(void**)(&runtime_code[rci]) = runtime_rdi + RUNTIME_R_SIZE/2; + templateI += 8; + rci += 8; + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) { + *(void**)(&runtime_code[rci]) = runtime_rsi + RUNTIME_R_SIZE/2; + templateI += 8; + rci += 8; + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) { + *(void**)(&runtime_code[rci]) = runtime_rsp + RUNTIME_R_SIZE/2; templateI += 8; rci += 8; } else { @@ -509,14 +529,13 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) { void measurement_template_Intel() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" "push rax \n" "lahf \n" "seto al \n" - "push rax \n" + "push rax \n" "push rcx \n" "push rdx \n" "push r15 \n" @@ -547,7 +566,7 @@ void measurement_template_Intel() { "pop rcx; lfence \n" "pop rax; lfence \n" "cmp al, -127; lfence \n" - "sahf; lfence \n" + "sahf; lfence \n" "pop rax; \n" "lfence \n" ".att_syntax noprefix "); @@ -580,7 +599,6 @@ void measurement_template_Intel() { void measurement_template_Intel_noMem() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -638,7 +656,6 @@ void measurement_template_Intel_noMem() { void measurement_template_AMD() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -727,7 +744,6 @@ void measurement_template_AMD() { void measurement_template_AMD_noMem() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -805,7 +821,6 @@ void measurement_template_AMD_noMem() { void measurement_FF_template_Intel() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -873,7 +888,6 @@ void measurement_FF_template_Intel() { void measurement_FF_template_Intel_noMem() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -929,11 +943,10 @@ void measurement_FF_template_Intel_noMem() { void measurement_FF_template_AMD() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" - "push rax \n" + "push rax \n" "lahf \n" "seto al \n" "push rax \n" @@ -989,7 +1002,6 @@ void measurement_FF_template_AMD() { void measurement_FF_template_AMD_noMem() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -1035,7 +1047,6 @@ void measurement_FF_template_AMD_noMem() { void measurement_RDTSC_template() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" @@ -1074,7 +1085,6 @@ void measurement_RDTSC_template() { void measurement_RDTSC_template_noMem() { SAVE_REGS_FLAGS(); - INITIALIZE_REGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( ".intel_syntax noprefix \n" diff --git a/common/nanoBench.h b/common/nanoBench.h index 45b103c..870af2d 100644 --- a/common/nanoBench.h +++ b/common/nanoBench.h @@ -149,8 +149,14 @@ extern int n_programmable_counters; // Pointer to a memory region that is writable and executable. extern char* runtime_code; -// During measurement, RSP, RDI, RSI, and R14 will point to locations in runtime_mem. -extern void* runtime_mem; +#define RUNTIME_R_SIZE (1024*1024) + +// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2. +extern void* runtime_r14; +extern void* runtime_rbp; +extern void* runtime_rdi; +extern void* runtime_rsi; +extern void* runtime_rsp; // Stores performance counter values during measurements. extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS]; @@ -198,9 +204,13 @@ void print_all_measurement_results(int64_t* results[], int n_counters); #define MAGIC_BYTES_INIT 0x10b513b1C2813F04 #define MAGIC_BYTES_CODE 0x20b513b1C2813F04 #define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04 -#define MAGIC_BYTES_RUNTIME_MEM 0x40b513b1C2813F04 -#define MAGIC_BYTES_PFC 0x50b513b1C2813F04 -#define MAGIC_BYTES_TEMPLATE_END 0x60b513b1C2813F04 +#define MAGIC_BYTES_RUNTIME_R14 0x40b513b1C2813F04 +#define MAGIC_BYTES_RUNTIME_RBP 0x50b513b1C2813F04 +#define MAGIC_BYTES_RUNTIME_RDI 0x60b513b1C2813F04 +#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04 +#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04 +#define MAGIC_BYTES_PFC 0x90b513b1C2813F04 +#define MAGIC_BYTES_TEMPLATE_END 0xA0b513b1C2813F04 #define STRINGIFY2(X) #X #define STRINGIFY(X) STRINGIFY2(X) @@ -232,11 +242,22 @@ void measurement_RDTSC_template_noMem(void); "pushfq\n" \ "mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \ "mov [r15], rsp\n" \ - "mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_MEM)"\n" \ - "add rsp, 0xfffff\n" \ - "mov r15, 0xfff\n" /*4 kB alignment*/ \ - "not r15\n" \ - "and rsp, r15\n" \ + "mov rax, 0\n" \ + "mov rbx, 0\n" \ + "mov rcx, 0\n" \ + "mov rdx, 0\n" \ + "mov r8, 0\n" \ + "mov r9, 0\n" \ + "mov r10, 0\n" \ + "mov r11, 0\n" \ + "mov r12, 0\n" \ + "mov r13, 0\n" \ + "mov r15, 0\n" \ + "mov r14, "STRINGIFY(MAGIC_BYTES_RUNTIME_R14)"\n" \ + "mov rbp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RBP)"\n" \ + "mov rdi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RDI)"\n" \ + "mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \ + "mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \ ".att_syntax noprefix"); #define RESTORE_REGS_FLAGS() \ @@ -252,29 +273,5 @@ void measurement_RDTSC_template_noMem(void); "pop rbp\n" \ "pop rbx\n" \ ".att_syntax noprefix"); - -#define INITIALIZE_REGS() \ - asm volatile( \ - ".intel_syntax noprefix\n" \ - "mov rax, 0\n" \ - "mov rbx, 0\n" \ - "mov rcx, 0\n" \ - "mov rdx, 0\n" \ - "mov r8, 0\n" \ - "mov r9, 0\n" \ - "mov r10, 0\n" \ - "mov r11, 0\n" \ - "mov r12, 0\n" \ - "mov r13, 0\n" \ - "mov r15, 0\n" \ - "mov r14, rsp\n" \ - "add r14, 0x1000\n" \ - "mov rdi, rsp\n" \ - "add rdi, 0x2000\n" \ - "mov rsi, rsp\n" \ - "add rsi, 0x3000\n" \ - "mov rbp, rsp\n" \ - "add rbp, 0x4000\n" \ - ".att_syntax noprefix"); #endif \ No newline at end of file diff --git a/kernel/nb_km.c b/kernel/nb_km.c index 8f8069d..6397f58 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -1,5 +1,5 @@ // nanoBench -// +// // Copyright (C) 2019 Andreas Abel // // This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License. @@ -92,7 +92,7 @@ static ssize_t n_measurements_show(struct kobject *kobj, struct kobj_attribute * static ssize_t n_measurements_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { long old_n_measurements = n_measurements; sscanf(buf, "%ld", &n_measurements); - + if (old_n_measurements != n_measurements) { for (int i=0; iparent); - if(!nb_kobject) { + if (!nb_kobject) { pr_debug("failed to create and add nb\n"); return -1; } @@ -387,15 +392,19 @@ static int __init nb_init (void) { } static void __exit nb_exit (void) { - if (code) kfree(code); - if (code_init) kfree(code_init); - if (pfc_config_file_content) kfree(pfc_config_file_content); - if (runtime_code) vfree(runtime_code); - if (runtime_mem) kfree(runtime_mem); + kfree(code); + kfree(code_init); + kfree(pfc_config_file_content); + vfree(runtime_code); + vfree(runtime_r14); + vfree(runtime_rbp); + vfree(runtime_rdi); + vfree(runtime_rsi); + vfree(runtime_rsp); for (int i=0; i