From f3b31d391c01869c9ffbdbc6902d81b41f8f66a7 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Tue, 24 Sep 2019 14:31:47 +0200 Subject: [PATCH] starting/stopping counters --- common/nanoBench.c | 417 +++++++++++++++++++++++++++++++----------- common/nanoBench.h | 14 +- kernel/Makefile | 4 +- kernel/nb_km.c | 55 +++++- set-R14-size.sh | 2 +- user/nanoBench_main.c | 14 +- 6 files changed, 386 insertions(+), 120 deletions(-) diff --git a/common/nanoBench.c b/common/nanoBench.c index 98f40a6..02d9fc1 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -99,7 +99,6 @@ int check_cpuid() { if (strcmp(proc_vendor_string, "GenuineIntel") == 0) { is_Intel_CPU = 1; - n_programmable_counters = 4; __cpuid(0x0A, eax, ebx, ecx, edx); unsigned int perf_mon_ver = (eax & 0xFF); @@ -111,12 +110,17 @@ int check_cpuid() { unsigned int n_available_counters = ((eax >> 8) & 0xFF); print_user_verbose("Number of general-purpose performance counters: %u\n", n_available_counters); - print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF)); - - if (n_available_counters < n_programmable_counters) { - print_error("Error: only %u programmable counters available; nanoBench requires at least %u\n", n_available_counters, n_programmable_counters); + if (n_available_counters >= 4) { + n_programmable_counters = 4; + } else if (n_available_counters >= 2) { + n_programmable_counters = 2; + } else { + print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_available_counters); return 1; } + + print_user_verbose("Bit widths of general-purpose performance counters: %u\n", ((eax >> 16) & 0xFF)); + } else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) { is_AMD_CPU = 1; n_programmable_counters = 6; @@ -138,6 +142,8 @@ void parse_counter_configs() { continue; } + pfc_configs[n_pfc_configs].invalid = 0; + char* config_str = strsep(&line, " \t"); if (line && strlen(line) > 0) { @@ -401,72 +407,132 @@ void configure_MSRs(struct msr_config config) { cur_rdmsr = config.rdmsr; } +size_t get_required_runtime_code_length() { + size_t req_code_length = code_length; + for (size_t i=0; i+7 0) { + runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7'; + *(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count + rcI_loop_start = rcI; + } + } + + if (!code_contains_magic_bytes) { + // in this case, we can use a memcpy, which is faster + for (unrollI=0; unrollI= local_unroll_count) { + if (local_loop_count > 0) { + runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xFF'; runtime_code[rcI++] = '\xCF'; // dec R15 + runtime_code[rcI++] = '\x0F'; runtime_code[rcI++] = '\x85'; + *(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_loop_start-rcI-4); rcI += 4; // jnz loop_start } - runtime_code[rci++] = '\x49'; runtime_code[rci++] = '\xFF'; runtime_code[rci++] = '\xCF'; // dec R15 - runtime_code[rci++] = '\x0F'; runtime_code[rci++] = '\x85'; - *(int32_t*)(&runtime_code[rci]) = (int32_t)(rci_loop_start-rci-4); rci += 4; // jnz loop_start + if (debug) { + runtime_code[rcI++] = '\xCC'; // INT3 + } } - - if (debug) { - runtime_code[rci++] = '\xCC'; // INT3 + } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_END)) { + if (unrollI < local_unroll_count) { + templateI = magic_bytes_code_I; + } else { + templateI += 8; } } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) { - *(void**)(&runtime_code[rci]) = pfc_mem; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = pfc_mem; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_MSR)) { - *(void**)(&runtime_code[rci]) = (void*)cur_rdmsr; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = (void*)cur_rdmsr; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RSP_ADDRESS)) { - *(void**)(&runtime_code[rci]) = &RSP_mem; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = &RSP_mem; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) { - *(void**)(&runtime_code[rci]) = runtime_r14; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = runtime_r14; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) { - *(void**)(&runtime_code[rci]) = runtime_rbp; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = runtime_rbp; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) { - *(void**)(&runtime_code[rci]) = runtime_rdi; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = runtime_rdi; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) { - *(void**)(&runtime_code[rci]) = runtime_rsi; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = runtime_rsi; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) { - *(void**)(&runtime_code[rci]) = runtime_rsp; - templateI += 8; rci += 8; + *(void**)(&runtime_code[rcI]) = runtime_rsp; + templateI += 8; rcI += 8; } else { - runtime_code[rci++] = measurement_template[templateI++]; + runtime_code[rcI++] = measurement_template[templateI++]; } + continue_outer_loop: ; } templateI += 8; do { - runtime_code[rci++] = measurement_template[templateI++]; + runtime_code[rcI++] = measurement_template[templateI++]; } while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret } @@ -474,39 +540,39 @@ void create_and_run_one_time_init_code() { if (code_one_time_init_length == 0) return; char* template = (char*)&one_time_init_template; - int templateI = 0; - int rci = 0; + size_t templateI = 0; + size_t rcI = 0; while (!starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_TEMPLATE_END)) { if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_INIT)) { templateI += 8; - memcpy(&runtime_one_time_init_code[rci], code_one_time_init, code_one_time_init_length); - rci += code_one_time_init_length; + memcpy(&runtime_one_time_init_code[rcI], code_one_time_init, code_one_time_init_length); + rcI += code_one_time_init_length; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RSP_ADDRESS)) { - *(void**)(&runtime_one_time_init_code[rci]) = &RSP_mem; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = &RSP_mem; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_R14)) { - *(void**)(&runtime_one_time_init_code[rci]) = runtime_r14; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = runtime_r14; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RBP)) { - *(void**)(&runtime_one_time_init_code[rci]) = runtime_rbp; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rbp; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RDI)) { - *(void**)(&runtime_one_time_init_code[rci]) = runtime_rdi; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rdi; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSI)) { - *(void**)(&runtime_one_time_init_code[rci]) = runtime_rsi; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rsi; + templateI += 8; rcI += 8; } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSP)) { - *(void**)(&runtime_one_time_init_code[rci]) = runtime_rsp; - templateI += 8; rci += 8; + *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rsp; + templateI += 8; rcI += 8; } else { - runtime_one_time_init_code[rci++] = template[templateI++]; + runtime_one_time_init_code[rcI++] = template[templateI++]; } } templateI += 8; do { - runtime_one_time_init_code[rci++] = template[templateI++]; + runtime_one_time_init_code[rcI++] = template[templateI++]; } while (template[templateI-1] != '\xC3'); // 0xC3 = ret ((void(*)(void))runtime_one_time_init_code)(); @@ -626,7 +692,59 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) { return (*((int64_t*)c) == magic_bytes); } -void measurement_template_Intel() { +void measurement_template_Intel_2() { + SAVE_REGS_FLAGS(); + asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); + asm volatile( + ".intel_syntax noprefix \n" + "push rax \n" + "lahf \n" + "seto al \n" + "push rax \n" + "push rcx \n" + "push rdx \n" + "push r15 \n" + "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" + "mov qword ptr [r15 + 0], 0 \n" + "mov qword ptr [r15 + 8], 0 \n" + "mov rcx, 0 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "sub [r15 + 0], rdx \n" + "mov rcx, 1 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "sub [r15 + 8], rdx \n" + "lfence \n" + "pop r15; lfence \n" + "pop rdx; lfence \n" + "pop rcx; lfence \n" + "pop rax; lfence \n" + "cmp al, -127; lfence \n" + "sahf; lfence \n" + "pop rax; \n" + "lfence \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_CODE)); + asm volatile( + ".intel_syntax noprefix \n" + "lfence \n" + "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" + "mov rcx, 0 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "add [r15 + 0], rdx \n" + "mov rcx, 1 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "add [r15 + 8], rdx \n" + "lfence \n" + ".att_syntax noprefix "); + RESTORE_REGS_FLAGS(); + asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END)); +} + +void measurement_template_Intel_4() { SAVE_REGS_FLAGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( @@ -643,19 +761,19 @@ void measurement_template_Intel() { "mov qword ptr [r15 + 8], 0 \n" "mov qword ptr [r15 + 16], 0 \n" "mov qword ptr [r15 + 24], 0 \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 0], rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 8], rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 16], rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 24], rdx \n" @@ -674,19 +792,19 @@ void measurement_template_Intel() { ".intel_syntax noprefix \n" "lfence \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 0], rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 8], rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 16], rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 24], rdx \n" @@ -696,7 +814,52 @@ void measurement_template_Intel() { asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END)); } -void measurement_template_Intel_noMem() { +void measurement_template_Intel_noMem_2() { + SAVE_REGS_FLAGS(); + asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); + asm volatile( + ".intel_syntax noprefix \n" + "mov r8, 0 \n" + "mov r9, 0 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" + "mov rcx, 0 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "sub r8, rdx \n" + "mov rcx, 1 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "sub r9, rdx \n" + "lfence \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_CODE)); + asm volatile( + ".intel_syntax noprefix \n" + "lfence \n" + "mov rcx, 0 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "add r8, rdx \n" + "mov rcx, 1 \n" + "lfence; rdpmc; lfence \n" + "shl rdx, 32; or rdx, rax \n" + "add r9, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" + "mov rax, "STRINGIFY(MAGIC_BYTES_PFC)" \n" + "mov [rax + 0], r8 \n" + "mov [rax + 8], r9 \n" + ".att_syntax noprefix "); + RESTORE_REGS_FLAGS(); + asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END)); +} + +void measurement_template_Intel_noMem_4() { SAVE_REGS_FLAGS(); asm(".quad "STRINGIFY(MAGIC_BYTES_INIT)); asm volatile( @@ -705,19 +868,23 @@ void measurement_template_Intel_noMem() { "mov r9, 0 \n" "mov r10, 0 \n" "mov r11, 0 \n" - "mov rcx, 0x00000000 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r8, rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r9, rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r10, rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r11, rdx \n" @@ -727,22 +894,26 @@ void measurement_template_Intel_noMem() { asm volatile( ".intel_syntax noprefix \n" "lfence \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r9, rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r10, rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r11, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov rax, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [rax + 0], r8 \n" "mov [rax + 8], r9 \n" @@ -772,27 +943,27 @@ void measurement_template_AMD() { "mov qword ptr [r15 + 24], 0 \n" "mov qword ptr [r15 + 32], 0 \n" "mov qword ptr [r15 + 40], 0 \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 0], rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 8], rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 16], rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 24], rdx \n" - "mov rcx, 0x00000004 \n" + "mov rcx, 4 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 32], rdx \n" - "mov rcx, 0x00000005 \n" + "mov rcx, 5 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub [r15 + 40], rdx \n" @@ -811,27 +982,27 @@ void measurement_template_AMD() { ".intel_syntax noprefix \n" "lfence \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 0], rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 8], rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 16], rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 24], rdx \n" - "mov rcx, 0x00000004 \n" + "mov rcx, 4 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 32], rdx \n" - "mov rcx, 0x00000005 \n" + "mov rcx, 5 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add [r15 + 40], rdx \n" @@ -852,27 +1023,31 @@ void measurement_template_AMD_noMem() { "mov r11, 0 \n" "mov r12, 0 \n" "mov r13, 0 \n" - "mov rcx, 0x00000000 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r8, rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r9, rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r10, rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r11, rdx \n" - "mov rcx, 0x00000004 \n" + "mov rcx, 4 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r12, rdx \n" - "mov rcx, 0x00000005 \n" + "mov rcx, 5 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r13, rdx \n" @@ -882,30 +1057,34 @@ void measurement_template_AMD_noMem() { asm volatile( ".intel_syntax noprefix \n" "lfence \n" - "mov rcx, 0x00000000 \n" + "mov rcx, 0 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" - "mov rcx, 0x00000001 \n" + "mov rcx, 1 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r9, rdx \n" - "mov rcx, 0x00000002 \n" + "mov rcx, 2 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r10, rdx \n" - "mov rcx, 0x00000003 \n" + "mov rcx, 3 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r11, rdx \n" - "mov rcx, 0x00000004 \n" + "mov rcx, 4 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r12, rdx \n" - "mov rcx, 0x00000005 \n" + "mov rcx, 5 \n" "lfence; rdpmc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r13, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov rax, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [rax + 0], r8 \n" "mov [rax + 8], r9 \n" @@ -994,6 +1173,10 @@ void measurement_FF_template_Intel_noMem() { "mov r9, 0 \n" "mov r10, 0 \n" "mov r11, 0 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r8, rdx \n" @@ -1030,6 +1213,10 @@ void measurement_FF_template_Intel_noMem() { "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [r15], r8 \n" "mov [r15+8], r9 \n" @@ -1107,6 +1294,10 @@ void measurement_FF_template_AMD_noMem() { "mov r8, 0 \n" "mov r9, 0 \n" "mov r10, 0 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r8, rdx \n" @@ -1135,6 +1326,10 @@ void measurement_FF_template_AMD_noMem() { "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [r15], r8 \n" "mov [r15+8], r9 \n" @@ -1188,6 +1383,10 @@ void measurement_RDTSC_template_noMem() { asm volatile( ".intel_syntax noprefix \n" "mov r8, 0 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "sub r8, rdx \n" @@ -1200,6 +1399,10 @@ void measurement_RDTSC_template_noMem() { "lfence; rdtsc; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [r15], r8 \n" ".att_syntax noprefix "); @@ -1255,6 +1458,10 @@ void measurement_RDMSR_template_noMem() { asm volatile( ".intel_syntax noprefix \n" "mov r8, 0 \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_START)); + asm volatile( + ".intel_syntax noprefix \n" "mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n" "lfence; rdmsr; lfence \n" "shl rdx, 32; or rdx, rax \n" @@ -1269,6 +1476,10 @@ void measurement_RDMSR_template_noMem() { "lfence; rdmsr; lfence \n" "shl rdx, 32; or rdx, rax \n" "add r8, rdx \n" + ".att_syntax noprefix "); + asm(".quad "STRINGIFY(MAGIC_BYTES_PFC_END)); + asm volatile( + ".intel_syntax noprefix \n" "mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n" "mov [r15], r8 \n" ".att_syntax noprefix "); diff --git a/common/nanoBench.h b/common/nanoBench.h index e8d8988..8c065a8 100644 --- a/common/nanoBench.h +++ b/common/nanoBench.h @@ -205,6 +205,8 @@ void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsi void configure_MSRs(struct msr_config config); +size_t get_required_runtime_code_length(void); + void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count); void run_warmup_experiment(char* measurement_template); void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count); @@ -229,6 +231,12 @@ void print_all_measurement_results(int64_t* results[], int n_counters); #define MAGIC_BYTES_PFC 0x90b513b1C2813F04 #define MAGIC_BYTES_MSR 0xA0b513b1C2813F04 #define MAGIC_BYTES_TEMPLATE_END 0xB0b513b1C2813F04 +#define MAGIC_BYTES_PFC_START 0xC0b513b1C2813F04 +#define MAGIC_BYTES_PFC_END 0xD0b513b1C2813F04 + +#define MAGIC_BYTES_CODE_PFC_START 0xE0b513b1C2813F04 +#define MAGIC_BYTES_CODE_PFC_STOP 0xF0b513b1C2813F04 + #define STRINGIFY2(X) #X #define STRINGIFY(X) STRINGIFY2(X) @@ -236,8 +244,10 @@ void print_all_measurement_results(int64_t* results[], int n_counters); int starts_with_magic_bytes(char* c, int64_t magic_bytes); // The following functions must not use global variables (or anything that uses RIP-relative addressing) -void measurement_template_Intel(void); -void measurement_template_Intel_noMem(void); +void measurement_template_Intel_2(void); +void measurement_template_Intel_4(void); +void measurement_template_Intel_noMem_2(void); +void measurement_template_Intel_noMem_4(void); void measurement_template_AMD(void); void measurement_template_AMD_noMem(void); void measurement_FF_template_Intel(void); diff --git a/kernel/Makefile b/kernel/Makefile index 08bcbf0..b6e03f5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,6 @@ all: clean: - rm -f hp ../common/*.o ../common/*.ur-safe - make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean + rm -f ../common/*.o ../common/*.ur-safe + rm -rf *.o *.ko *.mod.c .tmp_versions modules.order Module.symvers diff --git a/kernel/nb_km.c b/kernel/nb_km.c index f5ea494..da4fda2 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -37,7 +37,7 @@ size_t n_r14_segments = 0; static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_len, size_t *buf_memory_size) { struct file *filp = NULL; filp = filp_open(file_name, O_RDONLY, 0); - if (!filp) { + if (IS_ERR(filp)) { pr_debug("Error opening file %s\n", file_name); return -1; } @@ -55,11 +55,12 @@ static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_ if (file_size + 1 > *buf_memory_size) { kfree(*buf); - *buf_memory_size = max(file_size + 1, PAGE_SIZE); + *buf_memory_size = max(2*(file_size + 1), PAGE_SIZE); *buf = kmalloc(*buf_memory_size, GFP_KERNEL); if (!*buf) { printk(KERN_ERR "Could not allocate memory for %s\n", file_name); *buf_memory_size = 0; + filp_close(filp, NULL); return -1; } } @@ -68,12 +69,13 @@ static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_ kernel_read(filp, *buf, file_size, &pos); (*buf)[file_size] = '\0'; + path_put(&p); filp_close(filp, NULL); return 0; } static void extend_runtime_code(void) { - size_t new_runtime_code_memory_size = 10000 + code_init_memory_size + 2*(unroll_count)*code_memory_size; + size_t new_runtime_code_memory_size = get_required_runtime_code_length(); if (new_runtime_code_memory_size > runtime_code_memory_size) { runtime_code_memory_size = new_runtime_code_memory_size; vfree(runtime_code); @@ -189,12 +191,16 @@ static ssize_t n_measurements_store(struct kobject *kobj, struct kobj_attribute long old_n_measurements = n_measurements; sscanf(buf, "%ld", &n_measurements); - if (old_n_measurements != n_measurements) { + if (old_n_measurements < n_measurements) { for (int i=0; i= 4) { + measurement_template = (char*)&measurement_template_Intel_noMem_4; + } else { + measurement_template = (char*)&measurement_template_Intel_noMem_2; + } } else { - measurement_template = (char*)&measurement_template_Intel; + if (n_programmable_counters >= 4) { + measurement_template = (char*)&measurement_template_Intel_4; + } else { + measurement_template = (char*)&measurement_template_Intel_2; + } } } @@ -501,7 +537,7 @@ static const struct file_operations proc_file_fops = { static struct kobject* nb_kobject; -static int __init nb_init (void) { +static int __init nb_init(void) { pr_debug("Initializing nanoBench kernel module...\n"); if (check_cpuid()) { @@ -557,6 +593,7 @@ static int __init nb_init (void) { error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr); error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr); error |= sysfs_create_file(nb_kobject, &r14_size_attribute.attr); + error |= sysfs_create_file(nb_kobject, &print_r14_attribute.attr); error |= sysfs_create_file(nb_kobject, &verbose_attribute.attr); if (error) { @@ -573,7 +610,7 @@ static int __init nb_init (void) { return 0; } -static void __exit nb_exit (void) { +static void __exit nb_exit(void) { kfree(code); kfree(code_init); kfree(code_one_time_init); diff --git a/set-R14-size.sh b/set-R14-size.sh index 02cc24d..1616b5b 100755 --- a/set-R14-size.sh +++ b/set-R14-size.sh @@ -7,7 +7,7 @@ fi if [ $# -eq 0 ]; then echo "Usage: sudo ./set-R14-size.sh " - echo "Example: sudo ./enable-HugePages.sh 128M" + echo "Example: sudo ./set-R14-size.sh 128M" exit 1 fi diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index 6562873..4e95354 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { /************************************* * Allocate memory ************************************/ - size_t runtime_code_length = code_init_length + (unroll_count)*code_length*2 + 10000; + size_t runtime_code_length = get_required_runtime_code_length(); posix_memalign((void**)&runtime_code, sysconf(_SC_PAGESIZE), runtime_code_length); if (!runtime_code) { fprintf(stderr, "Error: Failed to allocate memory for runtime_code\n"); @@ -290,9 +290,17 @@ int main(int argc, char **argv) { } } else { if (no_mem) { - measurement_template = (char*)&measurement_template_Intel_noMem; + if (n_programmable_counters >= 4) { + measurement_template = (char*)&measurement_template_Intel_noMem_4; + } else { + measurement_template = (char*)&measurement_template_Intel_noMem_2; + } } else { - measurement_template = (char*)&measurement_template_Intel; + if (n_programmable_counters >= 4) { + measurement_template = (char*)&measurement_template_Intel_4; + } else { + measurement_template = (char*)&measurement_template_Intel_2; + } } }