// nanoBench // // Copyright (C) 2019 Andreas Abel // // This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License. // // This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License along with this program. If not, see . #include "nanoBench.h" long n_measurements = N_MEASUREMENTS_DEFAULT; long unroll_count = UNROLL_COUNT_DEFAULT; long loop_count = LOOP_COUNT_DEFAULT; long warm_up_count = WARM_UP_COUNT_DEFAULT; long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT; size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT; int drain_frontend = DRAIN_FRONTEND_DEFAULT; int no_mem = NO_MEM_DEFAULT; int no_normalization = NO_NORMALIZATION_DEFAULT; int basic_mode = BASIC_MODE_DEFAULT; int use_fixed_counters = USE_FIXED_COUNTERS_DEFAULT; int aggregate_function = AGGREGATE_FUNCTION_DEFAULT; int output_range = OUTPUT_RANGE_DEFAULT; int verbose = VERBOSE_DEFAULT; int debug = DEBUG_DEFAULT; char* code = NULL; size_t code_length = 0; char* code_init = NULL; size_t code_init_length = 0; char* code_late_init = NULL; size_t code_late_init_length = 0; char* code_one_time_init = NULL; size_t code_one_time_init_length = 0; struct pfc_config pfc_configs[2000] = {{0}}; size_t n_pfc_configs = 0; char* pfc_config_file_content = NULL; struct msr_config msr_configs[2000] = {{0}}; size_t n_msr_configs = 0; char* msr_config_file_content = NULL; unsigned long cur_rdmsr = 0; bool is_Intel_CPU = false; bool is_AMD_CPU = false; bool supports_tsc_deadline = false; int displ_family; int displ_model; int Intel_perf_mon_ver = -1; int Intel_FF_ctr_width = -1; int Intel_programmable_ctr_width = -1; int n_programmable_counters; char* runtime_code; char* 
runtime_one_time_init_code; void* runtime_r14; void* runtime_rbp; void* runtime_rdi; void* runtime_rsi; void* runtime_rsp; int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS]; void* RSP_mem; int64_t* measurement_results[MAX_PROGRAMMABLE_COUNTERS]; int64_t* measurement_results_base[MAX_PROGRAMMABLE_COUNTERS]; int cpu = -1; const char* NOPS[] = { "", "\x90", "\x66\x90", "\x0f\x1f\x00", "\x0f\x1f\x40\x00", "\x0f\x1f\x44\x00\x00", "\x66\x0f\x1f\x44\x00\x00", "\x0f\x1f\x80\x00\x00\x00\x00", "\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", "\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" }; void build_cpuid_string(char* buf, unsigned int r0, unsigned int r1, unsigned int r2, unsigned int r3) { memcpy(buf, (char*)&r0, 4); memcpy(buf+4, (char*)&r1, 4); memcpy(buf+8, (char*)&r2, 4); memcpy(buf+12, (char*)&r3, 4); } bool check_cpuid() { unsigned int eax, ebx, ecx, edx; __cpuid(0, eax, ebx, ecx, edx); char proc_vendor_string[17] = {0}; build_cpuid_string(proc_vendor_string, ebx, edx, ecx, 0); print_user_verbose("Vendor ID: %s\n", proc_vendor_string); char proc_brand_string[48]; __cpuid(0x80000002, eax, ebx, ecx, edx); build_cpuid_string(proc_brand_string, eax, ebx, ecx, edx); __cpuid(0x80000003, eax, ebx, ecx, edx); build_cpuid_string(proc_brand_string+16, eax, ebx, ecx, edx); __cpuid(0x80000004, eax, ebx, ecx, edx); build_cpuid_string(proc_brand_string+32, eax, ebx, ecx, edx); print_user_verbose("Brand: %s\n", proc_brand_string); __cpuid(0x01, eax, ebx, ecx, edx); displ_family = ((eax >> 8) & 0xF); if (displ_family == 0x0F) { displ_family += ((eax >> 20) & 0xFF); } displ_model = ((eax >> 4) & 0xF); if (displ_family == 0x06 || displ_family == 0x0F) { displ_model += ((eax >> 12) & 0xF0); } 
print_user_verbose("DisplayFamily_DisplayModel: %.2X_%.2XH\n", displ_family, displ_model); print_user_verbose("Stepping ID: %u\n", (eax & 0xF)); if (strcmp(proc_vendor_string, "GenuineIntel") == 0) { is_Intel_CPU = true; __cpuid(0x01, eax, ebx, ecx, edx); supports_tsc_deadline = (ecx >> 24) & 1; __cpuid(0x0A, eax, ebx, ecx, edx); Intel_perf_mon_ver = (eax & 0xFF); print_user_verbose("Performance monitoring version: %d\n", Intel_perf_mon_ver); if (Intel_perf_mon_ver < 2) { print_error("Error: performance monitoring version >= 2 required\n"); return true; } print_user_verbose("Number of fixed-function performance counters: %u\n", edx & 0x1F); n_programmable_counters = ((eax >> 8) & 0xFF); print_user_verbose("Number of general-purpose performance counters: %u\n", n_programmable_counters); if (n_programmable_counters < 2) { print_error("Error: only %u programmable counters available; nanoBench requires at least 2\n", n_programmable_counters); return true; } Intel_FF_ctr_width = (edx >> 5) & 0xFF; Intel_programmable_ctr_width = (eax >> 16) & 0xFF; print_user_verbose("Bit widths of fixed-function performance counters: %u\n", Intel_FF_ctr_width); print_user_verbose("Bit widths of general-purpose performance counters: %u\n", Intel_programmable_ctr_width); } else if (strcmp(proc_vendor_string, "AuthenticAMD") == 0) { is_AMD_CPU = true; n_programmable_counters = 6; } else { print_error("Error: unsupported CPU found\n"); return true; } return false; } void parse_counter_configs() { n_pfc_configs = 0; char* line; char* next_line = pfc_config_file_content; while ((line = strsep(&next_line, "\n")) != NULL) { if (strlen(line) == 0 || line[0] == '#') { continue; } pfc_configs[n_pfc_configs] = (struct pfc_config){0}; pfc_configs[n_pfc_configs].ctr = -1; char* config_str = strsep(&line, " \t"); if (line && strlen(line) > 0) { pfc_configs[n_pfc_configs].description = line; } else { pfc_configs[n_pfc_configs].description = config_str; } char buf[50]; if (strlen(config_str) >= 
sizeof(buf)) { print_error("config string too long: %s\n", config_str); continue; } strcpy(buf, config_str); char* tok = buf; char* evt_num = strsep(&tok, "."); pfc_configs[n_pfc_configs].evt_num = strtoul(evt_num, NULL, 16); if (!tok) { print_error("invalid configuration: %s\n", config_str); continue; } char* umask = strsep(&tok, "."); pfc_configs[n_pfc_configs].umask = strtoul(umask, NULL, 16); char* ce; while ((ce = strsep(&tok, ".")) != NULL) { if (!strcmp(ce, "AnyT")) { pfc_configs[n_pfc_configs].any = true; } else if (!strcmp(ce, "EDG")) { pfc_configs[n_pfc_configs].edge = true; } else if (!strcmp(ce, "INV")) { pfc_configs[n_pfc_configs].inv = true; } else if (!strcmp(ce, "TakenAlone")) { pfc_configs[n_pfc_configs].taken_alone = true; } else if (!strncmp(ce, "CTR=", 4)) { unsigned long counter; counter = strtoul(ce+4, NULL, 0); if (counter > n_programmable_counters) { print_error("invalid counter: %s\n", ce); continue; } pfc_configs[n_pfc_configs].ctr = counter; } else if (!strncmp(ce, "CMSK=", 5)) { pfc_configs[n_pfc_configs].cmask = strtoul(ce+5, NULL, 0); } else if (!strncmp(ce, "MSR_3F6H=", 9)) { pfc_configs[n_pfc_configs].msr_3f6h = strtoul(ce+9, NULL, 0); } else if (!strncmp(ce, "MSR_PF=", 7)) { pfc_configs[n_pfc_configs].msr_pf = strtoul(ce+7, NULL, 0); } else if (!strncmp(ce, "MSR_RSP0=", 9)) { pfc_configs[n_pfc_configs].msr_rsp0 = strtoul(ce+9, NULL, 0); } else if (!strncmp(ce, "MSR_RSP1=", 9)) { pfc_configs[n_pfc_configs].msr_rsp1 = strtoul(ce+9, NULL, 0); } } n_pfc_configs++; } } #ifdef __KERNEL__ void parse_msr_configs() { n_msr_configs = 0; char* line; char* next_line = msr_config_file_content; while ((line = strsep(&next_line, "\n")) != NULL) { if (strlen(line) == 0 || line[0] == '#') { continue; } char* wrmsr_str = strsep(&line, " \t"); char* rdmsr_str = strsep(&line, " \t"); if (line && strlen(line) > 0) { msr_configs[n_msr_configs].description = line; } else { msr_configs[n_msr_configs].description = rdmsr_str; rdmsr_str = wrmsr_str; 
wrmsr_str = line; } strreplace(rdmsr_str, 'h', '\0'); strreplace(rdmsr_str, 'H', '\0'); msr_configs[n_msr_configs].rdmsr = strtoul(rdmsr_str+4, NULL, 16); size_t n_wrmsr = 0; char* tok = wrmsr_str; char* ce; while ((ce = strsep(&tok, ".")) != NULL) { if (n_wrmsr >= 10) { print_error("Error: n_wrmsr >= 10"); break; } char* msr_str = strsep(&ce, "=")+4; pr_debug("msr_str: %s", msr_str); strreplace(msr_str, 'h', '\0'); strreplace(msr_str, 'H', '\0'); msr_configs[n_msr_configs].wrmsr[n_wrmsr] = strtoul(msr_str, NULL, 16); strreplace(ce, 'h', '\0'); strreplace(ce, 'H', '\0'); msr_configs[n_msr_configs].wrmsr_val[n_wrmsr] = strtoul(ce, NULL, 0); n_wrmsr++; } msr_configs[n_msr_configs].n_wrmsr = n_wrmsr; n_msr_configs++; } } #endif #ifndef __KERNEL__ uint64_t read_value_from_cmd(char* cmd) { FILE* fp; if(!(fp = popen(cmd, "r"))){ print_error("Error reading from \"%s\"", cmd); return 0; } char buf[20]; fgets(buf, sizeof(buf), fp); pclose(fp); uint64_t val; val = strtoul(buf, NULL, 0); return val; } #endif uint64_t read_msr(unsigned int msr) { #ifdef __KERNEL__ return native_read_msr(msr); #else char cmd[50]; snprintf(cmd, sizeof(cmd), "rdmsr -c -p%d %#x", cpu, msr); return read_value_from_cmd(cmd); #endif } void write_msr(unsigned int msr, uint64_t value) { #ifdef __KERNEL__ native_write_msr(msr, (uint32_t)value, (uint32_t)(value>>32)); #else char cmd[50]; snprintf(cmd, sizeof(cmd), "wrmsr -p%d %#x %#lx", cpu, msr, value); if (system(cmd)) { print_error("\"%s\" failed. 
You may need to disable Secure Boot (see README.md).", cmd); exit(1); } #endif } void change_bit_in_msr(unsigned int msr, unsigned int bit, bool bit_value) { uint64_t msr_value = read_msr(msr); msr_value &= ~((uint64_t)1 << bit); msr_value |= ((uint64_t)bit_value << bit); write_msr(msr, msr_value); } void set_bit_in_msr(unsigned int msr, unsigned int bit) { change_bit_in_msr(msr, bit, true); } void clear_bit_in_msr(unsigned int msr, unsigned int bit) { change_bit_in_msr(msr, bit, false); } uint64_t read_pmc(unsigned int counter) { unsigned long lo, hi; asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counter)); return lo | ((uint64_t)hi) << 32; } void clear_perf_counters() { if (is_Intel_CPU) { for (int i=0; i<3; i++) { write_msr(MSR_IA32_FIXED_CTR0+i, 0); } for (int i=0; i= n_pfc_configs) { break; } struct pfc_config config = pfc_configs[next_pfc_config]; if (config.taken_alone && evt_added) { break; } if ((config.ctr != -1) && (config.ctr != i)) { if (config.ctr >= n_counters) { print_error("Counter %u is not available", config.ctr); next_pfc_config++; } continue; } if (((avoid_counters >> i) & 1) && (config.ctr != i)) { continue; } next_pfc_config++; descriptions[i] = config.description; uint64_t perfevtselx = ((config.cmask & 0xFF) << 24); perfevtselx |= (config.inv << 23); perfevtselx |= (1ULL << 22); perfevtselx |= (config.any << 21); perfevtselx |= (config.edge << 18); perfevtselx |= (os << 17); perfevtselx |= (usr << 16); perfevtselx |= ((config.umask & 0xFF) << 8); perfevtselx |= (config.evt_num & 0xFF); write_msr(MSR_IA32_PERFEVTSEL0+i, perfevtselx); if (config.msr_3f6h) { write_msr(0x3f6, config.msr_3f6h); } if (config.msr_pf) { write_msr(MSR_PEBS_FRONTEND, config.msr_pf); } if (config.msr_rsp0) { write_msr(MSR_OFFCORE_RSP0, config.msr_rsp0); } if (config.msr_rsp1) { write_msr(MSR_OFFCORE_RSP1, config.msr_rsp1); } evt_added = true; if (config.taken_alone) { break; } } } else { for (int i=0; i= n_pfc_configs) { write_msr(CORE_X86_MSR_PERF_CTL + (2*i), 0); 
// Continuation of the non-Intel branch of the counter-programming function that begins on the preceding source line.
                continue;
            }

            struct pfc_config config = pfc_configs[next_pfc_config];
            // the configuration requests a different counter than i
            if ((config.ctr != -1) && (config.ctr != i)) {
                if (config.ctr >= n_counters) {
                    // the requested counter does not exist; drop this configuration
                    print_error("Counter %u is not available", config.ctr);
                    next_pfc_config++;
                }
                continue;
            }
            // counter i is to be avoided, unless the configuration explicitly asks for it
            if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
                continue;
            }
            next_pfc_config++;

            descriptions[i] = config.description;

            // assemble the control-register value for counter i; bit 22 is always set
            uint64_t perf_ctl = 0;
            perf_ctl |= ((config.evt_num) & 0xF00) << 24;
            perf_ctl |= (config.evt_num) & 0xFF;
            perf_ctl |= ((config.umask) & 0xFF) << 8;
            perf_ctl |= ((config.cmask) & 0x7F) << 24;
            perf_ctl |= (config.inv << 23);
            perf_ctl |= (1ULL << 22);
            perf_ctl |= (config.edge << 18);
            perf_ctl |= (os << 17);
            perf_ctl |= (usr << 16);

            write_msr(CORE_X86_MSR_PERF_CTL + (2*i), perf_ctl);
        }
    }
    // index of the first configuration that has not been handled
    return next_pfc_config;
}

// NOTE(review): this copy of the file appears to have lost text inside the loop header below
// (everything between a '<' and a following '>' seems to have been stripped). The body of
// configure_MSRs() and the beginning of the function that generates runtime_code (it uses
// measurement_template, local_unroll_count, and local_loop_count) are missing. The text is kept
// exactly as found and must be restored from the upstream sources before it can compile.
void configure_MSRs(struct msr_config config) {
    for (size_t i=0; i 0) {
                strcpy(&runtime_code[rcI], "\x49\xC7\xC7"); rcI += 3;
                *(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count
            }

            // pad with NOPs s.t. the address that is dist bytes further on is a multiple of 64, plus alignment_offset
            int dist = get_distance_to_code(measurement_template, templateI) + code_late_init_length;
            int n_fill = (64 - ((uintptr_t)&runtime_code[rcI+dist] % 64)) % 64;
            n_fill += alignment_offset;
            while (n_fill > 0) {
                int nop_len = min(15, n_fill);
                // Note: strcpy stops at the first 0 byte; several NOPS entries contain 0 bytes before their end,
                // so their last bytes are not written here, while rcI still advances by the full length.
                strcpy(&runtime_code[rcI], NOPS[nop_len]); rcI += nop_len;
                n_fill -= nop_len;
            }

            if (drain_frontend) {
                // 3 + 189 + 64*15 = 1152 bytes, a multiple of 64
                strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
                for (int i=0; i<189; i++) {
                    strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
                }
                for (int i=0; i<64; i++) {
                    strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
                }
            }
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_START)) {
            // remember the position of the marker; nothing is emitted for it
            magic_bytes_pfc_start_I = templateI;
            templateI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_CODE)) {
            magic_bytes_code_I = templateI;
            templateI += 8;

            // the late-init code and the front-end drain sequence are emitted only once, before the first copy of the code
            if (unrollI == 0 && codeI == 0) {
                if (code_late_init_length > 0) {
                    memcpy(&runtime_code[rcI], code_late_init, code_late_init_length);
                    rcI += code_late_init_length;
                }

                if (drain_frontend) {
                    // We first execute an lfence instruction, then, we fill the front-end buffers with 1-Byte NOPs, and then, we drain the buffers using
                    // 15-Byte NOPs; this makes sure that before the first 15-Byte NOP is predecoded, the front-end buffers contain only NOPs that can be
                    // issued at the maximum rate. The length of the added instructions is a multiple of 64, and thus doesn't affect the alignment.
                    strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
                    for (int i=0; i<189; i++) {
                        strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
                    }
                    for (int i=0; i<64; i++) {
                        strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
                    }
                }

                // target of the backward jump that is emitted after the last unrolled copy
                rcI_code_start = rcI;
            }

            if (!code_contains_magic_bytes) {
                // in this case, we can use memcpy, which is faster
                // NOTE(review): text appears to be missing in the loop header below as well (see the note above).
                for (unrollI=0; unrollI= local_unroll_count) {
                if (local_loop_count > 0) {
                    strcpy(&runtime_code[rcI], "\x49\xFF\xCF"); rcI += 3; // dec R15
                    strcpy(&runtime_code[rcI], "\x0F\x85"); rcI += 2;
                    *(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_code_start-rcI-4); rcI += 4; // jnz loop_start
                }

                if (drain_frontend) {
                    // We add an lfence followed by nop instructions s.t. the front end gets drained and the following instruction begins on a 32-byte boundary.
                    strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
                    for (int i=0; i<189; i++) {
                        strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
                    }
                    for (int i=0; i<61; i++) {
                        strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
                    }

                    // the remaining distance (between 14 and 45 bytes after the adjustment) is covered by exactly three NOPs
                    int dist_to_32Byte_boundary = 32 - ((uintptr_t)&runtime_code[rcI] % 32);
                    if (dist_to_32Byte_boundary <= (3*15) - 32) {
                        dist_to_32Byte_boundary += 32;
                    }

                    int len_nop1 = min(15, dist_to_32Byte_boundary - 2);
                    int len_nop2 = min(15, dist_to_32Byte_boundary - len_nop1 - 1);
                    int len_nop3 = dist_to_32Byte_boundary - len_nop1 - len_nop2;

                    strcpy(&runtime_code[rcI], NOPS[len_nop1]); rcI += len_nop1;
                    strcpy(&runtime_code[rcI], NOPS[len_nop2]); rcI += len_nop2;
                    strcpy(&runtime_code[rcI], NOPS[len_nop3]); rcI += len_nop3;
                }

                if (debug) {
                    runtime_code[rcI++] = '\xCC'; // INT3
                }
            }
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_END)) {
            // as long as not all unrolled copies have been emitted, go back to the code marker
            if (unrollI < local_unroll_count) {
                templateI = magic_bytes_code_I;
            } else {
                templateI += 8;
            }
        // each of the following 8-byte placeholders is replaced by an 8-byte pointer value
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) {
            *(void**)(&runtime_code[rcI]) = pfc_mem;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_MSR)) {
            *(void**)(&runtime_code[rcI]) = (void*)cur_rdmsr;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RSP_ADDRESS)) {
            *(void**)(&runtime_code[rcI]) = &RSP_mem;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
            *(void**)(&runtime_code[rcI]) = runtime_r14;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
            *(void**)(&runtime_code[rcI]) = runtime_rbp;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
            *(void**)(&runtime_code[rcI]) = runtime_rdi;
            templateI += 8; rcI += 8;
        } else if
// Tail of the runtime-code generator; this is the condition of the "else if" that ends the preceding source line.
        (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
            *(void**)(&runtime_code[rcI]) = runtime_rsi;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
            *(void**)(&runtime_code[rcI]) = runtime_rsp;
            templateI += 8; rcI += 8;
        } else {
            // no placeholder at this position: copy the template byte unchanged
            runtime_code[rcI++] = measurement_template[templateI++];
        }

        continue_outer_loop: ;
    }
    // skip the 8-byte end marker, and copy the rest of the template up to and including the first 0xC3 byte
    templateI += 8;
    do {
        runtime_code[rcI++] = measurement_template[templateI++];
    } while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret
}

// Builds the one-time-init code in runtime_one_time_init_code from one_time_init_template, and
// then calls it as a function without arguments. Does nothing if code_one_time_init_length is 0.
//
// The template is copied byte by byte up to the MAGIC_BYTES_TEMPLATE_END marker; 8-byte
// placeholders found on the way are replaced as follows:
//   MAGIC_BYTES_INIT                 -> the code_one_time_init bytes
//   MAGIC_BYTES_RSP_ADDRESS          -> the address of RSP_mem
//   MAGIC_BYTES_RUNTIME_{R14,RBP,RDI,RSI,RSP} -> the corresponding runtime_* pointer value
// NOTE(review): no bounds are checked here; runtime_one_time_init_code presumably has to be large
// enough and executable -- confirm where it is allocated.
void create_and_run_one_time_init_code() {
    if (code_one_time_init_length == 0) return;

    char* template = (char*)&one_time_init_template;
    size_t templateI = 0;
    size_t rcI = 0;

    while (!starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_TEMPLATE_END)) {
        if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_INIT)) {
            templateI += 8;
            memcpy(&runtime_one_time_init_code[rcI], code_one_time_init, code_one_time_init_length);
            rcI += code_one_time_init_length;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RSP_ADDRESS)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = &RSP_mem;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = runtime_r14;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rbp;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rdi;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rsi;
            templateI += 8; rcI += 8;
        } else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
            *(void**)(&runtime_one_time_init_code[rcI]) = runtime_rsp;
            templateI += 8; rcI += 8;
        } else {
            // no placeholder at this position: copy the template byte unchanged
            runtime_one_time_init_code[rcI++] = template[templateI++];
        }
    }
    // skip the 8-byte end marker, and copy the rest of the template up to and including the first 0xC3 byte
    templateI += 8;
    do {
        runtime_one_time_init_code[rcI++] = template[templateI++];
    } while (template[templateI-1] != '\xC3'); // 0xC3 = ret

    ((void(*)(void))runtime_one_time_init_code)();
}

// Generates runtime code from initial_warm_up_template using the global unroll_count and
// loop_count; does nothing if initial_warm_up_count is 0.
// NOTE(review): this copy of the file appears to have lost text inside the loop header below
// (everything between a '<' and a following '>' seems to have been stripped). The rest of this
// function and the beginning of the function that aggregates the measured values are missing.
// The text is kept exactly as found, and it also ends in the middle of a loop header.
void run_initial_warmup_experiment() {
    if (!initial_warm_up_count) return;

    create_runtime_code((char*)&initial_warm_up_template, unroll_count, loop_count);

    for (int i=0; i max) {
                max = values[i];
            }
        }
        return max * scale;
    } else {
        // the remaining aggregate functions work on the sorted values
        qsort(values, length, sizeof(int64_t), cmpInt64);

        if (agg_func == AVG_20_80) {
            // computes the average of the values between the 20 and 80 percentile
            int64_t sum = 0;
            int count = 0;
            for (int i=length/5; i