// nanoBench
//
// Copyright (C) 2019 Andreas Abel
//
// This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

#include <linux/fs.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <../arch/x86/include/asm/fpu/api.h>

#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,12,0)
    #include <asm/cacheflush.h>
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
    // set_memory_x/set_memory_nx are no longer exported to modules; their addresses are obtained via kallsyms_lookup_name() in nb_init()
    int (*set_memory_x)(unsigned long, int) = 0;
    int (*set_memory_nx)(unsigned long, int) = 0;
#else
    #include <asm/set_memory.h>
#endif

#include "../common/nanoBench.h"

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andreas Abel");

// __vmalloc no longer has the pgprot_t parameter, so we need to look up and call __vmalloc_node_range directly
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
void *(*kallsym__vmalloc_node_range)(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot,
                                     unsigned long vm_flags, int node, const void *caller);
#endif

// kallsyms_lookup_name is no longer exported to modules; we use a kprobe to get its address
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
#include <linux/kprobes.h>
unsigned long kallsyms_lookup_name(const char* name) {
    struct kprobe kp = {
        .symbol_name = name,
    };
    int ret = register_kprobe(&kp);
    if (ret < 0) {
        return 0;
    }
    unregister_kprobe(&kp);
    return (unsigned long)kp.addr;
}
#endif

// 4 MB is the maximum that kmalloc supports on my machines
#define KMALLOC_MAX (4*1024*1024)

// If enabled, for cycle-by-cycle measurements, the output includes all of the measurement overhead; otherwise, only the cycles between adding the first
// instruction of the benchmark to the IDQ, and retiring the last instruction of the benchmark are considered.
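// (Example for illustration, not from the original sources: for a benchmark consisting of a single NOP, the cycle-by-cycle output with end_to_end
// disabled starts in the cycle in which the NOP is added to the IDQ and ends in the cycle in which it retires; with end_to_end enabled, the output
// also covers the measurement code that runs before and after the NOP.)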
int end_to_end = false;

char* runtime_code_base = NULL;

size_t code_offset = 0;

size_t code_memory_size = 0;
size_t code_init_memory_size = 0;
size_t code_late_init_memory_size = 0;
size_t code_one_time_init_memory_size = 0;
size_t pfc_config_memory_size = 0;
size_t msr_config_memory_size = 0;
size_t runtime_code_base_memory_size = 0;
size_t runtime_one_time_init_code_memory_size = 0;

void** r14_segments = NULL;
size_t n_r14_segments = 0;

static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_len, size_t *buf_memory_size) {
    struct file *filp = NULL;
    filp = filp_open(file_name, O_RDONLY, 0);
    if (IS_ERR(filp)) {
        pr_err("Error opening file %s\n", file_name);
        return -1;
    }

    struct path p;
    struct kstat ks;
    kern_path(file_name, 0, &p);
    #if LINUX_VERSION_CODE <= KERNEL_VERSION(4,11,0)
    if (vfs_getattr(&p, &ks)) {
    #else
    if (vfs_getattr(&p, &ks, 0, 0)) {
    #endif
        pr_err("Error getting file attributes\n");
        return -1;
    }
    size_t file_size = ks.size;
    *buf_len = file_size;

    if (file_size + 1 > *buf_memory_size) {
        kfree(*buf);
        *buf_memory_size = max(2*(file_size + 1), PAGE_SIZE);
        *buf = kmalloc(*buf_memory_size, GFP_KERNEL);
        if (!*buf) {
            pr_err("Could not allocate memory for %s\n", file_name);
            *buf_memory_size = 0;
            filp_close(filp, NULL);
            return -1;
        }
    }

    loff_t pos = 0;
    kernel_read(filp, *buf, file_size, &pos);
    (*buf)[file_size] = '\0';

    path_put(&p);
    filp_close(filp, NULL);
    return 0;
}

static ssize_t code_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return 0;
}
static ssize_t code_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    read_file_into_buffer(buf, &code, &code_length, &code_memory_size);
    return count;
}
static struct kobj_attribute code_attribute = __ATTR(code, 0660, code_show, code_store);

static ssize_t init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return 0;
}
static ssize_t init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    read_file_into_buffer(buf, &code_init, &code_init_length, &code_init_memory_size);
    return count;
}
static struct kobj_attribute code_init_attribute = __ATTR(init, 0660, init_show, init_store);

static ssize_t late_init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return 0;
}
static ssize_t late_init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    read_file_into_buffer(buf, &code_late_init, &code_late_init_length, &code_late_init_memory_size);
    return count;
}
static struct kobj_attribute code_late_init_attribute = __ATTR(late_init, 0660, late_init_show, late_init_store);

static ssize_t one_time_init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return 0;
}
static ssize_t one_time_init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    read_file_into_buffer(buf, &code_one_time_init, &code_one_time_init_length, &code_one_time_init_memory_size);

    size_t new_runtime_one_time_init_code_memory_size = 10000 + code_one_time_init_memory_size;
    if (new_runtime_one_time_init_code_memory_size > runtime_one_time_init_code_memory_size) {
        runtime_one_time_init_code_memory_size = new_runtime_one_time_init_code_memory_size;
        vfree(runtime_one_time_init_code);
        #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
        runtime_one_time_init_code = kallsym__vmalloc_node_range(runtime_one_time_init_code_memory_size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0));
        #else
        runtime_one_time_init_code = __vmalloc(runtime_one_time_init_code_memory_size, GFP_KERNEL, PAGE_KERNEL_EXEC);
        #endif
        if (!runtime_one_time_init_code) {
            runtime_one_time_init_code_memory_size = 0;
            pr_err("failed to allocate executable memory\n");
        }
    }
    return count;
}
static struct kobj_attribute code_one_time_init_attribute = __ATTR(one_time_init, 0660, one_time_init_show, one_time_init_store);

static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    ssize_t count = 0;
    for (int i=0; i PAGE_SIZE) {
            return PAGE_SIZE-1;
        }
    }
    return count;
}
static ssize_t config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    size_t pfc_config_length;
    read_file_into_buffer(buf, &pfc_config_file_content, &pfc_config_length, &pfc_config_memory_size);
    parse_counter_configs();
    return count;
}
static struct kobj_attribute config_attribute = __ATTR(config, 0660, config_show, config_store);

static ssize_t msr_config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    ssize_t count = 0;
    for (int i=0; i 0) { for (int i=0; i runtime_code_base_memory_size) {
        pr_err("Maximum supported code size %zu kB; requested %zu kB\n", runtime_code_base_memory_size/1024, req_code_length/1024);
        return false;
    }
    return true;
}

static int run_nanoBench(struct seq_file *m, void *v) {
    if (!check_memory_allocations()) {
        return -1;
    }

    kernel_fpu_begin();
    disable_interrupts_preemption();

    clear_perf_counter_configurations();
    clear_perf_counters();
    clear_overflow_status_bits();
    enable_perf_ctrs_globally();

    long base_unroll_count = (basic_mode?0:unroll_count);
    long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
    long base_loop_count = (basic_mode?0:loop_count);
    long main_loop_count = loop_count;

    char buf[100];
    char* measurement_template;

    create_and_run_one_time_init_code();
    run_initial_warmup_experiment();

    /*********************************
     * Fixed-function counters.
     ********************************/
    if (use_fixed_counters) {
        if (is_AMD_CPU) {
            if (no_mem) {
                measurement_template = (char*)&measurement_FF_template_AMD_noMem;
            } else {
                measurement_template = (char*)&measurement_FF_template_AMD;
            }
        } else {
            if (no_mem) {
                measurement_template = (char*)&measurement_FF_template_Intel_noMem;
            } else {
                measurement_template = (char*)&measurement_FF_template_Intel;
            }
        }

        if (is_AMD_CPU) {
            run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count);
            run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count);

            if (verbose) {
                pr_info("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
                print_all_measurement_results(measurement_results_base, 3);
                pr_info("RDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
                print_all_measurement_results(measurement_results, 3);
            }

            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
        } else {
            configure_perf_ctrs_FF_Intel(false, true);

            run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
            run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);

            if (verbose) {
                pr_info("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
                print_all_measurement_results(measurement_results_base, 4);
                pr_info("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
                print_all_measurement_results(measurement_results, 4);
            }

            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
            seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
        }
    }

    /*********************************
     * Programmable counters.
     ********************************/
    int n_used_counters = n_programmable_counters;
    if (is_AMD_CPU) {
        if (no_mem) {
            measurement_template = (char*)&measurement_template_AMD_noMem;
        } else {
            measurement_template = (char*)&measurement_template_AMD;
        }
    } else {
        if (n_used_counters >= 4) {
            n_used_counters = 4;
            if (no_mem) {
                measurement_template = (char*)&measurement_template_Intel_noMem_4;
            } else {
                measurement_template = (char*)&measurement_template_Intel_4;
            }
        } else {
            n_used_counters = 2;
            if (no_mem) {
                measurement_template = (char*)&measurement_template_Intel_noMem_2;
            } else {
                measurement_template = (char*)&measurement_template_Intel_2;
            }
        }
    }

    size_t next_pfc_config = 0;
    while (next_pfc_config < n_pfc_configs) {
        char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
        // on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
        next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, 0, pfc_descriptions);

        run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
        run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);

        if (verbose) {
            pr_info("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
            print_all_measurement_results(measurement_results_base, n_used_counters);
            pr_info("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
            print_all_measurement_results(measurement_results, n_used_counters);
        }

        for (size_t c=0; c < n_used_counters; c++) {
            if (pfc_descriptions[c]) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
        }
    }

    /*********************************
     * MSRs.
     ********************************/
    if (no_mem) {
        measurement_template = (char*)&measurement_RDMSR_template_noMem;
    } else {
        measurement_template = (char*)&measurement_RDMSR_template;
    }
    for (size_t i=0; i 0, the programmable counters from 0 to n_used_counters-1 are read; otherwise, the fixed counters are read.
// pmi_counter: 0-2: fixed counters, 3-n: programmable counters
// pmi_counter_val: value that is written to pmi_counter before each measurement
static void run_experiment_with_freeze_on_PMI(int64_t* results[], int n_used_counters, int pmi_counter, uint64_t pmi_counter_val) {
    if (pmi_counter <= 2) {
        set_bit_in_msr(MSR_IA32_FIXED_CTR_CTRL, pmi_counter*4 + 3);
    } else {
        set_bit_in_msr(MSR_IA32_PERFEVTSEL0 + (pmi_counter - 3), 20);
    }

    for (long ri=-warm_up_count; ri 0) { for (int c=0; c=0; cycle--) {
        run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
        if (get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function) < last_applicable_instr) {
            cycle_last_retired = cycle+1;
            break;
        }
    }
    print_verbose("Last instruction of benchmark retired in cycle: %llu\n", cycle_last_retired);
    return cycle_last_retired;
}

// Returns the cycle with which the fixed cycle counter has to be programmed such that the programmable counters are frozen in the cycle in which the first
// instruction of the benchmark is added to the IDQ.
static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty) {
    uint64_t perfevtsel2 = (uint64_t)0x79 | ((uint64_t)0x04 << 8) | (1ULL << 22) | (1ULL << 17); // IDQ.MITE_UOPS
    write_msr(MSR_IA32_PERFEVTSEL0+2, perfevtsel2);

    uint64_t cycle_first_added_to_IDQ = 0;
    uint64_t prev_uops = 0;
    for (int64_t cycle=cycle_last_retired_empty-3; cycle>=0; cycle--) {
        run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
        uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function);
        if ((prev_uops != 0) && (prev_uops - uops > 1)) {
            cycle_first_added_to_IDQ = cycle + 1;
            break;
        }
        prev_uops = uops;
    }
    print_verbose("First instruction added to IDQ in cycle: %llu\n", cycle_first_added_to_IDQ);
    return cycle_first_added_to_IDQ;
}

// Programs the fixed cycle counter such that it overflows in the specified cycle, runs the benchmark,
// and stores the measurements of the programmable counters in results.
static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results, uint64_t* results_min, uint64_t* results_max) {
    // on several microarchitectures, the counters 0 or 1 do not freeze at the same time as the other counters
    int avoid_counters = 0;
    if (displ_model == 0x97) { // Alder Lake
        avoid_counters = (1 << 0);
    } else if ((Intel_perf_mon_ver >= 3) && (Intel_perf_mon_ver <= 4) && (displ_model >= 0x3A)) {
        avoid_counters = (1 << 1);
    }

    // the higher counters don't count some of the events properly (e.g., D1.01 on RKL)
    int n_used_counters = 4;

    size_t next_pfc_config = 0;
    while (next_pfc_config < n_pfc_configs) {
        size_t cur_pfc_config = next_pfc_config;
        char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
        next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, avoid_counters, pfc_descriptions);

        run_experiment_with_freeze_on_PMI(measurement_results, n_used_counters, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);

        for (size_t c=0; c

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
static const struct proc_ops proc_file_fops_nanoBench = {
    .proc_lseek = seq_lseek,
    .proc_open = open_nanoBench,
    .proc_read = seq_read,
    .proc_release = single_release,
};
static const struct proc_ops proc_file_fops_nanoBenchCycleByCycle = {
    .proc_lseek = seq_lseek,
    .proc_open = open_nanoBenchCycleByCycle,
    .proc_read = seq_read,
    .proc_release = single_release,
};
#else
static const struct file_operations proc_file_fops_nanoBench = {
    .llseek = seq_lseek,
    .open = open_nanoBench,
    .owner = THIS_MODULE,
    .read = seq_read,
    .release = single_release,
};
static const struct file_operations proc_file_fops_nanoBenchCycleByCycle = {
    .llseek = seq_lseek,
    .open = open_nanoBenchCycleByCycle,
    .owner = THIS_MODULE,
    .read = seq_read,
    .release = single_release,
};
#endif

static struct kobject* nb_kobject;

static int __init nb_init(void) {
    pr_info("Initializing nanoBench kernel module...\n");

    #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
    set_memory_x = (void*)kallsyms_lookup_name("set_memory_x");
    set_memory_nx = (void*)kallsyms_lookup_name("set_memory_nx");
    #endif
    #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
    kallsym__vmalloc_node_range = (void*)kallsyms_lookup_name("__vmalloc_node_range");
    #endif

    if (check_cpuid()) {
        return -1;
    }

    for (int i=0; i

    nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
    if (!nb_kobject) {
        pr_err("failed to create and add nb\n");
        return -1;
    }

    int error = sysfs_create_file(nb_kobject, &clear_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &reset_attribute.attr);
    error |= sysfs_create_file(nb_kobject,
        &code_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &code_init_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &code_late_init_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &fixed_counters_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &end_to_end_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &output_range_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &no_normalization_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &r14_size_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &print_r14_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &code_offset_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &addresses_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &verbose_attribute.attr);
    if (error) {
        pr_err("failed to create file in /sys/nb/\n");
        return error;
    }

    struct proc_dir_entry* proc_file_entry = proc_create("nanoBench", 0, NULL, &proc_file_fops_nanoBench);
    struct proc_dir_entry* proc_file_entry2 = proc_create("nanoBenchCycleByCycle", 0, NULL, &proc_file_fops_nanoBenchCycleByCycle);
    if (proc_file_entry == NULL || proc_file_entry2 == NULL) {
        pr_err("failed to create file in /proc/\n");
        return -1;
    }

    return 0;
}

static void __exit nb_exit(void) {
    kfree(code);
    kfree(code_init);
    kfree(code_late_init);
    kfree(code_one_time_init);
    kfree(pfc_config_file_content);
    kfree(msr_config_file_content);

    vfree(runtime_one_time_init_code);
    vfree(runtime_rbp - RUNTIME_R_SIZE/2);
    vfree(runtime_rdi - RUNTIME_R_SIZE/2);
    vfree(runtime_rsi - RUNTIME_R_SIZE/2);
    vfree(runtime_rsp - RUNTIME_R_SIZE/2);

    if (runtime_code_base) {
        set_memory_nx((unsigned long)runtime_code_base, runtime_code_base_memory_size/PAGE_SIZE);
        kfree(runtime_code_base);
    }

    if (n_r14_segments > 0) {
        for (int i=0; i
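// Illustrative user-space sketch (an assumption about typical usage, not part of the module itself): the module is driven by writing to the files it
// creates under /sys/nb/ and by reading /proc/nanoBench. Note that code_store() and config_store() above pass the written string to
// read_file_into_buffer(), i.e., they expect the *path* of a file containing the benchmark code resp. the counter configuration, not the contents
// themselves. The file names /tmp/bench.bin and /tmp/counters.cfg below are hypothetical, and the sysfs file names are assumed to match the
// attribute names registered in nb_init().
//
//     #include <stdio.h>
//     #include <stdlib.h>
//
//     // Writes the string s (without a trailing newline) to the pseudo-file at path.
//     static void write_str(const char* path, const char* s) {
//         FILE* f = fopen(path, "w");
//         if (!f) { perror(path); exit(1); }
//         fputs(s, f);
//         fclose(f);
//     }
//
//     int main(void) {
//         write_str("/sys/nb/loop_count", "100");
//         write_str("/sys/nb/unroll_count", "1000");
//         write_str("/sys/nb/code", "/tmp/bench.bin");       // file containing the benchmark's machine code
//         write_str("/sys/nb/config", "/tmp/counters.cfg");  // file containing the performance counter configuration
//
//         // Reading /proc/nanoBench triggers run_nanoBench() and returns its output.
//         FILE* f = fopen("/proc/nanoBench", "r");
//         if (!f) { perror("/proc/nanoBench"); return 1; }
//         char line[256];
//         while (fgets(line, sizeof(line), f)) {
//             fputs(line, stdout);
//         }
//         fclose(f);
//         return 0;
//     }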