// nanoBench
//
// Copyright (C) 2019 Andreas Abel
//
// This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
#include <linux/fs.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <../arch/x86/include/asm/fpu/api.h>
#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,12,0)
#include <asm/cacheflush.h>
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
#include <linux/kallsyms.h>
int (*set_memory_x)(unsigned long, int) = 0;
int (*set_memory_nx)(unsigned long, int) = 0;
#else
#include <asm/set_memory.h>
#endif
#include "../common/nanoBench.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andreas Abel");
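// Usage sketch (from user space): the module is controlled via the files in /sys/nb/ (e.g., the name of a file containing the
// benchmark's machine code is written to /sys/nb/code, and a performance counter configuration to /sys/nb/config); the
// measurement results are then obtained by reading /proc/nanoBench or /proc/nanoBenchCycleByCycle.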
// __vmalloc no longer has a pgprot_t parameter, so we need to call __vmalloc_node_range directly
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
void *(*kallsym__vmalloc_node_range)(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
#endif
// kallsyms_lookup_name is no longer exported; we use a kprobe to get its address
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
#include <linux/kprobes.h>
#include <linux/kallsyms.h>
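// Registering a kprobe on the symbol makes the kernel resolve its address (kp.addr); the probe is removed again immediately.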
unsigned long kallsyms_lookup_name(const char* name) {
struct kprobe kp = {
.symbol_name = name,
};
int ret = register_kprobe(&kp);
if (ret < 0) {
return 0;
}
unregister_kprobe(&kp);
return (unsigned long) kp.addr;
}
#endif
// 4 MB is the maximum that kmalloc supports on my machines
#define KMALLOC_MAX (4*1024*1024)
// If enabled, for cycle-by-cycle measurements, the output includes all of the measurement overhead; otherwise, only the cycles between adding the first
// instruction of the benchmark to the IDQ and retiring the last instruction of the benchmark are considered.
int end_to_end = false;
char* runtime_code_base = NULL;
size_t code_offset = 0;
size_t code_memory_size = 0;
size_t code_init_memory_size = 0;
size_t code_late_init_memory_size = 0;
size_t code_one_time_init_memory_size = 0;
size_t pfc_config_memory_size = 0;
size_t msr_config_memory_size = 0;
size_t runtime_code_base_memory_size = 0;
size_t runtime_one_time_init_code_memory_size = 0;
void** r14_segments = NULL;
size_t n_r14_segments = 0;
static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_len, size_t *buf_memory_size) {
struct file *filp = NULL;
filp = filp_open(file_name, O_RDONLY, 0);
if (IS_ERR(filp)) {
pr_err("Error opening file %s\n", file_name);
return -1;
}
struct path p;
struct kstat ks;
kern_path(file_name, 0, &p);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,11,0)
if (vfs_getattr(&p, &ks)) {
#else
if (vfs_getattr(&p, &ks, 0, 0)) {
#endif
pr_err("Error getting file attributes\n");
return -1;
}
size_t file_size = ks.size;
*buf_len = file_size;
if (file_size + 1 > *buf_memory_size) {
kfree(*buf);
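// grow the buffer to twice the required size (but at least one page) so that slightly larger files do not immediately trigger another reallocation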
*buf_memory_size = max(2*(file_size + 1), PAGE_SIZE);
*buf = kmalloc(*buf_memory_size, GFP_KERNEL);
if (!*buf) {
pr_err("Could not allocate memory for %s\n", file_name);
*buf_memory_size = 0;
filp_close(filp, NULL);
return -1;
}
}
loff_t pos = 0;
kernel_read(filp, *buf, file_size, &pos);
(*buf)[file_size] = '\0';
path_put(&p);
filp_close(filp, NULL);
return 0;
}
static ssize_t code_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
return 0;
}
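// The string written to this sysfs file is interpreted as the name of a file that contains the machine code of the benchmark;
// the file's contents are read into the corresponding buffer. The same applies to the init, late_init, one_time_init, config,
// and msr_config files below.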
static ssize_t code_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
read_file_into_buffer(buf, &code, &code_length, &code_memory_size);
return count;
}
static struct kobj_attribute code_attribute =__ATTR(code, 0660, code_show, code_store);
static ssize_t init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
return 0;
}
static ssize_t init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
read_file_into_buffer(buf, &code_init, &code_init_length, &code_init_memory_size);
return count;
}
static struct kobj_attribute code_init_attribute =__ATTR(init, 0660, init_show, init_store);
static ssize_t late_init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
return 0;
}
static ssize_t late_init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
read_file_into_buffer(buf, &code_late_init, &code_late_init_length, &code_late_init_memory_size);
return count;
}
static struct kobj_attribute code_late_init_attribute =__ATTR(late_init, 0660, late_init_show, late_init_store);
static ssize_t one_time_init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
return 0;
}
static ssize_t one_time_init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
read_file_into_buffer(buf, &code_one_time_init, &code_one_time_init_length, &code_one_time_init_memory_size);
size_t new_runtime_one_time_init_code_memory_size = 10000 + code_one_time_init_memory_size;
if (new_runtime_one_time_init_code_memory_size > runtime_one_time_init_code_memory_size) {
runtime_one_time_init_code_memory_size = new_runtime_one_time_init_code_memory_size;
vfree(runtime_one_time_init_code);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
runtime_one_time_init_code = kallsym__vmalloc_node_range(runtime_one_time_init_code_memory_size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0));
#else
runtime_one_time_init_code = __vmalloc(runtime_one_time_init_code_memory_size, GFP_KERNEL, PAGE_KERNEL_EXEC);
#endif
if (!runtime_one_time_init_code) {
runtime_one_time_init_code_memory_size = 0;
pr_err("failed to allocate executable memory\n");
}
}
return count;
}
static struct kobj_attribute code_one_time_init_attribute =__ATTR(one_time_init, 0660, one_time_init_show, one_time_init_store);
static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
ssize_t count = 0;
    for (int i=0; i<n_pfc_configs; i++) {
        count += sprintf(&buf[count], "%s\n", pfc_configs[i].description);
        if (count > PAGE_SIZE) {
            return PAGE_SIZE-1;
        }
    }
return count;
}
static ssize_t config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
size_t pfc_config_length;
read_file_into_buffer(buf, &pfc_config_file_content, &pfc_config_length, &pfc_config_memory_size);
parse_counter_configs();
return count;
}
static struct kobj_attribute config_attribute =__ATTR(config, 0660, config_show, config_store);
static ssize_t msr_config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
ssize_t count = 0;
    for (int i=0; i<n_msr_configs; i++) {
        count += sprintf(&buf[count], "%s\n", msr_configs[i].description);
        if (count > PAGE_SIZE) {
            return PAGE_SIZE-1;
        }
    }
    return count;
}
static ssize_t msr_config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    size_t msr_config_length;
    read_file_into_buffer(buf, &msr_config_file_content, &msr_config_length, &msr_config_memory_size);
    parse_msr_configs();
    return count;
}
static struct kobj_attribute msr_config_attribute =__ATTR(msr_config, 0660, msr_config_show, msr_config_store);
static bool check_memory_allocations(void) {
    size_t req_code_length = code_offset + code_init_length + code_late_init_length + 2*unroll_count*code_length + 10000;
    if (req_code_length > runtime_code_base_memory_size) {
pr_err("Maximum supported code size %zu kB; requested %zu kB\n", runtime_code_base_memory_size/1024, req_code_length/1024);
return false;
}
return true;
}
static int run_nanoBench(struct seq_file *m, void *v) {
if (!check_memory_allocations()) {
return -1;
}
kernel_fpu_begin();
disable_interrupts_preemption();
clear_perf_counter_configurations();
clear_perf_counters();
clear_overflow_status_bits();
enable_perf_ctrs_globally();
long base_unroll_count = (basic_mode?0:unroll_count);
long main_unroll_count = (basic_mode?unroll_count:2*unroll_count);
long base_loop_count = (basic_mode?0:loop_count);
long main_loop_count = loop_count;
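// The benchmark is run twice: once with the base unroll/loop count and once with the main (twice as large) unroll/loop count;
// the reported results are computed from the difference between the two runs, which allows constant measurement overhead to be
// cancelled out.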
char buf[100];
char* measurement_template;
create_and_run_one_time_init_code();
run_initial_warmup_experiment();
/*********************************
* Fixed-function counters.
********************************/
if (use_fixed_counters) {
if (is_AMD_CPU) {
if (no_mem) {
measurement_template = (char*)&measurement_FF_template_AMD_noMem;
} else {
measurement_template = (char*)&measurement_FF_template_AMD;
}
} else {
if (no_mem) {
measurement_template = (char*)&measurement_FF_template_Intel_noMem;
} else {
measurement_template = (char*)&measurement_FF_template_Intel;
}
}
if (is_AMD_CPU) {
run_experiment(measurement_template, measurement_results_base, 3, base_unroll_count, base_loop_count);
run_experiment(measurement_template, measurement_results, 3, main_unroll_count, main_loop_count);
if (verbose) {
pr_info("\nRDTSC, MPERF, and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
print_all_measurement_results(measurement_results_base, 3);
pr_info("RDTSC, MPERF, and and APERF results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
print_all_measurement_results(measurement_results, 3);
}
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
} else {
configure_perf_ctrs_FF_Intel(false, true);
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
if (verbose) {
pr_info("\nRDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
print_all_measurement_results(measurement_results_base, 4);
pr_info("RDTSC and fixed-function counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
print_all_measurement_results(measurement_results, 4);
}
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
}
}
/*********************************
* Programmable counters.
********************************/
int n_used_counters = n_programmable_counters;
if (is_AMD_CPU) {
if (no_mem) {
measurement_template = (char*)&measurement_template_AMD_noMem;
} else {
measurement_template = (char*)&measurement_template_AMD;
}
} else {
if (n_used_counters >= 4) {
n_used_counters = 4;
if (no_mem) {
measurement_template = (char*)&measurement_template_Intel_noMem_4;
} else {
measurement_template = (char*)&measurement_template_Intel_4;
}
} else {
n_used_counters = 2;
if (no_mem) {
measurement_template = (char*)&measurement_template_Intel_noMem_2;
} else {
measurement_template = (char*)&measurement_template_Intel_2;
}
}
}
size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) {
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, 0, pfc_descriptions);
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
run_experiment(measurement_template, measurement_results_base, n_used_counters, base_unroll_count, base_loop_count);
run_experiment(measurement_template, measurement_results, n_used_counters, main_unroll_count, main_loop_count);
if (verbose) {
pr_info("\nProgrammable counter results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
print_all_measurement_results(measurement_results_base, n_used_counters);
pr_info("Programmable counter results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
print_all_measurement_results(measurement_results, n_used_counters);
}
for (size_t c=0; c < n_used_counters; c++) {
if (pfc_descriptions[c]) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_descriptions[c], c));
}
}
/*********************************
* MSRs.
********************************/
if (no_mem) {
measurement_template = (char*)&measurement_RDMSR_template_noMem;
} else {
measurement_template = (char*)&measurement_RDMSR_template;
}
    for (size_t i=0; i<n_msr_configs; i++) {
        configure_MSRs(msr_configs[i]);
        run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count);
        run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count);
        seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), msr_configs[i].description, 0));
    }
    disable_perf_ctrs_globally();
    restore_interrupts_preemption();
    kernel_fpu_end();
    return 0;
}
static int open_nanoBench(struct inode *inode, struct file *file) {
    return single_open(file, run_nanoBench, NULL);
}
// Runs the benchmark n_measurements times with the performance counters configured such that they freeze when the counter selected by pmi_counter overflows.
// n_used_counters: if n_used_counters > 0, the programmable counters from 0 to n_used_counters-1 are read; otherwise, the fixed counters are read.
// pmi_counter: 0-2: fixed counters, 3-n: programmable counters
// pmi_counter_val: value that is written to pmi_counter before each measurement
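// The freezing relies on the "freeze on PMI" mechanism (the FREEZE_PERFMON_ON_PMI bit in IA32_DEBUGCTL, presumably set in the
// common nanoBench code): as soon as the selected counter overflows and raises a PMI, all performance counters stop counting.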
static void run_experiment_with_freeze_on_PMI(int64_t* results[], int n_used_counters, int pmi_counter, uint64_t pmi_counter_val) {
if (pmi_counter <= 2) {
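        // each fixed-function counter has a 4-bit control field in IA32_FIXED_CTR_CTRL; bit 3 of this field enables a PMI on overflow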
set_bit_in_msr(MSR_IA32_FIXED_CTR_CTRL, pmi_counter*4 + 3);
} else {
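        // bit 20 (INT) of IA32_PERFEVTSELx enables a PMI when the corresponding programmable counter overflows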
set_bit_in_msr(MSR_IA32_PERFEVTSEL0 + (pmi_counter - 3), 20);
}
    for (long ri=-warm_up_count; ri<n_measurements; ri++) {
        // write the start value to the counter that triggers the PMI; it overflows (and thereby freezes all counters) after
        // (maximum counter value - pmi_counter_val) further increments
        if (pmi_counter <= 2) {
            write_msr(0x309 + pmi_counter, pmi_counter_val); // IA32_FIXED_CTR0-2
        } else {
            write_msr(0xC1 + (pmi_counter - 3), pmi_counter_val); // IA32_PMC0-7
        }
        clear_overflow_status_bits();
        ((void(*)(void))runtime_code)();
        // read back the counters that were frozen by the PMI
        if (n_used_counters > 0) {
            for (int c=0; c<n_used_counters; c++) {
                results[c][max(0L, ri)] = native_read_pmc(c);
            }
        } else {
            for (int c=0; c<3; c++) {
                results[c][max(0L, ri)] = native_read_pmc((1 << 30) | c); // RDPMC with bit 30 set reads the fixed-function counters
            }
        }
    }
}
// Determines the cycle in which the last instruction of the benchmark is retired, i.e., the first cycle in which the number of
// retired instructions reaches last_applicable_instr.
static uint64_t get_cycle_last_retired(uint64_t start_cycle, uint64_t last_applicable_instr) {
    uint64_t cycle_last_retired = 0;
    for (int64_t cycle=start_cycle; cycle>=0; cycle--) {
run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
if (get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function) < last_applicable_instr) {
cycle_last_retired = cycle+1;
break;
}
}
print_verbose("Last instruction of benchmark retired in cycle: %llu\n", cycle_last_retired);
return cycle_last_retired;
}
// Returns the cycle with which the fixed cycle counter has to be programmed such that the programmable counters are frozen in the cycle in which the first
// instruction of the benchmark is added to the IDQ.
static uint64_t get_cycle_first_added_to_IDQ(uint64_t cycle_last_retired_empty) {
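    // program programmable counter 2 with event 0x79, umask 0x04 (IDQ.MITE_UOPS); bit 22 enables the counter, bit 17 restricts counting to kernel mode (CPL 0)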
uint64_t perfevtsel2 = (uint64_t)0x79 | ((uint64_t)0x04 << 8) | (1ULL << 22) | (1ULL << 17); // IDQ.MITE_UOPS
write_msr(MSR_IA32_PERFEVTSEL0+2, perfevtsel2);
uint64_t cycle_first_added_to_IDQ = 0;
uint64_t prev_uops = 0;
for (int64_t cycle=cycle_last_retired_empty-3; cycle>=0; cycle--) {
run_experiment_with_freeze_on_PMI(measurement_results, 3, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
uint64_t uops = get_aggregate_value(measurement_results[2], n_measurements, 1, aggregate_function);
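        // a drop of more than one uop between two consecutive freeze cycles means that several uops were added to the IDQ in cycle+1;
        // this is taken to be the cycle in which the benchmark's first instruction entered the IDQ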
if ((prev_uops != 0) && (prev_uops - uops > 1)) {
cycle_first_added_to_IDQ = cycle + 1;
break;
}
prev_uops = uops;
}
print_verbose("First instruction added to IDQ in cycle: %llu\n", cycle_first_added_to_IDQ);
return cycle_first_added_to_IDQ;
}
// Programs the fixed cycle counter such that it overflows in the specified cycle, runs the benchmark,
// and stores the measurements of the programmable counters in results.
static void perform_measurements_for_cycle(uint64_t cycle, uint64_t* results, uint64_t* results_min, uint64_t* results_max) {
// on several microarchitectures, the counters 0 or 1 do not freeze at the same time as the other counters
int avoid_counters = 0;
if (displ_model == 0x97) { // Alder Lake
avoid_counters = (1 << 0);
} else if ((Intel_perf_mon_ver >= 3) && (Intel_perf_mon_ver <= 4) && (displ_model >= 0x3A)) {
avoid_counters = (1 << 1);
}
// the higher counters don't count some of the events properly (e.g., D1.01 on RKL)
int n_used_counters = 4;
size_t next_pfc_config = 0;
while (next_pfc_config < n_pfc_configs) {
size_t cur_pfc_config = next_pfc_config;
char* pfc_descriptions[MAX_PROGRAMMABLE_COUNTERS] = {0};
next_pfc_config = configure_perf_ctrs_programmable(next_pfc_config, true, true, n_used_counters, avoid_counters, pfc_descriptions);
run_experiment_with_freeze_on_PMI(measurement_results, n_used_counters, FIXED_CTR_CORE_CYCLES, get_max_FF_ctr_value() - cycle);
        for (size_t c=0; c<n_used_counters; c++) {
            if (pfc_descriptions[c]) {
                int64_t mn = measurement_results[c][0];
                int64_t mx = measurement_results[c][0];
                for (long r=1; r<n_measurements; r++) {
                    mn = min(mn, measurement_results[c][r]);
                    mx = max(mx, measurement_results[c][r]);
                }
                results[cur_pfc_config+c] = get_aggregate_value(measurement_results[c], n_measurements, 1, aggregate_function);
                results_min[cur_pfc_config+c] = mn;
                results_max[cur_pfc_config+c] = mx;
            }
        }
    }
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
static const struct proc_ops proc_file_fops_nanoBench = {
.proc_lseek = seq_lseek,
.proc_open = open_nanoBench,
.proc_read = seq_read,
.proc_release = single_release,
};
static const struct proc_ops proc_file_fops_nanoBenchCycleByCycle = {
.proc_lseek = seq_lseek,
.proc_open = open_nanoBenchCycleByCycle,
.proc_read = seq_read,
.proc_release = single_release,
};
#else
static const struct file_operations proc_file_fops_nanoBench = {
.llseek = seq_lseek,
.open = open_nanoBench,
.owner = THIS_MODULE,
.read = seq_read,
.release = single_release,
};
static const struct file_operations proc_file_fops_nanoBenchCycleByCycle = {
.llseek = seq_lseek,
.open = open_nanoBenchCycleByCycle,
.owner = THIS_MODULE,
.read = seq_read,
.release = single_release,
};
#endif
static struct kobject* nb_kobject;
static int __init nb_init(void) {
pr_info("Initializing nanoBench kernel module...\n");
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
set_memory_x = (void*)kallsyms_lookup_name("set_memory_x");
set_memory_nx = (void*)kallsyms_lookup_name("set_memory_nx");
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
kallsym__vmalloc_node_range = (void*)kallsyms_lookup_name("__vmalloc_node_range");
#endif
if (check_cpuid()) {
return -1;
}
    // allocate the buffers that hold the per-run measurement results
    for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
        measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
        measurement_results_base[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
        if (!measurement_results[i] || !measurement_results_base[i]) {
            pr_err("Could not allocate memory for the measurement results\n");
            return -1;
        }
    }
    nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
if (!nb_kobject) {
pr_err("failed to create and add nb\n");
return -1;
}
int error = sysfs_create_file(nb_kobject, &clear_attribute.attr);
error |= sysfs_create_file(nb_kobject, &reset_attribute.attr);
error |= sysfs_create_file(nb_kobject, &code_attribute.attr);
error |= sysfs_create_file(nb_kobject, &code_init_attribute.attr);
error |= sysfs_create_file(nb_kobject, &code_late_init_attribute.attr);
error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr);
error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr);
error |= sysfs_create_file(nb_kobject, &fixed_counters_attribute.attr);
error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
error |= sysfs_create_file(nb_kobject, &end_to_end_attribute.attr);
error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
error |= sysfs_create_file(nb_kobject, &output_range_attribute.attr);
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
error |= sysfs_create_file(nb_kobject, &no_normalization_attribute.attr);
error |= sysfs_create_file(nb_kobject, &r14_size_attribute.attr);
error |= sysfs_create_file(nb_kobject, &print_r14_attribute.attr);
error |= sysfs_create_file(nb_kobject, &code_offset_attribute.attr);
error |= sysfs_create_file(nb_kobject, &addresses_attribute.attr);
error |= sysfs_create_file(nb_kobject, &verbose_attribute.attr);
if (error) {
pr_err("failed to create file in /sys/nb/\n");
return error;
}
struct proc_dir_entry* proc_file_entry = proc_create("nanoBench", 0, NULL, &proc_file_fops_nanoBench);
struct proc_dir_entry* proc_file_entry2 = proc_create("nanoBenchCycleByCycle", 0, NULL, &proc_file_fops_nanoBenchCycleByCycle);
if(proc_file_entry == NULL || proc_file_entry2 == NULL) {
pr_err("failed to create file in /proc/\n");
return -1;
}
return 0;
}
static void __exit nb_exit(void) {
kfree(code);
kfree(code_init);
kfree(code_late_init);
kfree(code_one_time_init);
kfree(pfc_config_file_content);
kfree(msr_config_file_content);
vfree(runtime_one_time_init_code);
vfree(runtime_rbp - RUNTIME_R_SIZE/2);
vfree(runtime_rdi - RUNTIME_R_SIZE/2);
vfree(runtime_rsi - RUNTIME_R_SIZE/2);
vfree(runtime_rsp - RUNTIME_R_SIZE/2);
if (runtime_code_base) {
set_memory_nx((unsigned long)runtime_code_base, runtime_code_base_memory_size/PAGE_SIZE);
kfree(runtime_code_base);
}
if (n_r14_segments > 0) {
        for (int i=0; i<n_r14_segments; i++) {
            kfree(r14_segments[i]);
        }
        kfree(r14_segments);
    }
    kobject_put(nb_kobject);
    remove_proc_entry("nanoBench", NULL);
    remove_proc_entry("nanoBenchCycleByCycle", NULL);
}
module_init(nb_init);
module_exit(nb_exit);