mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-15 19:10:08 +01:00
various improvements
This commit is contained in:
@@ -60,7 +60,7 @@ It will produce an output similar to the following.
|
||||
|
||||
The tool will *unroll* the assembler code multiple times, i.e., it will create multiple copies of it. The results are averages per copy of the assembler code for multiple runs of the entire generated code sequence.
|
||||
|
||||
The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `config` folder. When using the kernel-module, the config file must not be larger than 4 kB.
|
||||
The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `config` folder.
|
||||
|
||||
The assembler code sequence may use and modify any general-purpose or vector registers (unless the `-loop` or `-no_mem` options are used), including the stack pointer. There is no need to restore the registers to their original values at the end.
|
||||
|
||||
|
||||
@@ -29,23 +29,30 @@ size_t code_length = 0;
|
||||
char* code_init = NULL;
|
||||
size_t code_init_length = 0;
|
||||
|
||||
char* code_one_time_init = NULL;
|
||||
size_t code_one_time_init_length = 0;
|
||||
|
||||
struct pfc_config pfc_configs[1000] = {{0}};
|
||||
size_t n_pfc_configs = 0;
|
||||
|
||||
char* pfc_config_file_content = NULL;
|
||||
|
||||
struct msr_config msr_configs[1000] = {{0}};
|
||||
size_t n_msr_configs = 0;
|
||||
char* msr_config_file_content = NULL;
|
||||
unsigned long cur_rdmsr = 0;
|
||||
|
||||
int is_Intel_CPU = 0;
|
||||
int is_AMD_CPU = 0;
|
||||
|
||||
int n_programmable_counters;
|
||||
|
||||
char* runtime_code;
|
||||
char* runtime_one_time_init_code;
|
||||
void* runtime_r14;
|
||||
void* runtime_rbp;
|
||||
void* runtime_rdi;
|
||||
void* runtime_rsi;
|
||||
void* runtime_rsp;
|
||||
void* huge_pages = NULL;
|
||||
int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||||
void* RSP_mem;
|
||||
|
||||
@@ -199,6 +206,53 @@ void parse_counter_configs() {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
void parse_msr_configs() {
|
||||
n_msr_configs = 0;
|
||||
|
||||
char* line;
|
||||
char* next_line = msr_config_file_content;
|
||||
while ((line = strsep(&next_line, "\n")) != NULL) {
|
||||
if (strlen(line) == 0 || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
char* wrmsr_str = strsep(&line, " \t");
|
||||
|
||||
char* rdmsr_str = strsep(&line, " \t");
|
||||
strreplace(rdmsr_str, 'h', '\0'); strreplace(rdmsr_str, 'H', '\0');
|
||||
|
||||
if (line && strlen(line) > 0) {
|
||||
msr_configs[n_msr_configs].description = line;
|
||||
} else {
|
||||
msr_configs[n_msr_configs].description = rdmsr_str;
|
||||
}
|
||||
|
||||
nb_strtoul(rdmsr_str+4, 16, &(msr_configs[n_msr_configs].rdmsr));
|
||||
|
||||
size_t n_wrmsr = 0;
|
||||
char* tok = wrmsr_str;
|
||||
char* ce;
|
||||
while ((ce = strsep(&tok, ".")) != NULL) {
|
||||
if (n_wrmsr >= 10) {
|
||||
print_error("Error: n_wrmsr >= 10");
|
||||
break;
|
||||
}
|
||||
|
||||
char* msr_str = strsep(&ce, "=")+4;
|
||||
pr_debug("msr_str: %s", msr_str);
|
||||
strreplace(msr_str, 'h', '\0'); strreplace(msr_str, 'H', '\0');
|
||||
nb_strtoul(msr_str, 16, &(msr_configs[n_msr_configs].wrmsr[n_wrmsr]));
|
||||
strreplace(ce, 'h', '\0'); strreplace(ce, 'H', '\0');
|
||||
nb_strtoul(ce, 0, &(msr_configs[n_msr_configs].wrmsr_val[n_wrmsr]));
|
||||
n_wrmsr++;
|
||||
}
|
||||
msr_configs[n_msr_configs].n_wrmsr = n_wrmsr;
|
||||
n_msr_configs++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL__
|
||||
uint64_t read_value_from_cmd(char* cmd) {
|
||||
FILE* fp;
|
||||
@@ -340,6 +394,13 @@ void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsi
|
||||
}
|
||||
}
|
||||
|
||||
void configure_MSRs(struct msr_config config) {
|
||||
for (size_t i=0; i<config.n_wrmsr; i++) {
|
||||
write_msr(config.wrmsr[i], config.wrmsr_val[i]);
|
||||
}
|
||||
cur_rdmsr = config.rdmsr;
|
||||
}
|
||||
|
||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count) {
|
||||
int templateI = 0;
|
||||
int rci = 0;
|
||||
@@ -377,39 +438,30 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
}
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) {
|
||||
*(void**)(&runtime_code[rci]) = pfc_mem;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_MSR)) {
|
||||
*(void**)(&runtime_code[rci]) = (void*)cur_rdmsr;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RSP_ADDRESS)) {
|
||||
*(void**)(&runtime_code[rci]) = &RSP_mem;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
|
||||
if (huge_pages) {
|
||||
*(void**)(&runtime_code[rci]) = huge_pages;
|
||||
} else {
|
||||
*(void**)(&runtime_code[rci]) = runtime_r14 + RUNTIME_R_SIZE/2;
|
||||
}
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
*(void**)(&runtime_code[rci]) = runtime_r14;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rbp + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
*(void**)(&runtime_code[rci]) = runtime_rbp;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rdi + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
*(void**)(&runtime_code[rci]) = runtime_rdi;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsi + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsi;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsp + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsp;
|
||||
templateI += 8; rci += 8;
|
||||
} else {
|
||||
runtime_code[rci++] = measurement_template[templateI];
|
||||
templateI++;
|
||||
runtime_code[rci++] = measurement_template[templateI++];
|
||||
}
|
||||
}
|
||||
templateI += 8;
|
||||
@@ -418,6 +470,48 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
} while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret
|
||||
}
|
||||
|
||||
void create_and_run_one_time_init_code() {
|
||||
if (code_one_time_init_length == 0) return;
|
||||
|
||||
char* template = (char*)&one_time_init_template;
|
||||
int templateI = 0;
|
||||
int rci = 0;
|
||||
|
||||
while (!starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_TEMPLATE_END)) {
|
||||
if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_INIT)) {
|
||||
templateI += 8;
|
||||
memcpy(&runtime_one_time_init_code[rci], code_one_time_init, code_one_time_init_length);
|
||||
rci += code_one_time_init_length;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RSP_ADDRESS)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = &RSP_mem;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = runtime_r14;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = runtime_rbp;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = runtime_rdi;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = runtime_rsi;
|
||||
templateI += 8; rci += 8;
|
||||
} else if (starts_with_magic_bytes(&template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
|
||||
*(void**)(&runtime_one_time_init_code[rci]) = runtime_rsp;
|
||||
templateI += 8; rci += 8;
|
||||
} else {
|
||||
runtime_one_time_init_code[rci++] = template[templateI++];
|
||||
}
|
||||
}
|
||||
templateI += 8;
|
||||
do {
|
||||
runtime_one_time_init_code[rci++] = template[templateI++];
|
||||
} while (template[templateI-1] != '\xC3'); // 0xC3 = ret
|
||||
|
||||
((void(*)(void))runtime_one_time_init_code)();
|
||||
}
|
||||
|
||||
void run_warmup_experiment(char* measurement_template) {
|
||||
if (!initial_warm_up_count) return;
|
||||
|
||||
@@ -1112,3 +1206,79 @@ void measurement_RDTSC_template_noMem() {
|
||||
RESTORE_REGS_FLAGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
|
||||
// Measurement template that reads an MSR before and after the code to be benchmarked.
// The template is not meant to be called directly; its machine code is copied and the
// .quad/immediate placeholders are substituted (create_runtime_code replaces
// MAGIC_BYTES_PFC with the address of pfc_mem and MAGIC_BYTES_MSR with cur_rdmsr).
// After the generated code has run, the first 64-bit slot behind MAGIC_BYTES_PFC holds
// (MSR value after) - (MSR value before).
void measurement_RDMSR_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
// Placeholder for the initialization code (presumably filled in by create_runtime_code).
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
// First reading: store the negated MSR value in [r15]. All registers and the flags used
// by this sequence are saved beforehand and restored afterwards.
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
"push rax \n"
|
||||
// lahf/seto capture the flags in AH/AL; they are pushed with the second "push rax".
"lahf \n"
|
||||
"seto al \n"
|
||||
"push rax \n"
|
||||
"push rcx \n"
|
||||
"push rdx \n"
|
||||
"push r15 \n"
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
|
||||
"mov qword ptr [r15], 0 \n"
|
||||
// rdmsr reads the MSR selected by ECX into EDX:EAX; the two halves are combined in rdx.
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
|
||||
"lfence; rdmsr; lfence \n"
|
||||
"shl rdx, 32; or rdx, rax \n"
|
||||
"sub [r15], rdx \n"
|
||||
"lfence \n"
|
||||
"pop r15; lfence \n"
|
||||
"pop rdx; lfence \n"
|
||||
"pop rcx; lfence \n"
|
||||
"pop rax; lfence \n"
|
||||
// "cmp al, -127" overflows exactly if al is 1, which recreates the saved overflow flag;
// sahf then restores the flags that were saved in AH.
"cmp al, -127; lfence \n"
|
||||
"sahf; lfence \n"
|
||||
"pop rax; \n"
|
||||
"lfence \n"
|
||||
".att_syntax noprefix ");
|
||||
// Placeholder for the code to be benchmarked (presumably filled in by create_runtime_code).
asm(".quad "STRINGIFY(MAGIC_BYTES_CODE));
|
||||
// Second reading: add the MSR value to [r15]. rax, rcx, rdx and r15 are overwritten here.
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
"lfence \n"
|
||||
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
|
||||
"lfence; rdmsr; lfence \n"
|
||||
"shl rdx, 32; or rdx, rax \n"
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
|
||||
"add [r15], rdx \n"
|
||||
".att_syntax noprefix ");
|
||||
RESTORE_REGS_FLAGS();
|
||||
// Marks the end of the part of the template that is scanned for placeholders.
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
|
||||
// Variant of the RDMSR measurement template that keeps the intermediate result in r8
// instead of in memory; memory (the slot behind MAGIC_BYTES_PFC) is written only once,
// after the second reading. The sequence before the benchmarked code overwrites rax, rcx,
// rdx, r8 and the flags without saving them.
// NOTE(review): the result is only meaningful if the benchmarked code leaves r8 intact - confirm
// that this is what the -no_mem option documents.
void measurement_RDMSR_template_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
// Placeholder for the initialization code (presumably filled in by create_runtime_code).
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
// First reading: r8 = -(MSR value). rdmsr reads the MSR selected by ECX into EDX:EAX.
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
"mov r8, 0 \n"
|
||||
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
|
||||
"lfence; rdmsr; lfence \n"
|
||||
"shl rdx, 32; or rdx, rax \n"
|
||||
"sub r8, rdx \n"
|
||||
"lfence \n"
|
||||
".att_syntax noprefix ");
|
||||
// Placeholder for the code to be benchmarked (presumably filled in by create_runtime_code).
asm(".quad "STRINGIFY(MAGIC_BYTES_CODE));
|
||||
// Second reading: r8 += MSR value; the difference is then stored at the address that
// replaces MAGIC_BYTES_PFC.
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
"lfence \n"
|
||||
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
|
||||
"lfence; rdmsr; lfence \n"
|
||||
"shl rdx, 32; or rdx, rax \n"
|
||||
"add r8, rdx \n"
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
|
||||
"mov [r15], r8 \n"
|
||||
".att_syntax noprefix ");
|
||||
RESTORE_REGS_FLAGS();
|
||||
// Marks the end of the part of the template that is scanned for placeholders.
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
|
||||
// Template for the one-time initialization routine. It is not meant to be called directly:
// create_and_run_one_time_init_code copies its machine code, replaces MAGIC_BYTES_INIT with
// the user-supplied one-time init code, and executes the copy.
void one_time_init_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
// Placeholder that is replaced by the contents of code_one_time_init.
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
RESTORE_REGS_FLAGS();
|
||||
// End marker; the bytes behind it are copied up to and including the first ret (0xC3).
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
|
||||
}
|
||||
@@ -120,6 +120,9 @@ extern size_t code_length;
|
||||
extern char* code_init;
|
||||
extern size_t code_init_length;
|
||||
|
||||
extern char* code_one_time_init;
|
||||
extern size_t code_one_time_init_length;
|
||||
|
||||
struct pfc_config {
|
||||
unsigned long evt_num;
|
||||
unsigned long umask;
|
||||
@@ -134,33 +137,41 @@ struct pfc_config {
|
||||
unsigned int invalid;
|
||||
char* description;
|
||||
};
|
||||
|
||||
extern struct pfc_config pfc_configs[];
|
||||
extern size_t n_pfc_configs;
|
||||
|
||||
extern char* pfc_config_file_content;
|
||||
|
||||
// One entry of the MSR configuration file: a list of MSR writes that are performed before
// the measurement (see configure_MSRs), and the MSR that is read during the measurement.
struct msr_config {
|
||||
// Number of the MSR to read; configure_MSRs copies it to cur_rdmsr.
unsigned long rdmsr;
|
||||
// Numbers of the MSRs to write; only the first n_wrmsr entries are valid.
unsigned long wrmsr[10];
|
||||
// wrmsr_val[i] is the value that is written to the MSR wrmsr[i].
unsigned long wrmsr_val[10];
|
||||
// Number of valid entries in wrmsr and wrmsr_val.
size_t n_wrmsr;
|
||||
// Description of the entry; parse_msr_configs sets it to a pointer into msr_config_file_content.
char* description;
|
||||
};
|
||||
extern struct msr_config msr_configs[];
|
||||
extern size_t n_msr_configs;
|
||||
extern char* msr_config_file_content;
|
||||
|
||||
extern int is_Intel_CPU;
|
||||
extern int is_AMD_CPU;
|
||||
|
||||
#define MAX_PROGRAMMABLE_COUNTERS 6
|
||||
extern int n_programmable_counters;
|
||||
|
||||
// Pointer to a memory region that is writable and executable.
|
||||
// Pointers to memory regions that are writable and executable.
|
||||
extern char* runtime_code;
|
||||
extern char* runtime_one_time_init_code;
|
||||
|
||||
#define RUNTIME_R_SIZE (1024*1024)
|
||||
|
||||
// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2.
|
||||
// If r14_size is set in the kernel module, R14 will not have this offset.
|
||||
extern void* runtime_r14;
|
||||
extern void* runtime_rbp;
|
||||
extern void* runtime_rdi;
|
||||
extern void* runtime_rsi;
|
||||
extern void* runtime_rsp;
|
||||
|
||||
// If non-null, R14 will contain this address instead of runtime_r14.
|
||||
extern void* huge_pages;
|
||||
|
||||
// Stores performance counter values during measurements.
|
||||
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||||
|
||||
@@ -178,6 +189,7 @@ extern int cpu;
|
||||
int check_cpuid(void);
|
||||
|
||||
void parse_counter_configs(void);
|
||||
void parse_msr_configs(void);
|
||||
|
||||
uint64_t read_value_from_cmd(char* cmd);
|
||||
|
||||
@@ -191,10 +203,12 @@ void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
|
||||
// start and end are indices into the pfc_configs array.
|
||||
void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsigned int os);
|
||||
|
||||
void configure_MSRs(struct msr_config config);
|
||||
|
||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
||||
void run_warmup_experiment(char* measurement_template);
|
||||
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
||||
void create_and_run_one_time_init_code(void);
|
||||
|
||||
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
|
||||
int64_t get_aggregate_value_100(int64_t* values, size_t length);
|
||||
@@ -213,7 +227,8 @@ void print_all_measurement_results(int64_t* results[], int n_counters);
|
||||
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0xA0b513b1C2813F04
|
||||
#define MAGIC_BYTES_MSR 0xA0b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0xB0b513b1C2813F04
|
||||
|
||||
#define STRINGIFY2(X) #X
|
||||
#define STRINGIFY(X) STRINGIFY2(X)
|
||||
@@ -231,6 +246,9 @@ void measurement_FF_template_AMD(void);
|
||||
void measurement_FF_template_AMD_noMem(void);
|
||||
void measurement_RDTSC_template(void);
|
||||
void measurement_RDTSC_template_noMem(void);
|
||||
void measurement_RDMSR_template(void);
|
||||
void measurement_RDMSR_template_noMem(void);
|
||||
void one_time_init_template(void);
|
||||
|
||||
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
||||
#define SAVE_REGS_FLAGS() \
|
||||
|
||||
@@ -17,31 +17,47 @@ cat /sys/nb/reset
|
||||
taskset=""
|
||||
|
||||
while [ "$1" ]; do
|
||||
if [[ "$1" == -asm_init ]]; then
|
||||
if [[ "$1" == -asm_i* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-init.s
|
||||
echo "$2" >> asm-init.s
|
||||
as asm-init.s -o asm-init.o
|
||||
objcopy asm-init.o -O binary /sys/nb/init
|
||||
objcopy asm-init.o -O binary asm-init.o
|
||||
echo -n "asm-init.o" > /sys/nb/init
|
||||
rm -f asm-init.s asm-init.o
|
||||
shift 2
|
||||
elif [[ "$1" == -asm ]]; then
|
||||
elif [[ "$1" == -asm_o* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-one-time-init.s
|
||||
echo "$2" >> asm-one-time-init.s
|
||||
as asm-one-time-init.s -o asm-one-time-init.o
|
||||
objcopy asm-one-time-init.o -O binary asm-one-time-init.o
|
||||
echo -n "asm-one-time-init.o" > /sys/nb/one_time_init
|
||||
rm -f asm-one-time-init.s asm-one-time-init.o
|
||||
shift 2
|
||||
elif [[ "$1" == -as* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-code.s
|
||||
echo "$2" >> asm-code.s
|
||||
as asm-code.s -o asm-code.o
|
||||
objcopy asm-code.o -O binary /sys/nb/code
|
||||
objcopy asm-code.o -O binary asm-code.o
|
||||
echo -n "asm-code.o" > /sys/nb/code
|
||||
rm -f asm-code.s asm-code.o
|
||||
shift 2
|
||||
elif [[ "$1" == -code_init ]]; then
|
||||
cp "$2" /sys/nb/init
|
||||
elif [[ "$1" == -code_i* ]]; then
|
||||
echo -n "$2" > /sys/nb/init
|
||||
shift 2
|
||||
elif [[ "$1" == -code ]]; then
|
||||
cp "$2" /sys/nb/code
|
||||
elif [[ "$1" == -code_o* ]]; then
|
||||
echo -n "$2" > /sys/nb/one_time_init
|
||||
shift 2
|
||||
elif [[ "$1" == -cod* ]]; then
|
||||
echo -n "$2" > /sys/nb/code
|
||||
shift 2
|
||||
elif [[ "$1" == -cpu ]]; then
|
||||
taskset="taskset -c $2"
|
||||
shift 2
|
||||
elif [[ "$1" == -config ]]; then
|
||||
cp "$2" /sys/nb/config
|
||||
elif [[ "$1" == -con* ]]; then
|
||||
echo -n "$2" > /sys/nb/config
|
||||
shift 2
|
||||
elif [[ "$1" == -msr* ]]; then
|
||||
echo -n "$2" > /sys/nb/msr_config
|
||||
shift 2
|
||||
elif [[ "$1" == -u* ]]; then
|
||||
echo "$2" > /sys/nb/unroll_count
|
||||
@@ -103,4 +119,4 @@ while [ "$1" ]; do
|
||||
fi
|
||||
done
|
||||
|
||||
$taskset cat /sys/nb/run
|
||||
$taskset cat /proc/nanoBench
|
||||
|
||||
355
kernel/nb_km.c
355
kernel/nb_km.c
@@ -12,7 +12,10 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <../arch/x86/include/asm/fpu/api.h>
|
||||
|
||||
#include "../common/nanoBench.h"
|
||||
@@ -20,30 +23,106 @@
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Andreas Abel");
|
||||
|
||||
struct page** pages = NULL;
|
||||
unsigned long npages = 0;
|
||||
size_t code_memory_size = 0;
|
||||
size_t code_init_memory_size = 0;
|
||||
size_t code_one_time_init_memory_size = 0;
|
||||
size_t pfc_config_memory_size = 0;
|
||||
size_t msr_config_memory_size = 0;
|
||||
size_t runtime_code_memory_size = 0;
|
||||
size_t runtime_one_time_init_code_memory_size = 0;
|
||||
|
||||
void** r14_segments = NULL;
|
||||
size_t n_r14_segments = 0;
|
||||
|
||||
// Reads the complete file file_name into *buf and appends a terminating '\0'.
//
// *buf is a kmalloc'ed buffer of *buf_memory_size bytes (or NULL with size 0). If it is too
// small for the file plus the terminator, it is freed and replaced by a new buffer of at
// least PAGE_SIZE bytes. On success, *buf_len is set to the number of bytes read.
//
// Returns 0 on success and -1 on failure. If the file cannot be opened or inspected, the
// previous contents of *buf and *buf_len are left untouched; if the allocation or the read
// fails, *buf_len is set to 0 so that no stale length is used with the new buffer.
static int read_file_into_buffer(const char *file_name, char **buf, size_t *buf_len, size_t *buf_memory_size) {
    int ret = -1;
    struct kstat ks;

    // filp_open() reports errors via an ERR_PTR value; it never returns NULL.
    struct file *filp = filp_open(file_name, O_RDONLY, 0);
    if (IS_ERR(filp)) {
        pr_debug("Error opening file %s\n", file_name);
        return -1;
    }

    // Use the path of the file that was just opened. A second lookup by name would need to
    // be checked and released separately, and could resolve to a different object.
    if (vfs_getattr(&filp->f_path, &ks, 0, 0)) {
        pr_debug("Error getting file attributes\n");
        goto out_close;
    }

    size_t file_size = ks.size;

    if (file_size + 1 > *buf_memory_size) {
        kfree(*buf);
        *buf_memory_size = max(file_size + 1, PAGE_SIZE);
        *buf = kmalloc(*buf_memory_size, GFP_KERNEL);
        if (!*buf) {
            printk(KERN_ERR "Could not allocate memory for %s\n", file_name);
            *buf_memory_size = 0;
            *buf_len = 0;
            goto out_close;
        }
    }

    loff_t pos = 0;
    ssize_t n_read = kernel_read(filp, *buf, file_size, &pos);
    if (n_read < 0) {
        pr_debug("Error reading file %s\n", file_name);
        *buf_len = 0;
        (*buf)[0] = '\0';
        goto out_close;
    }

    // The read may return fewer bytes than the reported size; only expose what was read.
    *buf_len = n_read;
    (*buf)[n_read] = '\0';
    ret = 0;

out_close:
    filp_close(filp, NULL);
    return ret;
}
|
||||
|
||||
static void extend_runtime_code(void) {
|
||||
size_t new_runtime_code_memory_size = 10000 + code_init_memory_size + 2*(unroll_count)*code_memory_size;
|
||||
if (new_runtime_code_memory_size > runtime_code_memory_size) {
|
||||
runtime_code_memory_size = new_runtime_code_memory_size;
|
||||
vfree(runtime_code);
|
||||
runtime_code = __vmalloc(runtime_code_memory_size, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_code) {
|
||||
runtime_code_memory_size = 0;
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sysfs read handler for the "code" attribute. Always produces empty output (returns 0);
// the attribute is only used for writing, see code_store.
static ssize_t code_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return 0;
|
||||
}
|
||||
// sysfs write handler for the "code" attribute. buf contains the name of a file with the
// machine code to be benchmarked; the file is read into code/code_length, and the runtime
// code buffer is enlarged if necessary.
// Returns count on success, and -EINVAL if the file could not be read, so that the writing
// process learns about the failure instead of silently benchmarking stale code.
static ssize_t code_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    if (read_file_into_buffer(buf, &code, &code_length, &code_memory_size)) {
        return -EINVAL;
    }
    extend_runtime_code();
    return count;
}
|
||||
static struct kobj_attribute code_attribute =__ATTR(code, 0660, code_show, code_store);
|
||||
|
||||
static ssize_t init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
memcpy(buf, code_init, code_init_length);
|
||||
return code_init_length;
|
||||
return 0;
|
||||
}
|
||||
static ssize_t init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(code_init, buf, count);
|
||||
code_init_length = count;
|
||||
read_file_into_buffer(buf, &code_init, &code_init_length, &code_init_memory_size);
|
||||
extend_runtime_code();
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute code_init_attribute =__ATTR(init, 0660, init_show, init_store);
|
||||
|
||||
static ssize_t code_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
memcpy(buf, code, code_length);
|
||||
return code_length;
|
||||
static ssize_t one_time_init_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return 0;
|
||||
}
|
||||
static ssize_t code_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(code, buf, count);
|
||||
code_length = count;
|
||||
static ssize_t one_time_init_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
read_file_into_buffer(buf, &code_one_time_init, &code_one_time_init_length, &code_one_time_init_memory_size);
|
||||
size_t new_runtime_one_time_init_code_memory_size = 10000 + code_one_time_init_memory_size;
|
||||
if (new_runtime_one_time_init_code_memory_size > runtime_one_time_init_code_memory_size) {
|
||||
runtime_one_time_init_code_memory_size = new_runtime_one_time_init_code_memory_size;
|
||||
vfree(runtime_one_time_init_code);
|
||||
runtime_one_time_init_code = __vmalloc(runtime_one_time_init_code_memory_size, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_one_time_init_code) {
|
||||
runtime_one_time_init_code_memory_size = 0;
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute code_attribute =__ATTR(code, 0660, code_show, code_store);
|
||||
static struct kobj_attribute code_one_time_init_attribute =__ATTR(one_time_init, 0660, one_time_init_show, one_time_init_store);
|
||||
|
||||
static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
ssize_t count = 0;
|
||||
@@ -57,27 +136,39 @@ static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, ch
|
||||
return count;
|
||||
}
|
||||
static ssize_t config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
memcpy(pfc_config_file_content, buf, count);
|
||||
pfc_config_file_content[count] = '\0';
|
||||
size_t pfc_config_length;
|
||||
read_file_into_buffer(buf, &pfc_config_file_content, &pfc_config_length, &pfc_config_memory_size);
|
||||
parse_counter_configs();
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute config_attribute =__ATTR(config, 0660, config_show, config_store);
|
||||
|
||||
static ssize_t msr_config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
ssize_t count = 0;
|
||||
for (int i=0; i<n_msr_configs; i++) {
|
||||
struct msr_config config = msr_configs[i];
|
||||
for (int j=0; j<config.n_wrmsr; j++) {
|
||||
count += sprintf(&(buf[count]), "msr_%lX=0x%lX", config.wrmsr[j], config.wrmsr_val[j]);
|
||||
if (j<config.n_wrmsr-1) count += sprintf(&(buf[count]), ".");
|
||||
}
|
||||
count += sprintf(&(buf[count]), " msr_%lX %s\n", config.rdmsr, config.description);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
static ssize_t msr_config_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
size_t msr_config_length;
|
||||
read_file_into_buffer(buf, &msr_config_file_content, &msr_config_length, &msr_config_memory_size);
|
||||
parse_msr_configs();
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute msr_config_attribute =__ATTR(msr_config, 0660, msr_config_show, msr_config_store);
|
||||
|
||||
// sysfs read handler for the "unroll_count" attribute: prints the current value of
// unroll_count followed by a newline, and returns the number of characters written.
static ssize_t unroll_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%ld\n", unroll_count);
|
||||
}
|
||||
static ssize_t unroll_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
long old_unroll_count = unroll_count;
|
||||
sscanf(buf, "%ld", &unroll_count);
|
||||
|
||||
if (old_unroll_count != unroll_count) {
|
||||
vfree(runtime_code);
|
||||
runtime_code = __vmalloc(PAGE_SIZE + (unroll_count)*PAGE_SIZE*2 + 10000, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_code) {
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
}
|
||||
}
|
||||
extend_runtime_code();
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute unroll_count_attribute =__ATTR(unroll_count, 0660, unroll_count_show, unroll_count_store);
|
||||
@@ -163,41 +254,66 @@ static ssize_t agg_store(struct kobject *kobj, struct kobj_attribute *attr, cons
|
||||
}
|
||||
static struct kobj_attribute agg_attribute =__ATTR(agg, 0660, agg_show, agg_store);
|
||||
|
||||
static ssize_t use_huge_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return 0;
|
||||
// Comparison function for sorting an array of pointers (a and b point to array elements).
// Pointers are ordered by ascending address, except that NULL pointers are placed after all
// non-NULL pointers. Returns -1, 0, or 1.
int cmpPtr(const void *a, const void *b) {
    void* lhs = *(void* const*)a;
    void* rhs = *(void* const*)b;

    if (lhs == rhs) return 0;
    if (lhs == NULL) return 1;
    if (rhs == NULL) return -1;
    return (lhs < rhs) ? -1 : 1;
}
|
||||
static ssize_t use_huge_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
if (huge_pages) {
|
||||
vm_unmap_ram(huge_pages, npages);
|
||||
}
|
||||
for (int i=0; i<npages; i++) {
|
||||
put_page(pages[i]);
|
||||
}
|
||||
vfree(pages);
|
||||
|
||||
long len = *(long*)(buf+sizeof(void*));
|
||||
npages = (len+PAGE_SIZE-1)/PAGE_SIZE;
|
||||
if (npages == 0) {
|
||||
pr_debug("Huge pages disabled.");
|
||||
pages = NULL;
|
||||
huge_pages = NULL;
|
||||
return count;
|
||||
// 4 MB is the maximum that kmalloc supports on my machines
|
||||
#define R14_SEGMENT_SIZE (4*1024*1024)
|
||||
|
||||
static ssize_t r14_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
if (n_r14_segments == 0 || !r14_segments[0]) return sprintf(buf, "0\n");
|
||||
|
||||
void* prev_virt_addr = r14_segments[0];
|
||||
phys_addr_t prev_phys_addr = virt_to_phys(prev_virt_addr);
|
||||
|
||||
size_t i;
|
||||
for (i=1; i<n_r14_segments; i++) {
|
||||
void* cur_virt_addr = r14_segments[i];
|
||||
phys_addr_t cur_phys_addr = virt_to_phys(cur_virt_addr);
|
||||
|
||||
if ((cur_virt_addr - prev_virt_addr != R14_SEGMENT_SIZE) || (cur_phys_addr - prev_phys_addr != R14_SEGMENT_SIZE)) {
|
||||
pr_debug("No physically contiguous memory area of the requested size found.\n");
|
||||
pr_debug("Try rebooting your computer.\n");
|
||||
break;
|
||||
}
|
||||
|
||||
prev_virt_addr = cur_virt_addr;
|
||||
prev_phys_addr = cur_phys_addr;
|
||||
}
|
||||
|
||||
pages = vmalloc(npages * sizeof(struct page*));
|
||||
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
int res = get_user_pages(*(unsigned long*)(buf), npages, FOLL_WRITE, pages, NULL);
|
||||
if (res) {
|
||||
int nid = page_to_nid(pages[0]);
|
||||
huge_pages = vm_map_ram(pages, npages, nid, PAGE_KERNEL);
|
||||
phys_addr_t phys_addr = virt_to_phys(r14_segments[0]);
|
||||
return sprintf(buf, "R14 size: %zu MB\nVirtual address: 0x%px\nPhysical address: %pa\n", i*R14_SEGMENT_SIZE/(1024*1024), r14_segments[0], &phys_addr);
|
||||
}
|
||||
static ssize_t r14_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
if (n_r14_segments > 0) {
|
||||
for (int i=0; i<n_r14_segments; i++) {
|
||||
kfree(r14_segments[i]);
|
||||
}
|
||||
} else {
|
||||
vfree(runtime_r14 - RUNTIME_R_SIZE/2);
|
||||
}
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
|
||||
pr_debug("Huge pages enabled. Start address: %px", huge_pages);
|
||||
size_t size_MB = 0;
|
||||
sscanf(buf, "%zu", &size_MB);
|
||||
n_r14_segments = (size_MB*1024*1024 + (R14_SEGMENT_SIZE-1)) / R14_SEGMENT_SIZE;
|
||||
vfree(r14_segments);
|
||||
r14_segments = vmalloc(n_r14_segments * sizeof(void*));
|
||||
|
||||
for (size_t i=0; i<n_r14_segments; i++) {
|
||||
r14_segments[i] = kmalloc(R14_SEGMENT_SIZE, GFP_KERNEL);
|
||||
}
|
||||
|
||||
sort(r14_segments, n_r14_segments, sizeof(void*), cmpPtr, NULL);
|
||||
runtime_r14 = r14_segments[0];
|
||||
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute use_huge_pages_attribute =__ATTR(use_huge_pages, 0660, use_huge_pages_show, use_huge_pages_store);
|
||||
static struct kobj_attribute r14_size_attribute =__ATTR(r14_size, 0660, r14_size_show, r14_size_store);
|
||||
|
||||
static ssize_t verbose_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%u\n", verbose);
|
||||
@@ -233,6 +349,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
|
||||
code_init_length = 0;
|
||||
code_length = 0;
|
||||
n_pfc_configs = 0;
|
||||
n_msr_configs = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -241,7 +358,7 @@ static ssize_t reset_store(struct kobject *kobj, struct kobj_attribute *attr, co
|
||||
}
|
||||
static struct kobj_attribute reset_attribute =__ATTR(reset, 0660, reset_show, reset_store);
|
||||
|
||||
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
static int show(struct seq_file *m, void *v) {
|
||||
kernel_fpu_begin();
|
||||
|
||||
long base_unroll_count = (basic_mode?0:unroll_count);
|
||||
@@ -249,6 +366,7 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
long base_loop_count = (basic_mode?0:loop_count);
|
||||
long main_loop_count = loop_count;
|
||||
|
||||
char buf[100];
|
||||
char* measurement_template;
|
||||
|
||||
/*********************************
|
||||
@@ -269,7 +387,7 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
}
|
||||
|
||||
configure_perf_ctrs_FF(0, 1);
|
||||
|
||||
create_and_run_one_time_init_code();
|
||||
run_warmup_experiment(measurement_template);
|
||||
|
||||
if (is_AMD_CPU) {
|
||||
@@ -283,9 +401,9 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
print_all_measurement_results(measurement_results, 3);
|
||||
}
|
||||
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "RDTSC", 0);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "MPERF", 1);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "APERF", 2);
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "MPERF", 1));
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "APERF", 2));
|
||||
} else {
|
||||
run_experiment(measurement_template, measurement_results_base, 4, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 4, main_unroll_count, main_loop_count);
|
||||
@@ -297,10 +415,10 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
print_all_measurement_results(measurement_results, 4);
|
||||
}
|
||||
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "RDTSC", 0);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Instructions retired", 1);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Core cycles", 2);
|
||||
compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), "Reference cycles", 3);
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "RDTSC", 0));
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Instructions retired", 1));
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Core cycles", 2));
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), "Reference cycles", 3));
|
||||
}
|
||||
|
||||
/*********************************
|
||||
@@ -321,7 +439,8 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
}
|
||||
|
||||
for (size_t i=0; i<n_pfc_configs; i+=n_programmable_counters) {
|
||||
configure_perf_ctrs_programmable(i, min(i+n_programmable_counters, n_pfc_configs), 0, 1);
|
||||
configure_perf_ctrs_programmable(i, min(i+n_programmable_counters, n_pfc_configs), 1, 1);
|
||||
// on some microarchitectures (e.g., Broadwell), some events (e.g., L1 misses) are not counted properly if only the OS field is set
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, n_programmable_counters, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, n_programmable_counters, main_unroll_count, main_loop_count);
|
||||
@@ -334,18 +453,51 @@ static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char
|
||||
}
|
||||
|
||||
for (int c=0; c < n_programmable_counters && i + c < n_pfc_configs; c++) {
|
||||
if (!pfc_configs[i+c].invalid) compute_result_str(buf+strlen(buf), PAGE_SIZE-strlen(buf), pfc_configs[i+c].description, c);
|
||||
if (!pfc_configs[i+c].invalid) seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), pfc_configs[i+c].description, c));
|
||||
}
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
/*********************************
|
||||
* MSRs.
|
||||
********************************/
|
||||
|
||||
return strlen(buf);
|
||||
}
|
||||
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
if (no_mem) {
|
||||
measurement_template = (char*)&measurement_RDMSR_template_noMem;
|
||||
} else {
|
||||
measurement_template = (char*)&measurement_RDMSR_template;
|
||||
}
|
||||
|
||||
for (size_t i=0; i<n_msr_configs; i++) {
|
||||
configure_MSRs(msr_configs[i]);
|
||||
|
||||
run_experiment(measurement_template, measurement_results_base, 1, base_unroll_count, base_loop_count);
|
||||
run_experiment(measurement_template, measurement_results, 1, main_unroll_count, main_loop_count);
|
||||
|
||||
if (verbose) {
|
||||
pr_debug("\nMSR results (unroll_count=%ld, loop_count=%ld):\n\n", base_unroll_count, base_loop_count);
|
||||
print_all_measurement_results(measurement_results_base, 1);
|
||||
pr_debug("MSR results (unroll_count=%ld, loop_count=%ld):\n\n", main_unroll_count, main_loop_count);
|
||||
print_all_measurement_results(measurement_results, 1);
|
||||
}
|
||||
|
||||
seq_printf(m, "%s", compute_result_str(buf, sizeof(buf), msr_configs[i].description, 0));
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
return 0;
|
||||
}
|
||||
static struct kobj_attribute run_attribute =__ATTR(run, 0660, run_show, run_store);
|
||||
|
||||
static int open(struct inode *inode, struct file *file) {
|
||||
return single_open(file, show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_file_fops = {
|
||||
.llseek = seq_lseek,
|
||||
.open = open,
|
||||
.owner = THIS_MODULE,
|
||||
.read = seq_read,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
static struct kobject* nb_kobject;
|
||||
|
||||
@@ -356,24 +508,6 @@ static int __init nb_init (void) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
code = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if (!code) {
|
||||
printk(KERN_ERR "Could not allocate memory for code\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
code_init = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if (!code_init) {
|
||||
printk(KERN_ERR "Could not allocate memory for code_init\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
pfc_config_file_content = kmalloc(PAGE_SIZE+1, GFP_KERNEL);
|
||||
if (!pfc_config_file_content) {
|
||||
printk(KERN_ERR "Could not allocate memory for pfc_config_file_content\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
measurement_results_base[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
@@ -395,12 +529,11 @@ static int __init nb_init (void) {
|
||||
printk(KERN_ERR "Could not allocate memory for runtime_r*\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
runtime_code = __vmalloc(PAGE_SIZE + (unroll_count)*PAGE_SIZE*2 + 10000, GFP_KERNEL, PAGE_KERNEL_EXEC);
|
||||
if (!runtime_code) {
|
||||
pr_debug("failed to allocate executable memory\n");
|
||||
return -1;
|
||||
}
|
||||
runtime_r14 += RUNTIME_R_SIZE/2;
|
||||
runtime_rbp += RUNTIME_R_SIZE/2;
|
||||
runtime_rdi += RUNTIME_R_SIZE/2;
|
||||
runtime_rsi += RUNTIME_R_SIZE/2;
|
||||
runtime_rsp += RUNTIME_R_SIZE/2;
|
||||
|
||||
nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
|
||||
if (!nb_kobject) {
|
||||
@@ -408,12 +541,13 @@ static int __init nb_init (void) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int error = sysfs_create_file(nb_kobject, &run_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &clear_attribute.attr);
|
||||
int error = sysfs_create_file(nb_kobject, &clear_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &reset_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_init_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_init_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &code_one_time_init_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &config_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &msr_config_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &loop_count_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &unroll_count_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
|
||||
@@ -422,7 +556,7 @@ static int __init nb_init (void) {
|
||||
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &use_huge_pages_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &r14_size_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &verbose_attribute.attr);
|
||||
|
||||
if (error) {
|
||||
@@ -430,27 +564,35 @@ static int __init nb_init (void) {
|
||||
return error;
|
||||
}
|
||||
|
||||
struct proc_dir_entry* proc_file_entry = proc_create("nanoBench", 0, NULL, &proc_file_fops);
|
||||
if(proc_file_entry == NULL) {
|
||||
pr_debug("failed to create file in /proc/\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __exit nb_exit (void) {
|
||||
kfree(code);
|
||||
kfree(code_init);
|
||||
kfree(code_one_time_init);
|
||||
kfree(pfc_config_file_content);
|
||||
kfree(msr_config_file_content);
|
||||
vfree(runtime_code);
|
||||
vfree(runtime_r14);
|
||||
vfree(runtime_rbp);
|
||||
vfree(runtime_rdi);
|
||||
vfree(runtime_rsi);
|
||||
vfree(runtime_rsp);
|
||||
vfree(runtime_one_time_init_code);
|
||||
vfree(runtime_rbp - RUNTIME_R_SIZE/2);
|
||||
vfree(runtime_rdi - RUNTIME_R_SIZE/2);
|
||||
vfree(runtime_rsi - RUNTIME_R_SIZE/2);
|
||||
vfree(runtime_rsp - RUNTIME_R_SIZE/2);
|
||||
|
||||
if (huge_pages) {
|
||||
vm_unmap_ram(huge_pages, npages);
|
||||
if (n_r14_segments > 0) {
|
||||
for (int i=0; i<n_r14_segments; i++) {
|
||||
kfree(r14_segments[i]);
|
||||
}
|
||||
} else {
|
||||
vfree(runtime_r14 - RUNTIME_R_SIZE/2);
|
||||
}
|
||||
for (int i=0; i<npages; i++) {
|
||||
put_page(pages[i]);
|
||||
}
|
||||
vfree(pages);
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
kfree(measurement_results[i]);
|
||||
@@ -458,6 +600,7 @@ static void __exit nb_exit (void) {
|
||||
}
|
||||
|
||||
kobject_put(nb_kobject);
|
||||
remove_proc_entry("nanoBench", NULL);
|
||||
}
|
||||
|
||||
module_init(nb_init);
|
||||
|
||||
23
nanoBench.sh
23
nanoBench.sh
@@ -21,20 +21,27 @@ done
|
||||
|
||||
args=''
|
||||
while [ "$2" ]; do
|
||||
if [ "$1" == '-asm' ]; then
|
||||
echo ".intel_syntax noprefix" > asm-code.s
|
||||
echo "$2" >> asm-code.s
|
||||
as asm-code.s -o asm-code.o || exit
|
||||
objcopy asm-code.o -O binary asm-code.bin
|
||||
args="$args -code asm-code.bin"
|
||||
shift 2
|
||||
elif [ "$1" == '-asm_init' ]; then
|
||||
if [[ "$1" == -asm_i* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-init.s
|
||||
echo "$2" >> asm-init.s
|
||||
as asm-init.s -o asm-init.o || exit
|
||||
objcopy asm-init.o -O binary asm-init.bin
|
||||
args="$args -code_init asm-init.bin"
|
||||
shift 2
|
||||
elif [[ "$1" == -asm_o* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-one-time-init.s
|
||||
echo "$2" >> asm-one-time-init.s
|
||||
as asm-one-time-init.s -o asm-one-time-init.o || exit
|
||||
objcopy asm-one-time-init.o -O binary asm-one-time-init.bin
|
||||
args="$args -code_one_time_init asm-one-time-init.bin"
|
||||
shift 2
|
||||
elif [[ "$1" == -as* ]]; then
|
||||
echo ".intel_syntax noprefix" > asm-code.s
|
||||
echo "$2" >> asm-code.s
|
||||
as asm-code.s -o asm-code.o || exit
|
||||
objcopy asm-code.o -O binary asm-code.bin
|
||||
args="$args -code asm-code.bin"
|
||||
shift 2
|
||||
else
|
||||
args="$args $1"
|
||||
shift
|
||||
|
||||
27
set-R14-size.sh
Executable file
27
set-R14-size.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash

# Writes a new value to /sys/nb/r14_size (the r14_size attribute exposed by the
# nanoBench kernel module) and prints the value the module reports afterwards.
#
# Usage: sudo ./set-R14-size.sh <size>
#   <size> is a decimal number followed by M or G, e.g. 128M or 1G.
#   G values are multiplied by 1024 before being written, so the module
#   presumably expects the size in MB (confirm against the kernel module).

if [ "$EUID" -ne 0 ]; then
    echo "This script must be run as root" 1>&2
    exit 1
fi

if [ $# -eq 0 ]; then
    echo "Usage: sudo ./set-R14-size.sh <size>"
    echo "Example: sudo ./set-R14-size.sh 128M"
    exit 1
fi

# Split the argument into its digits and its non-digit suffix.
size=$(printf '%s\n' "$1" | sed 's/[^0-9]//g')
suffix=$(printf '%s\n' "$1" | sed 's/[0-9]//g')

# Without this check an argument such as "M" would write an empty string.
if [ -z "$size" ]; then
    echo "invalid size"
    exit 1
fi

# Force base 10 so that a leading zero (e.g. 010M) is not read as octal.
size=$((10#$size))

if [[ "$suffix" == "M" ]]; then
    :
elif [[ "$suffix" == "G" ]]; then
    size=$((size*1024))
else
    echo "invalid suffix"
    exit 1
fi

echo "$size" > /sys/nb/r14_size
cat /sys/nb/r14_size
|
||||
@@ -24,24 +24,25 @@ void print_usage() {
|
||||
printf("\n");
|
||||
printf("nanoBench usage:\n");
|
||||
printf("\n");
|
||||
printf(" -code <filename>: Binary file containing the code to be benchmarked.\n");
|
||||
printf(" -code_init <filename>: Binary file containing code to be executed once in the beginning\n");
|
||||
printf(" -config <filename>: File with performance counter event specifications.\n");
|
||||
printf(" -n_measurements <n>: Number of times the measurements are repeated.\n");
|
||||
printf(" -unroll_count <n>: Number of copies of the benchmark code inside the inner loop.\n");
|
||||
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
|
||||
printf(" -warm_up_count <n>: Number of runs before the first measurement gets recorded.\n");
|
||||
printf(" -initial_warm_up_count <n>: Number of runs before any measurement is performed.\n");
|
||||
printf(" -avg: Selects the arithmetic mean as the aggregate function.\n");
|
||||
printf(" -median: Selects the median as the aggregate function.\n");
|
||||
printf(" -min: Selects the minimum as the aggregate function.\n");
|
||||
printf(" -basic_mode: Enables basic mode.\n");
|
||||
printf(" -no_mem: The code for reading the perf. ctrs. does not make memory accesses.\n");
|
||||
printf(" -verbose: Outputs the results of all performance counter readings.\n");
|
||||
printf(" -cpu <n>: Pins the measurement thread to CPU n. \n");
|
||||
printf(" -usr <n>: If 1, counts events at a privilege level greater than 0.\n");
|
||||
printf(" -os <n>: If 1, counts events at a privilege level 0.\n");
|
||||
printf(" -debug: Generate a breakpoint trap after running the code to be benchmarked.\n");
|
||||
printf(" -code <filename>: Binary file containing the code to be benchmarked.\n");
|
||||
printf(" -code_init <filename>: Binary file containing code to be executed once before each measurement\n");
|
||||
printf(" -code_one_time_init <filename>: Binary file containing code to be executed once before the first measurement\n");
|
||||
printf(" -config <filename>: File with performance counter event specifications.\n");
|
||||
printf(" -n_measurements <n>: Number of times the measurements are repeated.\n");
|
||||
printf(" -unroll_count <n>: Number of copies of the benchmark code inside the inner loop.\n");
|
||||
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
|
||||
printf(" -warm_up_count <n>: Number of runs before the first measurement gets recorded.\n");
|
||||
printf(" -initial_warm_up_count <n>: Number of runs before any measurement is performed.\n");
|
||||
printf(" -avg: Selects the arithmetic mean as the aggregate function.\n");
|
||||
printf(" -median: Selects the median as the aggregate function.\n");
|
||||
printf(" -min: Selects the minimum as the aggregate function.\n");
|
||||
printf(" -basic_mode: Enables basic mode.\n");
|
||||
printf(" -no_mem: The code for reading the perf. ctrs. does not make memory accesses.\n");
|
||||
printf(" -verbose: Outputs the results of all performance counter readings.\n");
|
||||
printf(" -cpu <n>: Pins the measurement thread to CPU n. \n");
|
||||
printf(" -usr <n>: If 1, counts events at a privilege level greater than 0.\n");
|
||||
printf(" -os <n>: If 1, counts events at a privilege level 0.\n");
|
||||
printf(" -debug: Generate a breakpoint trap after running the code to be benchmarked.\n");
|
||||
}
|
||||
|
||||
size_t mmap_file(char* filename, char** content) {
|
||||
@@ -67,6 +68,7 @@ int main(int argc, char **argv) {
|
||||
struct option long_opts[] = {
|
||||
{"code", required_argument, 0, 'c'},
|
||||
{"code_init", required_argument, 0, 'i'},
|
||||
{"code_one_time_init", required_argument, 0, 'o'},
|
||||
{"config", required_argument, 0, 'f'},
|
||||
{"n_measurements", required_argument, 0, 'n'},
|
||||
{"unroll_count", required_argument, 0, 'u'},
|
||||
@@ -98,6 +100,9 @@ int main(int argc, char **argv) {
|
||||
case 'i':
|
||||
code_init_length = mmap_file(optarg, &code_init);
|
||||
break;
|
||||
case 'o':
|
||||
code_one_time_init_length = mmap_file(optarg, &code_one_time_init);
|
||||
break;
|
||||
case 'f': ;
|
||||
config_file_name = optarg;
|
||||
break;
|
||||
@@ -180,15 +185,31 @@ int main(int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
posix_memalign((void**)&runtime_r14, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rbp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rdi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rsi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rsp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
size_t runtime_one_time_init_code_length = code_one_time_init_length + 10000;
|
||||
posix_memalign((void**)&runtime_one_time_init_code, sysconf(_SC_PAGESIZE), runtime_one_time_init_code_length);
|
||||
if (!runtime_one_time_init_code) {
|
||||
fprintf(stderr, "Error: Failed to allocate memory for runtime_one_time_init_code\n");
|
||||
return 1;
|
||||
}
|
||||
if (mprotect(runtime_one_time_init_code, runtime_one_time_init_code_length, (PROT_READ | PROT_WRITE |PROT_EXEC))) {
|
||||
fprintf(stderr, "Error: mprotect failed\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
runtime_r14 = malloc(RUNTIME_R_SIZE);
|
||||
runtime_rbp = malloc(RUNTIME_R_SIZE);
|
||||
runtime_rdi = malloc(RUNTIME_R_SIZE);
|
||||
runtime_rsi = malloc(RUNTIME_R_SIZE);
|
||||
runtime_rsp = malloc(RUNTIME_R_SIZE);
|
||||
if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) {
|
||||
fprintf(stderr, "Error: Could not allocate memory for runtime_r*\n");
|
||||
return 1;
|
||||
}
|
||||
runtime_r14 += RUNTIME_R_SIZE/2;
|
||||
runtime_rbp += RUNTIME_R_SIZE/2;
|
||||
runtime_rdi += RUNTIME_R_SIZE/2;
|
||||
runtime_rsi += RUNTIME_R_SIZE/2;
|
||||
runtime_rsp += RUNTIME_R_SIZE/2;
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = malloc(n_measurements*sizeof(int64_t));
|
||||
@@ -224,6 +245,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
create_and_run_one_time_init_code();
|
||||
run_warmup_experiment(measurement_template);
|
||||
|
||||
if (is_AMD_CPU) {
|
||||
@@ -301,11 +323,12 @@ int main(int argc, char **argv) {
|
||||
* Cleanup
|
||||
************************************/
|
||||
free(runtime_code);
|
||||
free(runtime_r14);
|
||||
free(runtime_rbp);
|
||||
free(runtime_rdi);
|
||||
free(runtime_rsi);
|
||||
free(runtime_rsp);
|
||||
free(runtime_one_time_init_code);
|
||||
free(runtime_r14 - RUNTIME_R_SIZE/2);
|
||||
free(runtime_rbp - RUNTIME_R_SIZE/2);
|
||||
free(runtime_rdi - RUNTIME_R_SIZE/2);
|
||||
free(runtime_rsi - RUNTIME_R_SIZE/2);
|
||||
free(runtime_rsp - RUNTIME_R_SIZE/2);
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
free(measurement_results[i]);
|
||||
|
||||
Reference in New Issue
Block a user