mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2026-01-04 11:30:06 +01:00
use different memory areas for R14, RBP, etc.
This commit is contained in:
@@ -62,9 +62,9 @@ The tool will *unroll* the assembler code multiple times, i.e., it will create m
|
||||
|
||||
The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `config` folder. When using the kernel module, the config file must not be larger than 4 kB.
|
||||
|
||||
The assembler code sequence may use and modify any general-purpose or vector registers, including the stack pointer. There is no need to restore the registers to their original values at the end (unless the `-loop` or `-no_mem` options are used).
|
||||
The assembler code sequence may use and modify any general-purpose or vector registers (unless the `-loop` or `-no_mem` options are used), including the stack pointer. There is no need to restore the registers to their original values at the end.
|
||||
|
||||
R14, RDI, RSI, RSP, and RBP are initialized with addresses in a dedicated memory area (of about 2 MB), that can be freely modified by the assembler code. The addresses in R14, RDI, RSI, RSP, and RBP are at least 4 kB apart from each other.
|
||||
R14, RDI, RSI, RSP, and RBP are initialized with addresses in the middle of dedicated memory areas (of about 1 MB each), which can be freely modified by the assembler code.
|
||||
|
||||
All other registers have initially undefined values. They can, however, be initialized as shown in the following example.
|
||||
|
||||
|
||||
@@ -40,7 +40,11 @@ int is_AMD_CPU = 0;
|
||||
int n_programmable_counters;
|
||||
|
||||
char* runtime_code;
|
||||
void* runtime_mem;
|
||||
void* runtime_r14;
|
||||
void* runtime_rbp;
|
||||
void* runtime_rdi;
|
||||
void* runtime_rsi;
|
||||
void* runtime_rsp;
|
||||
int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||||
void* RSP_mem;
|
||||
|
||||
@@ -378,8 +382,24 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
*(void**)(&runtime_code[rci]) = &RSP_mem;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_MEM)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_mem;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_r14 + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rbp + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rdi + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsi + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
|
||||
*(void**)(&runtime_code[rci]) = runtime_rsp + RUNTIME_R_SIZE/2;
|
||||
templateI += 8;
|
||||
rci += 8;
|
||||
} else {
|
||||
@@ -509,7 +529,6 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) {
|
||||
|
||||
void measurement_template_Intel() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -580,7 +599,6 @@ void measurement_template_Intel() {
|
||||
|
||||
void measurement_template_Intel_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -638,7 +656,6 @@ void measurement_template_Intel_noMem() {
|
||||
|
||||
void measurement_template_AMD() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -727,7 +744,6 @@ void measurement_template_AMD() {
|
||||
|
||||
void measurement_template_AMD_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -805,7 +821,6 @@ void measurement_template_AMD_noMem() {
|
||||
|
||||
void measurement_FF_template_Intel() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -873,7 +888,6 @@ void measurement_FF_template_Intel() {
|
||||
|
||||
void measurement_FF_template_Intel_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -929,7 +943,6 @@ void measurement_FF_template_Intel_noMem() {
|
||||
|
||||
void measurement_FF_template_AMD() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -989,7 +1002,6 @@ void measurement_FF_template_AMD() {
|
||||
|
||||
void measurement_FF_template_AMD_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -1035,7 +1047,6 @@ void measurement_FF_template_AMD_noMem() {
|
||||
|
||||
void measurement_RDTSC_template() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
@@ -1074,7 +1085,6 @@ void measurement_RDTSC_template() {
|
||||
|
||||
void measurement_RDTSC_template_noMem() {
|
||||
SAVE_REGS_FLAGS();
|
||||
INITIALIZE_REGS();
|
||||
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
|
||||
asm volatile(
|
||||
".intel_syntax noprefix \n"
|
||||
|
||||
@@ -149,8 +149,14 @@ extern int n_programmable_counters;
|
||||
// Pointer to a memory region that is writable and executable.
|
||||
extern char* runtime_code;
|
||||
|
||||
// During measurement, RSP, RDI, RSI, and R14 will point to locations in runtime_mem.
|
||||
extern void* runtime_mem;
|
||||
#define RUNTIME_R_SIZE (1024*1024)
|
||||
|
||||
// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2.
|
||||
extern void* runtime_r14;
|
||||
extern void* runtime_rbp;
|
||||
extern void* runtime_rdi;
|
||||
extern void* runtime_rsi;
|
||||
extern void* runtime_rsp;
|
||||
|
||||
// Stores performance counter values during measurements.
|
||||
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||||
@@ -198,9 +204,13 @@ void print_all_measurement_results(int64_t* results[], int n_counters);
|
||||
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
|
||||
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
|
||||
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_MEM 0x40b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x50b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0x60b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_R14 0x40b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RBP 0x50b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RDI 0x60b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
|
||||
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
|
||||
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
|
||||
#define MAGIC_BYTES_TEMPLATE_END 0xA0b513b1C2813F04
|
||||
|
||||
#define STRINGIFY2(X) #X
|
||||
#define STRINGIFY(X) STRINGIFY2(X)
|
||||
@@ -232,11 +242,22 @@ void measurement_RDTSC_template_noMem(void);
|
||||
"pushfq\n" \
|
||||
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
|
||||
"mov [r15], rsp\n" \
|
||||
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_MEM)"\n" \
|
||||
"add rsp, 0xfffff\n" \
|
||||
"mov r15, 0xfff\n" /*4 kB alignment*/ \
|
||||
"not r15\n" \
|
||||
"and rsp, r15\n" \
|
||||
"mov rax, 0\n" \
|
||||
"mov rbx, 0\n" \
|
||||
"mov rcx, 0\n" \
|
||||
"mov rdx, 0\n" \
|
||||
"mov r8, 0\n" \
|
||||
"mov r9, 0\n" \
|
||||
"mov r10, 0\n" \
|
||||
"mov r11, 0\n" \
|
||||
"mov r12, 0\n" \
|
||||
"mov r13, 0\n" \
|
||||
"mov r15, 0\n" \
|
||||
"mov r14, "STRINGIFY(MAGIC_BYTES_RUNTIME_R14)"\n" \
|
||||
"mov rbp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RBP)"\n" \
|
||||
"mov rdi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RDI)"\n" \
|
||||
"mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \
|
||||
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#define RESTORE_REGS_FLAGS() \
|
||||
@@ -253,28 +274,4 @@ void measurement_RDTSC_template_noMem(void);
|
||||
"pop rbx\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#define INITIALIZE_REGS() \
|
||||
asm volatile( \
|
||||
".intel_syntax noprefix\n" \
|
||||
"mov rax, 0\n" \
|
||||
"mov rbx, 0\n" \
|
||||
"mov rcx, 0\n" \
|
||||
"mov rdx, 0\n" \
|
||||
"mov r8, 0\n" \
|
||||
"mov r9, 0\n" \
|
||||
"mov r10, 0\n" \
|
||||
"mov r11, 0\n" \
|
||||
"mov r12, 0\n" \
|
||||
"mov r13, 0\n" \
|
||||
"mov r15, 0\n" \
|
||||
"mov r14, rsp\n" \
|
||||
"add r14, 0x1000\n" \
|
||||
"mov rdi, rsp\n" \
|
||||
"add rdi, 0x2000\n" \
|
||||
"mov rsi, rsp\n" \
|
||||
"add rsi, 0x3000\n" \
|
||||
"mov rbp, rsp\n" \
|
||||
"add rbp, 0x4000\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
#endif
|
||||
@@ -316,19 +316,19 @@ static int __init nb_init (void) {
|
||||
}
|
||||
|
||||
code = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if(!code){
|
||||
if (!code) {
|
||||
printk(KERN_ERR "Could not allocate memory for code\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
code_init = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if(!code_init){
|
||||
if (!code_init) {
|
||||
printk(KERN_ERR "Could not allocate memory for code_init\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
pfc_config_file_content = kmalloc(PAGE_SIZE+1, GFP_KERNEL);
|
||||
if(!pfc_config_file_content){
|
||||
if (!pfc_config_file_content) {
|
||||
printk(KERN_ERR "Could not allocate memory for pfc_config_file_content\n");
|
||||
return -1;
|
||||
}
|
||||
@@ -336,7 +336,7 @@ static int __init nb_init (void) {
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
measurement_results_base[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
|
||||
if(!measurement_results[i] || !measurement_results_base[i]){
|
||||
if (!measurement_results[i] || !measurement_results_base[i]) {
|
||||
printk(KERN_ERR "Could not allocate memory for measurement_results\n");
|
||||
return -1;
|
||||
}
|
||||
@@ -344,9 +344,14 @@ static int __init nb_init (void) {
|
||||
memset(measurement_results_base[i], 0, n_measurements*sizeof(int64_t));
|
||||
}
|
||||
|
||||
runtime_mem = kmalloc(2*1024*1024, GFP_KERNEL);
|
||||
if(!runtime_mem){
|
||||
printk(KERN_ERR "Could not allocate memory for runtime_mem\n");
|
||||
// vmalloc addresses are page aligned
|
||||
runtime_r14 = vmalloc(RUNTIME_R_SIZE);
|
||||
runtime_rbp = vmalloc(RUNTIME_R_SIZE);
|
||||
runtime_rdi = vmalloc(RUNTIME_R_SIZE);
|
||||
runtime_rsi = vmalloc(RUNTIME_R_SIZE);
|
||||
runtime_rsp = vmalloc(RUNTIME_R_SIZE);
|
||||
if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) {
|
||||
printk(KERN_ERR "Could not allocate memory for runtime_r*\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -357,7 +362,7 @@ static int __init nb_init (void) {
|
||||
}
|
||||
|
||||
nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
|
||||
if(!nb_kobject) {
|
||||
if (!nb_kobject) {
|
||||
pr_debug("failed to create and add nb\n");
|
||||
return -1;
|
||||
}
|
||||
@@ -387,15 +392,19 @@ static int __init nb_init (void) {
|
||||
}
|
||||
|
||||
static void __exit nb_exit (void) {
|
||||
if (code) kfree(code);
|
||||
if (code_init) kfree(code_init);
|
||||
if (pfc_config_file_content) kfree(pfc_config_file_content);
|
||||
if (runtime_code) vfree(runtime_code);
|
||||
if (runtime_mem) kfree(runtime_mem);
|
||||
kfree(code);
|
||||
kfree(code_init);
|
||||
kfree(pfc_config_file_content);
|
||||
vfree(runtime_code);
|
||||
vfree(runtime_r14);
|
||||
vfree(runtime_rbp);
|
||||
vfree(runtime_rdi);
|
||||
vfree(runtime_rsi);
|
||||
vfree(runtime_rsp);
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
if (measurement_results[i]) kfree(measurement_results[i]);
|
||||
if (measurement_results_base[i]) kfree(measurement_results_base[i]);
|
||||
kfree(measurement_results[i]);
|
||||
kfree(measurement_results_base[i]);
|
||||
}
|
||||
|
||||
kobject_put(nb_kobject);
|
||||
|
||||
@@ -180,16 +180,20 @@ int main(int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
runtime_mem = malloc(2*1024*1024);
|
||||
if(!runtime_mem){
|
||||
fprintf(stderr, "Error: Could not allocate memory for runtime_mem\n");
|
||||
posix_memalign((void**)&runtime_r14, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rbp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rdi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rsi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
posix_memalign((void**)&runtime_rsp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
|
||||
if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) {
|
||||
fprintf(stderr, "Error: Could not allocate memory for runtime_r*\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
measurement_results[i] = malloc(n_measurements*sizeof(int64_t));
|
||||
measurement_results_base[i] = malloc(n_measurements*sizeof(int64_t));
|
||||
if(!measurement_results[i] || !measurement_results_base[i]){
|
||||
if (!measurement_results[i] || !measurement_results_base[i]) {
|
||||
fprintf(stderr, "Error: Could not allocate memory for measurement_results\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -297,7 +301,11 @@ int main(int argc, char **argv) {
|
||||
* Cleanup
|
||||
************************************/
|
||||
free(runtime_code);
|
||||
free(runtime_mem);
|
||||
free(runtime_r14);
|
||||
free(runtime_rbp);
|
||||
free(runtime_rdi);
|
||||
free(runtime_rsi);
|
||||
free(runtime_rsp);
|
||||
|
||||
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
|
||||
free(measurement_results[i]);
|
||||
|
||||
Reference in New Issue
Block a user