use different memory areas for R14, RBP, etc.

This commit is contained in:
Andreas Abel
2019-03-25 15:08:39 +01:00
parent eb66a00738
commit 1f9c1f2321
5 changed files with 99 additions and 75 deletions

View File

@@ -62,9 +62,9 @@ The tool will *unroll* the assembler code multiple times, i.e., it will create m
The config file contains the required information for configuring the programmable performance counters with the desired events. We provide example configuration files for recent Intel and AMD microarchitectures in the `config` folder. When using the kernel module, the config file must not be larger than 4 kB.
The assembler code sequence may use and modify any general-purpose or vector registers, including the stack pointer. There is no need to restore the registers to their original values at the end (unless the `-loop` or `-no_mem` options are used).
Unless the `-loop` or `-no_mem` options are used, the assembler code sequence may use and modify any general-purpose or vector registers, including the stack pointer. There is no need to restore the registers to their original values at the end.
R14, RDI, RSI, RSP, and RBP are initialized with addresses in a dedicated memory area (of about 2 MB), that can be freely modified by the assembler code. The addresses in R14, RDI, RSI, RSP, and RBP are at least 4 kB apart from each other.
R14, RDI, RSI, RSP, and RBP are initialized with addresses in the middle of dedicated memory areas (of about 1 MB each) that can be freely modified by the assembler code.
All other registers have initially undefined values. They can, however, be initialized as shown in the following example.

View File

@@ -40,7 +40,11 @@ int is_AMD_CPU = 0;
int n_programmable_counters;
char* runtime_code;
void* runtime_mem;
void* runtime_r14;
void* runtime_rbp;
void* runtime_rdi;
void* runtime_rsi;
void* runtime_rsp;
int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
void* RSP_mem;
@@ -378,8 +382,24 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
*(void**)(&runtime_code[rci]) = &RSP_mem;
templateI += 8;
rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_MEM)) {
*(void**)(&runtime_code[rci]) = runtime_mem;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
*(void**)(&runtime_code[rci]) = runtime_r14 + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
*(void**)(&runtime_code[rci]) = runtime_rbp + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
*(void**)(&runtime_code[rci]) = runtime_rdi + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
*(void**)(&runtime_code[rci]) = runtime_rsi + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
*(void**)(&runtime_code[rci]) = runtime_rsp + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
} else {
@@ -509,7 +529,6 @@ int starts_with_magic_bytes(char* c, int64_t magic_bytes) {
void measurement_template_Intel() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -580,7 +599,6 @@ void measurement_template_Intel() {
void measurement_template_Intel_noMem() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -638,7 +656,6 @@ void measurement_template_Intel_noMem() {
void measurement_template_AMD() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -727,7 +744,6 @@ void measurement_template_AMD() {
void measurement_template_AMD_noMem() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -805,7 +821,6 @@ void measurement_template_AMD_noMem() {
void measurement_FF_template_Intel() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -873,7 +888,6 @@ void measurement_FF_template_Intel() {
void measurement_FF_template_Intel_noMem() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -929,7 +943,6 @@ void measurement_FF_template_Intel_noMem() {
void measurement_FF_template_AMD() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -989,7 +1002,6 @@ void measurement_FF_template_AMD() {
void measurement_FF_template_AMD_noMem() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -1035,7 +1047,6 @@ void measurement_FF_template_AMD_noMem() {
void measurement_RDTSC_template() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"
@@ -1074,7 +1085,6 @@ void measurement_RDTSC_template() {
void measurement_RDTSC_template_noMem() {
SAVE_REGS_FLAGS();
INITIALIZE_REGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
asm volatile(
".intel_syntax noprefix \n"

View File

@@ -149,8 +149,14 @@ extern int n_programmable_counters;
// Pointer to a memory region that is writable and executable.
extern char* runtime_code;
// During measurement, RSP, RDI, RSI, and R14 will point to locations in runtime_mem.
extern void* runtime_mem;
#define RUNTIME_R_SIZE (1024*1024)
// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2.
extern void* runtime_r14;
extern void* runtime_rbp;
extern void* runtime_rdi;
extern void* runtime_rsi;
extern void* runtime_rsp;
// Stores performance counter values during measurements.
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
@@ -198,9 +204,13 @@ void print_all_measurement_results(int64_t* results[], int n_counters);
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_MEM 0x40b513b1C2813F04
#define MAGIC_BYTES_PFC 0x50b513b1C2813F04
#define MAGIC_BYTES_TEMPLATE_END 0x60b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_R14 0x40b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_RBP 0x50b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_RDI 0x60b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
#define MAGIC_BYTES_TEMPLATE_END 0xA0b513b1C2813F04
#define STRINGIFY2(X) #X
#define STRINGIFY(X) STRINGIFY2(X)
@@ -232,11 +242,22 @@ void measurement_RDTSC_template_noMem(void);
"pushfq\n" \
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
"mov [r15], rsp\n" \
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_MEM)"\n" \
"add rsp, 0xfffff\n" \
"mov r15, 0xfff\n" /*4 kB alignment*/ \
"not r15\n" \
"and rsp, r15\n" \
"mov rax, 0\n" \
"mov rbx, 0\n" \
"mov rcx, 0\n" \
"mov rdx, 0\n" \
"mov r8, 0\n" \
"mov r9, 0\n" \
"mov r10, 0\n" \
"mov r11, 0\n" \
"mov r12, 0\n" \
"mov r13, 0\n" \
"mov r15, 0\n" \
"mov r14, "STRINGIFY(MAGIC_BYTES_RUNTIME_R14)"\n" \
"mov rbp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RBP)"\n" \
"mov rdi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RDI)"\n" \
"mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \
".att_syntax noprefix");
#define RESTORE_REGS_FLAGS() \
@@ -253,28 +274,4 @@ void measurement_RDTSC_template_noMem(void);
"pop rbx\n" \
".att_syntax noprefix");
#define INITIALIZE_REGS() \
asm volatile( \
".intel_syntax noprefix\n" \
"mov rax, 0\n" \
"mov rbx, 0\n" \
"mov rcx, 0\n" \
"mov rdx, 0\n" \
"mov r8, 0\n" \
"mov r9, 0\n" \
"mov r10, 0\n" \
"mov r11, 0\n" \
"mov r12, 0\n" \
"mov r13, 0\n" \
"mov r15, 0\n" \
"mov r14, rsp\n" \
"add r14, 0x1000\n" \
"mov rdi, rsp\n" \
"add rdi, 0x2000\n" \
"mov rsi, rsp\n" \
"add rsi, 0x3000\n" \
"mov rbp, rsp\n" \
"add rbp, 0x4000\n" \
".att_syntax noprefix");
#endif

View File

@@ -316,19 +316,19 @@ static int __init nb_init (void) {
}
code = kmalloc(PAGE_SIZE, GFP_KERNEL);
if(!code){
if (!code) {
printk(KERN_ERR "Could not allocate memory for code\n");
return -1;
}
code_init = kmalloc(PAGE_SIZE, GFP_KERNEL);
if(!code_init){
if (!code_init) {
printk(KERN_ERR "Could not allocate memory for code_init\n");
return -1;
}
pfc_config_file_content = kmalloc(PAGE_SIZE+1, GFP_KERNEL);
if(!pfc_config_file_content){
if (!pfc_config_file_content) {
printk(KERN_ERR "Could not allocate memory for pfc_config_file_content\n");
return -1;
}
@@ -336,7 +336,7 @@ static int __init nb_init (void) {
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
measurement_results[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
measurement_results_base[i] = kmalloc(n_measurements*sizeof(int64_t), GFP_KERNEL);
if(!measurement_results[i] || !measurement_results_base[i]){
if (!measurement_results[i] || !measurement_results_base[i]) {
printk(KERN_ERR "Could not allocate memory for measurement_results\n");
return -1;
}
@@ -344,9 +344,14 @@ static int __init nb_init (void) {
memset(measurement_results_base[i], 0, n_measurements*sizeof(int64_t));
}
runtime_mem = kmalloc(2*1024*1024, GFP_KERNEL);
if(!runtime_mem){
printk(KERN_ERR "Could not allocate memory for runtime_mem\n");
// vmalloc addresses are page aligned
runtime_r14 = vmalloc(RUNTIME_R_SIZE);
runtime_rbp = vmalloc(RUNTIME_R_SIZE);
runtime_rdi = vmalloc(RUNTIME_R_SIZE);
runtime_rsi = vmalloc(RUNTIME_R_SIZE);
runtime_rsp = vmalloc(RUNTIME_R_SIZE);
if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) {
printk(KERN_ERR "Could not allocate memory for runtime_r*\n");
return -1;
}
@@ -357,7 +362,7 @@ static int __init nb_init (void) {
}
nb_kobject = kobject_create_and_add("nb", kernel_kobj->parent);
if(!nb_kobject) {
if (!nb_kobject) {
pr_debug("failed to create and add nb\n");
return -1;
}
@@ -387,15 +392,19 @@ static int __init nb_init (void) {
}
static void __exit nb_exit (void) {
if (code) kfree(code);
if (code_init) kfree(code_init);
if (pfc_config_file_content) kfree(pfc_config_file_content);
if (runtime_code) vfree(runtime_code);
if (runtime_mem) kfree(runtime_mem);
kfree(code);
kfree(code_init);
kfree(pfc_config_file_content);
vfree(runtime_code);
vfree(runtime_r14);
vfree(runtime_rbp);
vfree(runtime_rdi);
vfree(runtime_rsi);
vfree(runtime_rsp);
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
if (measurement_results[i]) kfree(measurement_results[i]);
if (measurement_results_base[i]) kfree(measurement_results_base[i]);
kfree(measurement_results[i]);
kfree(measurement_results_base[i]);
}
kobject_put(nb_kobject);

View File

@@ -180,16 +180,20 @@ int main(int argc, char **argv) {
return 1;
}
runtime_mem = malloc(2*1024*1024);
if(!runtime_mem){
fprintf(stderr, "Error: Could not allocate memory for runtime_mem\n");
posix_memalign((void**)&runtime_r14, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
posix_memalign((void**)&runtime_rbp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
posix_memalign((void**)&runtime_rdi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
posix_memalign((void**)&runtime_rsi, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
posix_memalign((void**)&runtime_rsp, sysconf(_SC_PAGESIZE), RUNTIME_R_SIZE);
if (!runtime_r14 || !runtime_rbp || !runtime_rdi || !runtime_rsi || !runtime_rsp) {
fprintf(stderr, "Error: Could not allocate memory for runtime_r*\n");
return 1;
}
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
measurement_results[i] = malloc(n_measurements*sizeof(int64_t));
measurement_results_base[i] = malloc(n_measurements*sizeof(int64_t));
if(!measurement_results[i] || !measurement_results_base[i]){
if (!measurement_results[i] || !measurement_results_base[i]) {
fprintf(stderr, "Error: Could not allocate memory for measurement_results\n");
return 1;
}
@@ -297,7 +301,11 @@ int main(int argc, char **argv) {
* Cleanup
************************************/
free(runtime_code);
free(runtime_mem);
free(runtime_r14);
free(runtime_rbp);
free(runtime_rdi);
free(runtime_rsi);
free(runtime_rsp);
for (int i=0; i<MAX_PROGRAMMABLE_COUNTERS; i++) {
free(measurement_results[i]);