various improvements

This commit is contained in:
Andreas Abel
2019-08-09 18:10:12 +02:00
parent de6f3606c3
commit 3588b20e52
8 changed files with 592 additions and 188 deletions

View File

@@ -29,23 +29,30 @@ size_t code_length = 0;
char* code_init = NULL;
size_t code_init_length = 0;
// Bytes (and their length) that create_and_run_one_time_init_code() copies into
// runtime_one_time_init_code at the MAGIC_BYTES_INIT marker and then executes.
// If the length is 0, no one-time init code is generated or run.
char* code_one_time_init = NULL;
size_t code_one_time_init_length = 0;
struct pfc_config pfc_configs[1000] = {{0}};
size_t n_pfc_configs = 0;
char* pfc_config_file_content = NULL;
// MSR configurations. parse_msr_configs() fills msr_configs from
// msr_config_file_content and sets n_msr_configs to the number of entries.
// The parser tokenizes msr_config_file_content in place and stores pointers
// into it (the description fields), so that buffer has to stay alive for as
// long as the table is used.
struct msr_config msr_configs[1000] = {{0}};
size_t n_msr_configs = 0;
char* msr_config_file_content = NULL;
// MSR address selected by the most recent configure_MSRs() call;
// create_runtime_code() substitutes it for the MAGIC_BYTES_MSR marker.
unsigned long cur_rdmsr = 0;
int is_Intel_CPU = 0;
int is_AMD_CPU = 0;
int n_programmable_counters;
// Buffers that receive the code generated from the templates
// (runtime_one_time_init_code is called directly as a function after it is filled).
char* runtime_code;
char* runtime_one_time_init_code;
// Values that replace the MAGIC_BYTES_RUNTIME_R14/RBP/RDI/RSI/RSP markers in the templates.
void* runtime_r14;
void* runtime_rbp;
void* runtime_rdi;
void* runtime_rsi;
void* runtime_rsp;
void* huge_pages = NULL;
// Stores performance counter values during measurements; its address replaces
// the MAGIC_BYTES_PFC marker in the templates.
int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
// The address of this variable replaces the MAGIC_BYTES_RSP_ADDRESS marker in the templates.
void* RSP_mem;
@@ -199,6 +206,53 @@ void parse_counter_configs() {
}
}
#ifdef __KERNEL__
void parse_msr_configs() {
n_msr_configs = 0;
char* line;
char* next_line = msr_config_file_content;
while ((line = strsep(&next_line, "\n")) != NULL) {
if (strlen(line) == 0 || line[0] == '#') {
continue;
}
char* wrmsr_str = strsep(&line, " \t");
char* rdmsr_str = strsep(&line, " \t");
strreplace(rdmsr_str, 'h', '\0'); strreplace(rdmsr_str, 'H', '\0');
if (line && strlen(line) > 0) {
msr_configs[n_msr_configs].description = line;
} else {
msr_configs[n_msr_configs].description = rdmsr_str;
}
nb_strtoul(rdmsr_str+4, 16, &(msr_configs[n_msr_configs].rdmsr));
size_t n_wrmsr = 0;
char* tok = wrmsr_str;
char* ce;
while ((ce = strsep(&tok, ".")) != NULL) {
if (n_wrmsr >= 10) {
print_error("Error: n_wrmsr >= 10");
break;
}
char* msr_str = strsep(&ce, "=")+4;
pr_debug("msr_str: %s", msr_str);
strreplace(msr_str, 'h', '\0'); strreplace(msr_str, 'H', '\0');
nb_strtoul(msr_str, 16, &(msr_configs[n_msr_configs].wrmsr[n_wrmsr]));
strreplace(ce, 'h', '\0'); strreplace(ce, 'H', '\0');
nb_strtoul(ce, 0, &(msr_configs[n_msr_configs].wrmsr_val[n_wrmsr]));
n_wrmsr++;
}
msr_configs[n_msr_configs].n_wrmsr = n_wrmsr;
n_msr_configs++;
}
}
#endif
#ifndef __KERNEL__
uint64_t read_value_from_cmd(char* cmd) {
FILE* fp;
@@ -340,6 +394,13 @@ void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsi
}
}
// Applies an MSR configuration: performs the config's n_wrmsr register writes
// in order (wrmsr[i] <- wrmsr_val[i]) and then records config.rdmsr in
// cur_rdmsr as the MSR to be read by subsequently generated code.
void configure_MSRs(struct msr_config config) {
    size_t idx = 0;
    while (idx < config.n_wrmsr) {
        write_msr(config.wrmsr[idx], config.wrmsr_val[idx]);
        idx++;
    }

    cur_rdmsr = config.rdmsr;
}
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count) {
int templateI = 0;
int rci = 0;
@@ -377,39 +438,30 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
}
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC)) {
*(void**)(&runtime_code[rci]) = pfc_mem;
templateI += 8;
rci += 8;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_MSR)) {
*(void**)(&runtime_code[rci]) = (void*)cur_rdmsr;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RSP_ADDRESS)) {
*(void**)(&runtime_code[rci]) = &RSP_mem;
templateI += 8;
rci += 8;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_R14)) {
if (huge_pages) {
*(void**)(&runtime_code[rci]) = huge_pages;
} else {
*(void**)(&runtime_code[rci]) = runtime_r14 + RUNTIME_R_SIZE/2;
}
templateI += 8;
rci += 8;
*(void**)(&runtime_code[rci]) = runtime_r14;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RBP)) {
*(void**)(&runtime_code[rci]) = runtime_rbp + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
*(void**)(&runtime_code[rci]) = runtime_rbp;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RDI)) {
*(void**)(&runtime_code[rci]) = runtime_rdi + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
*(void**)(&runtime_code[rci]) = runtime_rdi;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSI)) {
*(void**)(&runtime_code[rci]) = runtime_rsi + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
*(void**)(&runtime_code[rci]) = runtime_rsi;
templateI += 8; rci += 8;
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_RUNTIME_RSP)) {
*(void**)(&runtime_code[rci]) = runtime_rsp + RUNTIME_R_SIZE/2;
templateI += 8;
rci += 8;
*(void**)(&runtime_code[rci]) = runtime_rsp;
templateI += 8; rci += 8;
} else {
runtime_code[rci++] = measurement_template[templateI];
templateI++;
runtime_code[rci++] = measurement_template[templateI++];
}
}
templateI += 8;
@@ -418,6 +470,48 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
} while (measurement_template[templateI-1] != '\xC3'); // 0xC3 = ret
}
// Builds the one-time initialization routine in runtime_one_time_init_code
// from one_time_init_template and calls it. Does nothing if no one-time init
// code was supplied (code_one_time_init_length == 0).
//
// The template is copied byte by byte up to the MAGIC_BYTES_TEMPLATE_END
// marker. On the way, the 8-byte MAGIC_BYTES_INIT marker is replaced by the
// contents of code_one_time_init, and the RSP_ADDRESS / RUNTIME_* markers are
// replaced by the corresponding 8-byte pointer values. After the end marker,
// the remaining template bytes are copied up to and including the first 0xC3
// (ret) byte.
void create_and_run_one_time_init_code() {
    if (code_one_time_init_length == 0) {
        return;
    }

    char* tmpl = (char*)&one_time_init_template;
    int src = 0; // read offset into the template
    int dst = 0; // write offset into runtime_one_time_init_code

    for (;;) {
        char* cur = &tmpl[src];

        if (starts_with_magic_bytes(cur, MAGIC_BYTES_TEMPLATE_END)) {
            break;
        }

        if (starts_with_magic_bytes(cur, MAGIC_BYTES_INIT)) {
            // The marker is replaced by the user-supplied init code.
            src += 8;
            memcpy(&runtime_one_time_init_code[dst], code_one_time_init, code_one_time_init_length);
            dst += code_one_time_init_length;
            continue;
        }

        // Pointer value that replaces an 8-byte address marker.
        void* patch;
        if (starts_with_magic_bytes(cur, MAGIC_BYTES_RSP_ADDRESS)) {
            patch = &RSP_mem;
        } else if (starts_with_magic_bytes(cur, MAGIC_BYTES_RUNTIME_R14)) {
            patch = runtime_r14;
        } else if (starts_with_magic_bytes(cur, MAGIC_BYTES_RUNTIME_RBP)) {
            patch = runtime_rbp;
        } else if (starts_with_magic_bytes(cur, MAGIC_BYTES_RUNTIME_RDI)) {
            patch = runtime_rdi;
        } else if (starts_with_magic_bytes(cur, MAGIC_BYTES_RUNTIME_RSI)) {
            patch = runtime_rsi;
        } else if (starts_with_magic_bytes(cur, MAGIC_BYTES_RUNTIME_RSP)) {
            patch = runtime_rsp;
        } else {
            // Ordinary template byte: copy it unchanged.
            runtime_one_time_init_code[dst++] = tmpl[src++];
            continue;
        }

        *(void**)(&runtime_one_time_init_code[dst]) = patch;
        src += 8;
        dst += 8;
    }

    // Skip the end marker, then copy the rest of the template through the ret.
    src += 8;
    do {
        runtime_one_time_init_code[dst] = tmpl[src];
        dst++;
        src++;
    } while (tmpl[src-1] != '\xC3'); // 0xC3 = ret

    ((void(*)(void))runtime_one_time_init_code)();
}
void run_warmup_experiment(char* measurement_template) {
if (!initial_warm_up_count) return;
@@ -1112,3 +1206,79 @@ void measurement_RDTSC_template_noMem() {
RESTORE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
}
// Measurement template that reads a model-specific register (MSR) with rdmsr
// before and after the code placed at the MAGIC_BYTES_CODE marker and leaves
// the difference (value after - value before) in the 64-bit slot addressed by
// the MAGIC_BYTES_PFC placeholder.
//
// The .quad markers and the 64-bit immediates in the asm are placeholders:
// create_runtime_code() replaces MAGIC_BYTES_PFC with the address of pfc_mem
// and MAGIC_BYTES_MSR with cur_rdmsr (the MSR selected by configure_MSRs()).
// NOTE(review): presumably MAGIC_BYTES_INIT / MAGIC_BYTES_CODE are where the
// init code and the code under test get inserted -- that part of
// create_runtime_code() is not visible here; confirm.
void measurement_RDMSR_template() {
SAVE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
// First reading. RAX, the flags (lahf captures the status flags in AH, seto
// captures the overflow flag in AL), RCX, RDX and R15 are pushed so that the
// rdmsr sequence can use them. The slot at [r15] is zeroed and the 64-bit MSR
// value (EDX:EAX combined into RDX) is subtracted from it. Afterwards the
// registers are popped again; "cmp al, -127" re-creates the overflow flag from
// the saved seto result and sahf restores the remaining flags.
asm volatile(
".intel_syntax noprefix \n"
"push rax \n"
"lahf \n"
"seto al \n"
"push rax \n"
"push rcx \n"
"push rdx \n"
"push r15 \n"
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
"mov qword ptr [r15], 0 \n"
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
"lfence; rdmsr; lfence \n"
"shl rdx, 32; or rdx, rax \n"
"sub [r15], rdx \n"
"lfence \n"
"pop r15; lfence \n"
"pop rdx; lfence \n"
"pop rcx; lfence \n"
"pop rax; lfence \n"
"cmp al, -127; lfence \n"
"sahf; lfence \n"
"pop rax; \n"
"lfence \n"
".att_syntax noprefix ");
asm(".quad "STRINGIFY(MAGIC_BYTES_CODE));
// Second reading: the MSR is read again and added to the slot, so the slot
// ends up holding (second value - first value). No registers are preserved
// here; RESTORE_REGS_FLAGS() follows.
asm volatile(
".intel_syntax noprefix \n"
"lfence \n"
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
"lfence; rdmsr; lfence \n"
"shl rdx, 32; or rdx, rax \n"
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
"add [r15], rdx \n"
".att_syntax noprefix ");
RESTORE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
}
// Variant of measurement_RDMSR_template() that keeps the running difference in
// R8 instead of in memory: the asm before the MAGIC_BYTES_CODE marker touches
// only registers (R8, RCX, RAX, RDX) and performs no pushes or stores. The
// result is written to the slot addressed by the MAGIC_BYTES_PFC placeholder
// only after the second reading.
//
// As in the other templates, create_runtime_code() replaces MAGIC_BYTES_PFC
// with the address of pfc_mem and MAGIC_BYTES_MSR with cur_rdmsr.
// NOTE(review): RAX, RCX, RDX and the flags are not restored before the code
// at the MAGIC_BYTES_CODE marker, and that code must presumably leave R8
// untouched for the result to be meaningful -- confirm with the callers.
void measurement_RDMSR_template_noMem() {
SAVE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
// First reading: r8 = 0 - MSR value (EDX:EAX combined into RDX).
asm volatile(
".intel_syntax noprefix \n"
"mov r8, 0 \n"
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
"lfence; rdmsr; lfence \n"
"shl rdx, 32; or rdx, rax \n"
"sub r8, rdx \n"
"lfence \n"
".att_syntax noprefix ");
asm(".quad "STRINGIFY(MAGIC_BYTES_CODE));
// Second reading: r8 += MSR value, then r8 (second - first) is stored at [r15].
asm volatile(
".intel_syntax noprefix \n"
"lfence \n"
"mov rcx, "STRINGIFY(MAGIC_BYTES_MSR)" \n"
"lfence; rdmsr; lfence \n"
"shl rdx, 32; or rdx, rax \n"
"add r8, rdx \n"
"mov r15, "STRINGIFY(MAGIC_BYTES_PFC)" \n"
"mov [r15], r8 \n"
".att_syntax noprefix ");
RESTORE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
}
// Template for the one-time initialization routine. It is used as raw bytes by
// create_and_run_one_time_init_code(), which copies it into
// runtime_one_time_init_code, replaces the MAGIC_BYTES_INIT marker with the
// contents of code_one_time_init, and copies the bytes after the
// MAGIC_BYTES_TEMPLATE_END marker up to and including the first ret (0xC3).
// The inserted code therefore runs between SAVE_REGS_FLAGS() and
// RESTORE_REGS_FLAGS().
void one_time_init_template() {
SAVE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_INIT));
RESTORE_REGS_FLAGS();
asm(".quad "STRINGIFY(MAGIC_BYTES_TEMPLATE_END));
}

View File

@@ -120,6 +120,9 @@ extern size_t code_length;
extern char* code_init;
extern size_t code_init_length;
extern char* code_one_time_init;
extern size_t code_one_time_init_length;
struct pfc_config {
unsigned long evt_num;
unsigned long umask;
@@ -134,33 +137,41 @@ struct pfc_config {
unsigned int invalid;
char* description;
};
extern struct pfc_config pfc_configs[];
extern size_t n_pfc_configs;
extern char* pfc_config_file_content;
// One MSR measurement configuration (one entry of msr_configs).
struct msr_config {
// Address of the MSR to read; configure_MSRs() copies it to cur_rdmsr.
unsigned long rdmsr;
// MSR writes performed by configure_MSRs() before measuring:
// wrmsr[i] is the MSR address, wrmsr_val[i] the value written to it.
unsigned long wrmsr[10];
unsigned long wrmsr_val[10];
// Number of valid entries in wrmsr/wrmsr_val (at most 10).
size_t n_wrmsr;
// Description text; not owned by this struct (parse_msr_configs() makes it
// point into the configuration file buffer).
char* description;
};
extern struct msr_config msr_configs[];
extern size_t n_msr_configs;
extern char* msr_config_file_content;
extern int is_Intel_CPU;
extern int is_AMD_CPU;
#define MAX_PROGRAMMABLE_COUNTERS 6
extern int n_programmable_counters;
// Pointer to a memory region that is writable and executable.
// Pointers to memory regions that are writable and executable.
extern char* runtime_code;
extern char* runtime_one_time_init_code;
#define RUNTIME_R_SIZE (1024*1024)
// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2.
// If r14_size is set in the kernel module, R14 will not have this offset.
extern void* runtime_r14;
extern void* runtime_rbp;
extern void* runtime_rdi;
extern void* runtime_rsi;
extern void* runtime_rsp;
// If non-null, R14 will contain this address instead of runtime_r14.
extern void* huge_pages;
// Stores performance counter values during measurements.
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
@@ -178,6 +189,7 @@ extern int cpu;
int check_cpuid(void);
void parse_counter_configs(void);
void parse_msr_configs(void);
uint64_t read_value_from_cmd(char* cmd);
@@ -191,10 +203,12 @@ void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
// start and end are indices into the pfc_configs array.
void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsigned int os);
void configure_MSRs(struct msr_config config);
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
void run_warmup_experiment(char* measurement_template);
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
void create_and_run_one_time_init_code(void);
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
int64_t get_aggregate_value_100(int64_t* values, size_t length);
@@ -213,7 +227,8 @@ void print_all_measurement_results(int64_t* results[], int n_counters);
#define MAGIC_BYTES_RUNTIME_RSI 0x70b513b1C2813F04
#define MAGIC_BYTES_RUNTIME_RSP 0x80b513b1C2813F04
#define MAGIC_BYTES_PFC 0x90b513b1C2813F04
#define MAGIC_BYTES_TEMPLATE_END 0xA0b513b1C2813F04
#define MAGIC_BYTES_MSR 0xA0b513b1C2813F04
#define MAGIC_BYTES_TEMPLATE_END 0xB0b513b1C2813F04
#define STRINGIFY2(X) #X
#define STRINGIFY(X) STRINGIFY2(X)
@@ -231,6 +246,9 @@ void measurement_FF_template_AMD(void);
void measurement_FF_template_AMD_noMem(void);
void measurement_RDTSC_template(void);
void measurement_RDTSC_template_noMem(void);
void measurement_RDMSR_template(void);
void measurement_RDMSR_template_noMem(void);
void one_time_init_template(void);
// RBX, RBP, and R12-R15 are callee-saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
#define SAVE_REGS_FLAGS() \