mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-15 19:10:08 +01:00
264 lines
9.7 KiB
C
264 lines
9.7 KiB
C
#ifndef NANOBENCH_H
|
||
#define NANOBENCH_H
|
||
|
||
#ifdef __KERNEL__
|
||
#include <linux/module.h>
|
||
#include <linux/sort.h>
|
||
#else
|
||
#include <inttypes.h>
|
||
#include <stddef.h>
|
||
#include <stdint.h>
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#endif
|
||
|
||
#include <cpuid.h>
|
||
|
||
#ifdef __KERNEL__
|
||
#define print_error(...) pr_debug(__VA_ARGS__)
|
||
#define print_verbose(...) if (verbose) pr_debug(__VA_ARGS__)
|
||
#define print_user_verbose(...) pr_debug(__VA_ARGS__)
|
||
#define nb_strtoul(s, base, res) kstrtoul(s, base, res)
|
||
#define qsort(base, n, size, comp) sort(base, n, size, comp, NULL)
|
||
#else
|
||
#define print_error(...) fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n");
|
||
#define print_verbose(...) if (verbose) printf(__VA_ARGS__);
|
||
#define print_user_verbose(...) if (verbose) printf(__VA_ARGS__);
|
||
#define nb_strtoul(s, base, res) *res = strtoul(s, NULL, base)
|
||
#endif
|
||
|
||
#ifndef MSR_IA32_PMC0
|
||
#define MSR_IA32_PMC0 0x0C1
|
||
#endif
|
||
#ifndef MSR_IA32_PERFEVTSEL0
|
||
#define MSR_IA32_PERFEVTSEL0 0x186
|
||
#endif
|
||
#ifndef MSR_OFFCORE_RSP0
|
||
#define MSR_OFFCORE_RSP0 0x1A6
|
||
#endif
|
||
#ifndef MSR_OFFCORE_RSP1
|
||
#define MSR_OFFCORE_RSP1 0x1A7
|
||
#endif
|
||
#ifndef MSR_IA32_FIXED_CTR0
|
||
#define MSR_IA32_FIXED_CTR0 0x309
|
||
#endif
|
||
#ifndef MSR_IA32_FIXED_CTR_CTRL
|
||
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
|
||
#endif
|
||
#ifndef MSR_IA32_PERF_GLOBAL_CTRL
|
||
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
|
||
#endif
|
||
#ifndef MSR_PEBS_FRONTEND
|
||
#define MSR_PEBS_FRONTEND 0x3F7
|
||
#endif
|
||
#ifndef CORE_X86_MSR_PERF_CTL
|
||
#define CORE_X86_MSR_PERF_CTL 0xC0010200
|
||
#endif
|
||
#ifndef CORE_X86_MSR_PERF_CTR
|
||
#define CORE_X86_MSR_PERF_CTR 0xC0010201
|
||
#endif
|
||
|
||
|
||
// How often the measurement will be repeated.
|
||
extern long n_measurements;
|
||
#define N_MEASUREMENTS_DEFAULT 10;
|
||
|
||
// How often the code to be measured will be unrolled.
|
||
extern long unroll_count;
|
||
#define UNROLL_COUNT_DEFAULT 1000;
|
||
|
||
// Number of iterations of the inner loop. Setting this to 0 will disable the inner loop; the code to be measured is then executed unroll_count many times.
|
||
extern long loop_count;
|
||
#define LOOP_COUNT_DEFAULT 0;
|
||
|
||
// Number of executions of the measurement code before each sequence of measurement runs.
|
||
extern long warm_up_count;
|
||
#define WARM_UP_COUNT_DEFAULT 5;
|
||
|
||
// Number of executions of the measurement code before the first measurement.
|
||
extern long initial_warm_up_count;
|
||
#define INITIAL_WARM_UP_COUNT_DEFAULT 0;
|
||
|
||
// If enabled, the temporary performance counter values are stored in registers instead of in memory;
|
||
// the code to be measured must then not use registers R8-R13
|
||
extern int no_mem;
|
||
#define NO_MEM_DEFAULT 0;
|
||
|
||
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
|
||
// measurements.
|
||
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference
|
||
// between the two measurements.
|
||
extern int basic_mode;
|
||
#define BASIC_MODE_DEFAULT 0;
|
||
|
||
enum agg_enum {AVG_20_80, MIN, MED};
|
||
extern int aggregate_function;
|
||
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80;
|
||
|
||
extern int verbose;
|
||
#define VERBOSE_DEFAULT 0;
|
||
|
||
extern char* code;
|
||
extern size_t code_length;
|
||
|
||
extern char* code_init;
|
||
extern size_t code_init_length;
|
||
|
||
struct pfc_config {
|
||
unsigned long evt_num;
|
||
unsigned long umask;
|
||
unsigned long cmask;
|
||
unsigned int any;
|
||
unsigned int edge;
|
||
unsigned int inv;
|
||
unsigned long msr_3f6h;
|
||
unsigned long msr_pf;
|
||
unsigned long msr_rsp0;
|
||
unsigned long msr_rsp1;
|
||
unsigned int invalid;
|
||
char* description;
|
||
};
|
||
|
||
extern struct pfc_config pfc_configs[];
|
||
extern size_t n_pfc_configs;
|
||
|
||
extern char* pfc_config_file_content;
|
||
|
||
extern int is_Intel_CPU;
|
||
extern int is_AMD_CPU;
|
||
|
||
#define MAX_PROGRAMMABLE_COUNTERS 6
|
||
extern int n_programmable_counters;
|
||
|
||
// Pointer to a memory region that is writable and executable.
|
||
extern char* runtime_code;
|
||
|
||
// During measurement, RSP, RDI, RSI, and R14 will point to locations in runtime_mem.
|
||
extern void* runtime_mem;
|
||
|
||
// Stores performance counter values during measurements.
|
||
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
|
||
|
||
// Stores the RSP during measurements.
|
||
extern void* RSP_mem;
|
||
|
||
extern int64_t* measurement_results[MAX_PROGRAMMABLE_COUNTERS];
|
||
extern int64_t* measurement_results_base[MAX_PROGRAMMABLE_COUNTERS];
|
||
|
||
// Process should be pinned to this CPU.
|
||
extern int cpu;
|
||
|
||
// Checks whether we have an Intel or AMD CPU and determines the number of programmable counters.
|
||
// Returns 0 if successful, 1 otherwise.
|
||
int check_cpuid(void);
|
||
|
||
void parse_counter_configs(void);
|
||
|
||
uint64_t read_value_from_cmd(char* cmd);
|
||
|
||
uint64_t read_msr(unsigned int msr);
|
||
void write_msr(unsigned int msr, uint64_t value);
|
||
|
||
// Enables and clears the fixed-function performance counters.
|
||
void configure_perf_ctrs_FF(unsigned int usr, unsigned int os);
|
||
|
||
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
|
||
// start and end are indices into the pfc_configs array.
|
||
void configure_perf_ctrs_programmable(int start, int end, unsigned int usr, unsigned int os);
|
||
|
||
|
||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
|
||
void run_warmup_experiment(char* measurement_template);
|
||
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
|
||
|
||
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
|
||
int64_t get_aggregate_value_100(int64_t* values, size_t length);
|
||
int cmpInt64(const void *a, const void *b);
|
||
long long ll_abs(long long val);
|
||
|
||
void print_all_measurement_results(int64_t* results[], int n_counters);
|
||
|
||
|
||
#define MAGIC_BYTES_INIT 0x10b513b1C2813F04
|
||
#define MAGIC_BYTES_CODE 0x20b513b1C2813F04
|
||
#define MAGIC_BYTES_RSP_ADDRESS 0x30b513b1C2813F04
|
||
#define MAGIC_BYTES_RUNTIME_MEM 0x40b513b1C2813F04
|
||
#define MAGIC_BYTES_PFC 0x50b513b1C2813F04
|
||
#define MAGIC_BYTES_TEMPLATE_END 0x60b513b1C2813F04
|
||
|
||
#define STRINGIFY2(X) #X
|
||
#define STRINGIFY(X) STRINGIFY2(X)
|
||
|
||
int starts_with_magic_bytes(char* c, int64_t magic_bytes);
|
||
|
||
// The following functions must not use global variables (or anything that uses RIP-relative addressing)
|
||
void measurement_template_Intel(void);
|
||
void measurement_template_Intel_noMem(void);
|
||
void measurement_template_AMD(void);
|
||
void measurement_template_AMD_noMem(void);
|
||
void measurement_FF_template_Intel(void);
|
||
void measurement_FF_template_Intel_noMem(void);
|
||
void measurement_FF_template_AMD(void);
|
||
void measurement_FF_template_AMD_noMem(void);
|
||
void measurement_RDTSC_template(void);
|
||
void measurement_RDTSC_template_noMem(void);
|
||
|
||
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
|
||
#define SAVE_REGS_FLAGS() \
|
||
asm volatile( \
|
||
".intel_syntax noprefix\n" \
|
||
"push rbx\n" \
|
||
"push rbp\n" \
|
||
"push r12\n" \
|
||
"push r13\n" \
|
||
"push r14\n" \
|
||
"push r15\n" \
|
||
"pushfq\n" \
|
||
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
|
||
"mov [r15], rsp\n" \
|
||
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_MEM)"\n" \
|
||
"add rsp, 0xfffff\n" \
|
||
"mov r15, 0xfff\n" /*4 kB alignment*/ \
|
||
"not r15\n" \
|
||
"and rsp, r15\n" \
|
||
".att_syntax noprefix");
|
||
|
||
#define RESTORE_REGS_FLAGS() \
|
||
asm volatile( \
|
||
".intel_syntax noprefix\n" \
|
||
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
|
||
"mov rsp, [r15]\n" \
|
||
"popfq\n" \
|
||
"pop r15\n" \
|
||
"pop r14\n" \
|
||
"pop r13\n" \
|
||
"pop r12\n" \
|
||
"pop rbp\n" \
|
||
"pop rbx\n" \
|
||
".att_syntax noprefix");
|
||
|
||
#define INITIALIZE_REGS() \
|
||
asm volatile( \
|
||
".intel_syntax noprefix\n" \
|
||
"mov rax, 0\n" \
|
||
"mov rbx, 0\n" \
|
||
"mov rcx, 0\n" \
|
||
"mov rdx, 0\n" \
|
||
"mov r8, 0\n" \
|
||
"mov r9, 0\n" \
|
||
"mov r10, 0\n" \
|
||
"mov r11, 0\n" \
|
||
"mov r12, 0\n" \
|
||
"mov r13, 0\n" \
|
||
"mov r14, rsp\n" \
|
||
"add r14, 0x1000\n" \
|
||
"mov rdi, rsp\n" \
|
||
"add rdi, 0x2000\n" \
|
||
"mov rsi, rsp\n" \
|
||
"add rsi, 0x3000\n" \
|
||
"mov rbp, rsp\n" \
|
||
"add rbp, 0x4000\n" \
|
||
".att_syntax noprefix");
|
||
|
||
#endif |