// nanoBench
//
// Copyright (C) 2019 Andreas Abel
//
// This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
#ifndef NANOBENCH_H
#define NANOBENCH_H
#ifdef __KERNEL__
#include
#include
#else
#include
#include
#include
#include
#include
#include
#endif
#include
// Logging and helper macros that hide the differences between the kernel-module build (__KERNEL__ defined) and the user-space build.
// The statement-like macros expand to exactly one statement (do { ... } while (0)), so they are safe as the body of an unbraced
// if/else; always invoke them with a trailing semicolon.
#ifdef __KERNEL__
// Kernel build: all messages go through pr_debug().
#define print_error(...) pr_debug(__VA_ARGS__)
// Emits the message only if the global 'verbose' flag is set.
#define print_verbose(...) do { if (verbose) pr_debug(__VA_ARGS__); } while (0)
// In the kernel build this message is emitted regardless of 'verbose'.
#define print_user_verbose(...) pr_debug(__VA_ARGS__)
// Parses the string s in the given base and stores the result in *res.
// NOTE: here the expression yields kstrtoul()'s return value, whereas the user-space variant yields the parsed value; do not rely on it.
#define nb_strtoul(s, base, res) kstrtoul(s, base, res)
// Maps the qsort() call signature onto the kernel's sort(), passing NULL as its additional last argument.
#define qsort(base, n, size, comp) sort(base, n, size, comp, NULL)
#else
// Writes the formatted message, followed by a newline, to stderr.
#define print_error(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
// Write the formatted message to stdout only if the global 'verbose' flag is set.
#define print_verbose(...) do { if (verbose) printf(__VA_ARGS__); } while (0)
#define print_user_verbose(...) do { if (verbose) printf(__VA_ARGS__); } while (0)
// Parses the string s in the given base and stores the result in *res; no error checking is performed.
#define nb_strtoul(s, base, res) (*(res) = strtoul((s), NULL, (base)))
#endif
// Fallback addresses of the model-specific registers (MSRs) used by nanoBench.
// Each macro is only defined here if no previously included header has defined it already.

// Programmable performance counters (MSR_IA32_* / MSR_OFFCORE_* / MSR_PEBS_* naming as in Intel's documentation).
#if !defined(MSR_IA32_PMC0)
#define MSR_IA32_PMC0 0x0C1
#endif
#if !defined(MSR_IA32_PERFEVTSEL0)
#define MSR_IA32_PERFEVTSEL0 0x186
#endif

// Off-core response registers.
#if !defined(MSR_OFFCORE_RSP0)
#define MSR_OFFCORE_RSP0 0x1A6
#endif
#if !defined(MSR_OFFCORE_RSP1)
#define MSR_OFFCORE_RSP1 0x1A7
#endif

// Fixed-function counters and global counter control.
#if !defined(MSR_IA32_FIXED_CTR0)
#define MSR_IA32_FIXED_CTR0 0x309
#endif
#if !defined(MSR_IA32_FIXED_CTR_CTRL)
#define MSR_IA32_FIXED_CTR_CTRL 0x38D
#endif
#if !defined(MSR_IA32_PERF_GLOBAL_CTRL)
#define MSR_IA32_PERF_GLOBAL_CTRL 0x38F
#endif

#if !defined(MSR_PEBS_FRONTEND)
#define MSR_PEBS_FRONTEND 0x3F7
#endif

// Performance event-select / counter register pair in the 0xC001_02xx range.
// NOTE(review): presumably the AMD core performance-counter MSRs -- confirm against the AMD processor documentation.
#if !defined(CORE_X86_MSR_PERF_CTL)
#define CORE_X86_MSR_PERF_CTL 0xC0010200
#endif
#if !defined(CORE_X86_MSR_PERF_CTR)
#define CORE_X86_MSR_PERF_CTR 0xC0010201
#endif
// Repetition parameters of a benchmark run.
// The *_DEFAULT macros are plain constant expressions (no trailing semicolon), so they can be used both in initializers and inside
// larger expressions.

// How often the measurement will be repeated.
extern long n_measurements;
#define N_MEASUREMENTS_DEFAULT 10
// How often the code to be measured will be unrolled.
extern long unroll_count;
#define UNROLL_COUNT_DEFAULT 1000
// Number of iterations of the inner loop. Setting this to 0 will disable the inner loop; the code to be measured is then executed
// unroll_count many times.
extern long loop_count;
#define LOOP_COUNT_DEFAULT 0
// Number of executions of the measurement code before each sequence of measurement runs.
extern long warm_up_count;
#define WARM_UP_COUNT_DEFAULT 5
// Number of executions of the measurement code before the first measurement.
extern long initial_warm_up_count;
#define INITIAL_WARM_UP_COUNT_DEFAULT 0
// Code-layout and measurement-mode options.
// The *_DEFAULT macros are plain constant expressions (no trailing semicolon), so they can be used both in initializers and inside
// larger expressions.

// By default, the code to be benchmarked is aligned to 64 bytes. This parameter specifies an offset to this alignment.
extern size_t alignment_offset;
#define ALIGNMENT_OFFSET_DEFAULT 0
// If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions.
extern int drain_frontend;
#define DRAIN_FRONTEND_DEFAULT 0
// If enabled, the temporary performance counter values are stored in registers instead of in memory;
// the code to be measured must then not use registers R8-R13.
extern int no_mem;
#define NO_MEM_DEFAULT 0
// If enabled, the measurement results are not divided by the number of repetitions.
extern int no_normalization;
#define NO_NORMALIZATION_DEFAULT 0
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the
// difference between the two measurements.
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result
// is the difference between the two measurements.
extern int basic_mode;
#define BASIC_MODE_DEFAULT 0
// If enabled, the result includes measurements using the fixed-function performance counters and the RDTSC instruction.
extern int use_fixed_counters;
#define USE_FIXED_COUNTERS_DEFAULT 0
// Selects how the values of the individual measurement runs are combined into one result.
// NOTE(review): AVG_20_80 is presumably the average over the values between the 20th and 80th percentile; MIN/MAX/MED presumably
// minimum, maximum, and median -- confirm in get_aggregate_value().
enum agg_enum {AVG_20_80, MIN, MAX, MED};
// Currently selected aggregate function; holds one of the agg_enum values.
extern int aggregate_function;
// Plain constant expression (no trailing semicolon), usable in initializers and comparisons alike.
#define AGGREGATE_FUNCTION_DEFAULT AVG_20_80
// The *_DEFAULT macros below are plain constant expressions (no trailing semicolon), usable in initializers and expressions alike.

// If non-zero, print_verbose() (and, in user space, print_user_verbose()) emit their messages.
extern int verbose;
#define VERBOSE_DEFAULT 0
// Whether to generate a breakpoint trap after executing the code to be benchmarked.
extern int debug;
#define DEBUG_DEFAULT 0
// Code sequences supplied for a benchmark, each paired with its length.
// NOTE(review): presumably raw machine-code bytes with the lengths given in bytes -- confirm against the code that fills these buffers.

// The code to be measured.
extern char* code;
extern size_t code_length;
// Initialization code. NOTE(review): presumably executed before the measured code and not part of the measured region -- confirm.
extern char* code_init;
extern size_t code_init_length;
// Late initialization code; it precedes 'code' (the optional front-end drain sequence is placed between code_late_init and code).
extern char* code_late_init;
extern size_t code_late_init_length;
// One-time initialization code; see create_and_run_one_time_init_code().
extern char* code_one_time_init;
extern size_t code_one_time_init_length;
// Configuration of one performance-counter event.
// NOTE(review): the field meanings below are inferred from the field names and from the MSR macros defined in this header;
// confirm them against parse_counter_configs() and configure_perf_ctrs_programmable().
struct pfc_config {
// Event number (presumably the event-select value).
unsigned long evt_num;
// Unit mask.
unsigned long umask;
// Counter mask.
unsigned long cmask;
// Presumably the "any thread", "edge detect", and "invert" bits of the event-select register.
unsigned int any;
unsigned int edge;
unsigned int inv;
// Presumably the value to be written to MSR 0x3F6 for this event.
unsigned long msr_3f6h;
// Presumably the value for MSR_PEBS_FRONTEND.
unsigned long msr_pf;
// Presumably the values for MSR_OFFCORE_RSP0 and MSR_OFFCORE_RSP1.
unsigned long msr_rsp0;
unsigned long msr_rsp1;
// Presumably the counter this event is tied to -- TODO confirm the exact semantics.
unsigned int ctr;
// Human-readable description of the event.
char* description;
};
// The event configurations and the number of entries in use.
extern struct pfc_config pfc_configs[];
extern size_t n_pfc_configs;
// NOTE(review): presumably the raw text of the counter configuration file that parse_counter_configs() processes -- confirm.
extern char* pfc_config_file_content;
// Configuration of one MSR-based measurement.
// NOTE(review): the field meanings below are inferred from the field names; confirm them against parse_msr_configs() and
// configure_MSRs().
struct msr_config {
// Presumably the MSR that is read to obtain the measurement value.
unsigned long rdmsr;
// Parallel arrays (at most 10 entries each): presumably the MSRs to be written beforehand and the values to write to them.
unsigned long wrmsr[10];
unsigned long wrmsr_val[10];
// Presumably the number of entries of wrmsr/wrmsr_val that are in use.
size_t n_wrmsr;
// Human-readable description of the configuration.
char* description;
};
// The MSR configurations and the number of entries in use.
extern struct msr_config msr_configs[];
extern size_t n_msr_configs;
// NOTE(review): presumably the raw text of the MSR configuration file that parse_msr_configs() processes -- confirm.
extern char* msr_config_file_content;
// CPU vendor flags. NOTE(review): presumably set by check_cpuid() -- confirm in the implementation.
extern int is_Intel_CPU;
extern int is_AMD_CPU;
// Upper bound on the number of programmable counters; used as the size of pfc_mem and of the measurement_results arrays.
#define MAX_PROGRAMMABLE_COUNTERS 8
// Number of programmable counters of the current CPU (check_cpuid() is documented to determine it).
extern int n_programmable_counters;
// Pointers to memory regions that are writable and executable.
extern char* runtime_code;
extern char* runtime_one_time_init_code;
// 1 MiB. NOTE(review): presumably the size in bytes of each of the runtime_r* memory areas below -- confirm at the allocation site.
#define RUNTIME_R_SIZE (1024*1024)
// During measurements, R14, RBP, RDI, RSI, and RSP will contain these addresses plus RUNTIME_R_SIZE/2.
// If r14_size is set in the kernel module, R14 will not have this offset.
extern void* runtime_r14;
extern void* runtime_rbp;
extern void* runtime_rdi;
extern void* runtime_rsi;
extern void* runtime_rsp;
// Stores performance counter values during measurements (one slot per programmable counter).
extern int64_t pfc_mem[MAX_PROGRAMMABLE_COUNTERS];
// Stores the RSP during measurements.
extern void* RSP_mem;
// Per-counter pointers to arrays of measurement values.
// NOTE(review): presumably measurement_results holds the values of the main run and measurement_results_base those of the baseline
// run whose difference is reported (see the description of basic_mode) -- confirm in the implementation.
extern int64_t* measurement_results[MAX_PROGRAMMABLE_COUNTERS];
extern int64_t* measurement_results_base[MAX_PROGRAMMABLE_COUNTERS];
// Process should be pinned to this CPU.
extern int cpu;
// Checks whether we have an Intel or AMD CPU and determines the number of programmable counters.
// Returns 0 if successful, 1 otherwise.
// NOTE(review): presumably sets is_Intel_CPU, is_AMD_CPU, and n_programmable_counters -- confirm in the implementation.
int check_cpuid(void);
// NOTE(review): presumably parses pfc_config_file_content into pfc_configs / n_pfc_configs -- confirm.
void parse_counter_configs(void);
// NOTE(review): presumably parses msr_config_file_content into msr_configs / n_msr_configs -- confirm.
void parse_msr_configs(void);
// Returns a 64-bit value obtained via cmd.
// NOTE(review): presumably cmd is a command line whose output is parsed as a number -- confirm.
uint64_t read_value_from_cmd(char* cmd);
// Read and write a model-specific register; msr is the register address (e.g., one of the MSR_* macros of this header).
// NOTE(review): which CPU is accessed and how errors are reported is not visible here -- confirm in the implementation.
uint64_t read_msr(unsigned int msr);
void write_msr(unsigned int msr, uint64_t value);
// Enables and clears the fixed-function performance counters.
// NOTE(review): usr and os presumably select whether user-mode and/or kernel-mode events are counted -- confirm.
void configure_perf_ctrs_FF_Intel(unsigned int usr, unsigned int os);
// Clears the programmable performance counters and writes the configurations to the corresponding MSRs.
// next_pfc_config is an index into the pfc_configs array; the function takes up to n_counters many configurations from this array;
// it returns the index of the next configuration, and writes the descriptions of the applicable configurations to the corresponding array.
// NOTE(review): descriptions presumably needs room for at least n_counters entries; usr/os as for configure_perf_ctrs_FF_Intel -- confirm.
size_t configure_perf_ctrs_programmable(size_t next_pfc_config, int n_counters, unsigned int usr, unsigned int os, char* descriptions[]);
// Applies the given MSR configuration; note that the struct is passed by value.
// NOTE(review): presumably performs the n_wrmsr register writes listed in config -- confirm.
void configure_MSRs(struct msr_config config);
// NOTE(review): the descriptions in this group are inferred from the names and signatures only; confirm them in the implementation.

// Presumably returns the buffer size needed for the generated runtime code.
size_t get_required_runtime_code_length(void);
// Presumably generates the code in runtime_code from the given measurement template, using the given unroll and loop counts.
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count);
// See initial_warm_up_count.
void run_initial_warmup_experiment(void);
// Presumably generates the runtime code from measurement_template, executes it, and stores the values of n_counters counters in results.
void run_experiment(char* measurement_template, int64_t* results[], int n_counters, long local_unroll_count, long local_loop_count);
// See code_one_time_init.
void create_and_run_one_time_init_code(void);
// Presumably formats the result for the given counter, labelled with desc, into buf (capacity buf_len) and returns the string.
char* compute_result_str(char* buf, size_t buf_len, char* desc, int counter);
// Presumably combines 'length' values according to aggregate_function; the role of 'scale' is not visible here -- TODO confirm.
int64_t get_aggregate_value(int64_t* values, size_t length, size_t scale);
// Comparison function with the signature expected by qsort(); presumably compares two int64_t values.
int cmpInt64(const void *a, const void *b);
// Presumably returns the absolute value of val.
long long ll_abs(long long val);
void print_all_measurement_results(int64_t* results[], int n_counters);
// 64-bit marker constants; they all share the low 56 bits (0xB513B1C2813F04) and differ only in the most significant hex digit.
// SAVE_REGS_FLAGS() and RESTORE_REGS_FLAGS() embed some of them as immediates via STRINGIFY, and starts_with_magic_bytes() takes such
// a value as its second argument.
// NOTE(review): presumably these mark the positions in the measurement templates that are replaced by real code or addresses when the
// runtime code is generated -- confirm in create_runtime_code().
// The values are intentionally written without an integer suffix: STRINGIFY pastes them verbatim into assembly text, where a C suffix
// such as ULL would not be accepted.
#define MAGIC_BYTES_INIT 0x10B513B1C2813F04
#define MAGIC_BYTES_CODE 0x20B513B1C2813F04
#define MAGIC_BYTES_RSP_ADDRESS 0x30B513B1C2813F04
#define MAGIC_BYTES_RUNTIME_R14 0x40B513B1C2813F04
#define MAGIC_BYTES_RUNTIME_RBP 0x50B513B1C2813F04
#define MAGIC_BYTES_RUNTIME_RDI 0x60B513B1C2813F04
#define MAGIC_BYTES_RUNTIME_RSI 0x70B513B1C2813F04
#define MAGIC_BYTES_RUNTIME_RSP 0x80B513B1C2813F04
#define MAGIC_BYTES_PFC 0x90B513B1C2813F04
#define MAGIC_BYTES_MSR 0xA0B513B1C2813F04
#define MAGIC_BYTES_TEMPLATE_END 0xB0B513B1C2813F04
#define MAGIC_BYTES_PFC_START 0xC0B513B1C2813F04
#define MAGIC_BYTES_PFC_END 0xD0B513B1C2813F04
#define MAGIC_BYTES_CODE_PFC_START 0xE0B513B1C2813F04
#define MAGIC_BYTES_CODE_PFC_STOP 0xF0B513B1C2813F04
// Two-step stringification: STRINGIFY(M) first macro-expands its argument and then turns the expansion into a string literal
// (e.g., for embedding the value of a MAGIC_BYTES_* macro in inline assembly); STRINGIFY2 stringifies its argument as written.
#define STRINGIFY2(token) #token
#define STRINGIFY(macro) STRINGIFY2(macro)
// NOTE(review): presumably returns non-zero if the 8 bytes at c are equal to magic_bytes (one of the MAGIC_BYTES_* values), and 0
// otherwise -- confirm in the implementation.
int starts_with_magic_bytes(char* c, int64_t magic_bytes);
// Measurement templates.
// NOTE(review): presumably these functions are not meant to be called directly; their machine code appears to serve as the
// measurement_template argument of create_runtime_code()/run_experiment() -- confirm. The _noMem variants presumably correspond to
// the no_mem option, and the _2/_4 suffixes presumably to the number of counters read per run -- confirm.
// The following functions must not use global variables (or anything that uses RIP-relative addressing)
void measurement_template_Intel_2(void);
void measurement_template_Intel_4(void);
void measurement_template_Intel_noMem_2(void);
void measurement_template_Intel_noMem_4(void);
void measurement_template_AMD(void);
void measurement_template_AMD_noMem(void);
void measurement_FF_template_Intel(void);
void measurement_FF_template_Intel_noMem(void);
void measurement_FF_template_AMD(void);
void measurement_FF_template_AMD_noMem(void);
void measurement_RDTSC_template(void);
void measurement_RDTSC_template_noMem(void);
void measurement_RDMSR_template(void);
void measurement_RDMSR_template_noMem(void);
void one_time_init_template(void);
void initial_warm_up_template(void);
// Saves the caller's register state and sets up the register state for a measurement. In order, the emitted code
//  - pushes RBX, RBP, R12, R13, R14, R15, and the flags (pushfq) onto the current stack,
//  - stores the resulting RSP at the address given by the MAGIC_BYTES_RSP_ADDRESS immediate (using R15 as scratch register),
//  - sets RAX, RBX, RCX, RDX, R8-R13, and R15 to 0,
//  - loads R14, RBP, RDI, RSI, and RSP with the corresponding MAGIC_BYTES_RUNTIME_* immediates.
// NOTE(review): the MAGIC_BYTES_* immediates are presumably replaced by real addresses (RSP_mem, runtime_r14, ...) when the runtime
// code is generated -- confirm in create_runtime_code().
// The block switches the assembler to Intel syntax and ends with ".att_syntax noprefix". The expansion already ends in a semicolon.
// Counterpart: RESTORE_REGS_FLAGS().
// RBX, RBP, and R12–R15 are callee saved registers according to the "System V AMD64 ABI" (https://en.wikipedia.org/wiki/X86_calling_conventions)
#define SAVE_REGS_FLAGS() \
asm volatile( \
".intel_syntax noprefix\n" \
"push rbx\n" \
"push rbp\n" \
"push r12\n" \
"push r13\n" \
"push r14\n" \
"push r15\n" \
"pushfq\n" \
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
"mov [r15], rsp\n" \
"mov rax, 0\n" \
"mov rbx, 0\n" \
"mov rcx, 0\n" \
"mov rdx, 0\n" \
"mov r8, 0\n" \
"mov r9, 0\n" \
"mov r10, 0\n" \
"mov r11, 0\n" \
"mov r12, 0\n" \
"mov r13, 0\n" \
"mov r15, 0\n" \
"mov r14, "STRINGIFY(MAGIC_BYTES_RUNTIME_R14)"\n" \
"mov rbp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RBP)"\n" \
"mov rdi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RDI)"\n" \
"mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \
".att_syntax noprefix");
// Counterpart of SAVE_REGS_FLAGS(): reloads RSP from the address given by the MAGIC_BYTES_RSP_ADDRESS immediate (using R15 as scratch
// register, which is restored afterwards), then pops the flags (popfq) and R15, R14, R13, R12, RBP, and RBX, i.e., in the reverse
// order of the pushes in SAVE_REGS_FLAGS().
// NOTE(review): the MAGIC_BYTES_RSP_ADDRESS immediate is presumably replaced by the address of RSP_mem when the runtime code is
// generated -- confirm in create_runtime_code().
// The block switches the assembler to Intel syntax and ends with ".att_syntax noprefix". The expansion already ends in a semicolon.
#define RESTORE_REGS_FLAGS() \
asm volatile( \
".intel_syntax noprefix\n" \
"mov r15, "STRINGIFY(MAGIC_BYTES_RSP_ADDRESS)"\n" \
"mov rsp, [r15]\n" \
"popfq\n" \
"pop r15\n" \
"pop r14\n" \
"pop r13\n" \
"pop r12\n" \
"pop rbp\n" \
"pop rbx\n" \
".att_syntax noprefix");
#endif