diff --git a/README.md b/README.md index 7270e72..922155a 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line | `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` | | `-warm_up_count <n>` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded . This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` | | `-initial_warm_up_count <n>` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` | +| `-alignment_offset <n>` | By default, the code to be benchmarked is aligned to 64 bytes. This parameter allows specifying an additional offset. `[Default: n=0]` | | `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` | | `-median` | Selects the median as the aggregate function. | | `-min` | Selects the minimum as the aggregate function. 
| diff --git a/common/nanoBench.c b/common/nanoBench.c index 5bb1b5e..64c78f8 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -16,6 +16,7 @@ long unroll_count = UNROLL_COUNT_DEFAULT; long loop_count = LOOP_COUNT_DEFAULT; long warm_up_count = WARM_UP_COUNT_DEFAULT; long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT; +size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT; int no_mem = NO_MEM_DEFAULT; int basic_mode = BASIC_MODE_DEFAULT; @@ -408,7 +409,7 @@ void configure_MSRs(struct msr_config config) { } size_t get_required_runtime_code_length() { - size_t req_code_length = code_length; + size_t req_code_length = code_length + alignment_offset + 64; for (size_t i=0; i+7 0) { + runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7'; + *(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count + } + + size_t dist = get_distance_to_code(measurement_template, templateI); + size_t nFill = (64 - ((uintptr_t)&runtime_code[rcI+dist] % 64)) % 64; + nFill += alignment_offset; + for (size_t i=0; i 0) { - runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7'; - *(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count - rcI_loop_start = rcI; - } + rcI_code_start = rcI; } if (!code_contains_magic_bytes) { @@ -488,7 +510,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo if (local_loop_count > 0) { runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xFF'; runtime_code[rcI++] = '\xCF'; // dec R15 runtime_code[rcI++] = '\x0F'; runtime_code[rcI++] = '\x85'; - *(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_loop_start-rcI-4); rcI += 4; // jnz loop_start + *(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_code_start-rcI-4); rcI += 4; // jnz loop_start } if (debug) { diff --git a/common/nanoBench.h b/common/nanoBench.h index 8c065a8..259671b 100644 --- a/common/nanoBench.h +++ 
b/common/nanoBench.h @@ -1,5 +1,5 @@ // nanoBench -// +// // Copyright (C) 2019 Andreas Abel // // This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License. @@ -12,7 +12,7 @@ #ifndef NANOBENCH_H #define NANOBENCH_H -#ifdef __KERNEL__ +#ifdef __KERNEL__ #include #include #else @@ -21,7 +21,7 @@ #include #include #include - #include + #include #endif #include @@ -91,12 +91,16 @@ extern long warm_up_count; extern long initial_warm_up_count; #define INITIAL_WARM_UP_COUNT_DEFAULT 0; +// By default, the code to be benchmarked is aligned to 64 bytes. This parameter allows specifying an offset to this alignment. +extern size_t alignment_offset; +#define ALIGNMENT_OFFSET_DEFAULT 0; + // If enabled, the temporary performance counter values are stored in registers instead of in memory; // the code to be measured must then not use registers R8-R13 extern int no_mem; #define NO_MEM_DEFAULT 0; -// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two +// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two // measurements. // If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference // between the two measurements. 
@@ -129,7 +133,7 @@ struct pfc_config { unsigned long cmask; unsigned int any; unsigned int edge; - unsigned int inv; + unsigned int inv; unsigned long msr_3f6h; unsigned long msr_pf; unsigned long msr_rsp0; @@ -290,7 +294,7 @@ void one_time_init_template(void); "mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \ "mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \ ".att_syntax noprefix"); - + #define RESTORE_REGS_FLAGS() \ asm volatile( \ ".intel_syntax noprefix\n" \ diff --git a/kernel-nanoBench.sh b/kernel-nanoBench.sh index c41af72..5301db3 100755 --- a/kernel-nanoBench.sh +++ b/kernel-nanoBench.sh @@ -83,6 +83,9 @@ while [ "$1" ]; do elif [[ "$1" == -initial* ]]; then echo "$2" > /sys/nb/initial_warm_up shift 2 + elif [[ "$1" == -al* ]]; then + echo "$2" > /sys/nb/alignment_offset + shift 2 elif [[ "$1" == -min* ]]; then echo "min" > /sys/nb/agg shift @@ -105,6 +108,7 @@ while [ "$1" ]; do echo " -loop_count : Number of iterations of the inner loop." echo " -warm_up_count : Number of runs before the first measurement gets recorded." echo " -initial_warm_up_count : Number of runs before any measurement is performed." + echo " -alignment_offset : Alignment offset." echo " -avg: Selects the arithmetic mean as the aggregate function." echo " -median: Selects the median as the aggregate function." echo " -min: Selects the minimum as the aggregate function." 
diff --git a/kernel/nb_km.c b/kernel/nb_km.c index baf423a..eb020c9 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -233,6 +233,15 @@ static ssize_t initial_warm_up_store(struct kobject *kobj, struct kobj_attribute } static struct kobj_attribute initial_warm_up_attribute =__ATTR(initial_warm_up, 0660, initial_warm_up_show, initial_warm_up_store); +static ssize_t alignment_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + return sprintf(buf, "%zu\n", alignment_offset); +} +static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + sscanf(buf, "%zu", &alignment_offset); + return count; +} +static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store); + static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", basic_mode); } @@ -618,6 +627,7 @@ static int __init nb_init(void) { error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr); error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr); error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr); + error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr); error |= sysfs_create_file(nb_kobject, &agg_attribute.attr); error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr); error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr); diff --git a/kernelNanoBench.py b/kernelNanoBench.py index 6ccfcc2..f7e9853 100644 --- a/kernelNanoBench.py +++ b/kernelNanoBench.py @@ -53,7 +53,8 @@ paramDict = dict() # Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters(). # Otherwise, reset() needs to be called first. 
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None, - warmUpCount=None, initialWarmUpCount=None, aggregateFunction=None, basicMode=None, noMem=None, codeOffset=0, verbose=None): + warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None, + verbose=None): if not ramdiskCreated: createRamdisk() if config is not None: @@ -97,6 +98,16 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf writeFile('/sys/nb/initial_warm_up', str(initialWarmUpCount)) paramDict['initialWarmUpCount'] = initialWarmUpCount + if alignmentOffset is not None: + if paramDict.get('alignmentOffset', None) != alignmentOffset: + writeFile('/sys/nb/alignment_offset', str(alignmentOffset)) + paramDict['alignmentOffset'] = alignmentOffset + + if codeOffset is not None: + if paramDict.get('codeOffset', None) != codeOffset: + writeFile('/sys/nb/code_offset', str(codeOffset)) + paramDict['codeOffset'] = codeOffset + if aggregateFunction is not None: if paramDict.get('aggregateFunction', None) != aggregateFunction: writeFile('/sys/nb/agg', aggregateFunction) @@ -112,11 +123,6 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf writeFile('/sys/nb/no_mem', str(int(noMem))) paramDict['noMem'] = noMem - if codeOffset is not None: - if paramDict.get('codeOffset', None) != codeOffset: - writeFile('/sys/nb/code_offset', str(codeOffset)) - paramDict['codeOffset'] = codeOffset - if verbose is not None: if paramDict.get('verbose', None) != verbose: writeFile('/sys/nb/verbose', str(int(verbose))) diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index 4e95354..ac6f156 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -33,6 +33,7 @@ void print_usage() { printf(" -loop_count : Number of iterations of the inner loop.\n"); printf(" -warm_up_count : Number of runs before the 
first measurement gets recorded.\n"); printf(" -initial_warm_up_count : Number of runs before any measurement is performed.\n"); + printf(" -alignment_offset : Alignment offset.\n"); printf(" -avg: Selects the arithmetic mean as the aggregate function.\n"); printf(" -median: Selects the median as the aggregate function.\n"); printf(" -min: Selects the minimum as the aggregate function.\n"); @@ -75,6 +76,7 @@ int main(int argc, char **argv) { {"loop_count", required_argument, 0, 'l'}, {"warm_up_count", required_argument, 0, 'w'}, {"initial_warm_up_count", required_argument, 0, 'a'}, + {"alignment_offset", required_argument, 0, 'm'}, {"avg", no_argument, &aggregate_function, AVG_20_80}, {"median", no_argument, &aggregate_function, MED}, {"min", no_argument, &aggregate_function, MIN}, @@ -125,6 +127,9 @@ int main(int argc, char **argv) { case 'a': initial_warm_up_count = atol(optarg); break; + case 'm': + alignment_offset = (size_t)atol(optarg); + break; case 'p': cpu = atol(optarg); break;