code alignment

This commit is contained in:
Andreas Abel
2020-05-11 23:39:33 +02:00
parent 47101197a9
commit b40b898de8
7 changed files with 72 additions and 20 deletions

View File

@@ -140,6 +140,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
| `-warm_up_count <n>` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded. This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` |
| `-initial_warm_up_count <n>` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` |
| `-alignment_offset <n>` | By default, the code to be benchmarked is aligned to 64 bytes. This parameter specifies an additional offset (in bytes) that is added to this alignment. `[Default: n=0]` |
| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` |
| `-median` | Selects the median as the aggregate function. |
| `-min` | Selects the minimum as the aggregate function. |

View File

@@ -16,6 +16,7 @@ long unroll_count = UNROLL_COUNT_DEFAULT;
long loop_count = LOOP_COUNT_DEFAULT;
long warm_up_count = WARM_UP_COUNT_DEFAULT;
long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT;
size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
int no_mem = NO_MEM_DEFAULT;
int basic_mode = BASIC_MODE_DEFAULT;
@@ -408,7 +409,7 @@ void configure_MSRs(struct msr_config config) {
}
size_t get_required_runtime_code_length() {
size_t req_code_length = code_length;
size_t req_code_length = code_length + alignment_offset + 64;
for (size_t i=0; i+7<code_length; i++) {
if (starts_with_magic_bytes(&code[i], MAGIC_BYTES_CODE_PFC_START) || starts_with_magic_bytes(&code[i], MAGIC_BYTES_CODE_PFC_STOP)) {
req_code_length += 100;
@@ -417,12 +418,25 @@ size_t get_required_runtime_code_length() {
return code_init_length + 2*unroll_count*req_code_length + 10000;
}
// Returns the number of template bytes between position templateI and the next occurrence of
// MAGIC_BYTES_CODE in measurement_template. Occurrences of MAGIC_BYTES_PFC_START that are
// encountered on the way are skipped (8 bytes each) and do not contribute to the result.
// The scan has no upper bound: the template must contain MAGIC_BYTES_CODE at or after templateI.
size_t get_distance_to_code(char* measurement_template, size_t templateI) {
    size_t n_counted = 0;
    size_t pos = templateI;

    for (;;) {
        char* cur = &measurement_template[pos];

        if (starts_with_magic_bytes(cur, MAGIC_BYTES_CODE)) {
            return n_counted;
        }

        if (starts_with_magic_bytes(cur, MAGIC_BYTES_PFC_START)) {
            // Step over the 8-byte marker without counting it.
            pos += 8;
            continue;
        }

        pos++;
        n_counted++;
    }
}
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count) {
size_t templateI = 0;
size_t codeI = 0;
long unrollI = 0;
size_t rcI = 0;
size_t rcI_loop_start = 0;
size_t rcI_code_start = 0;
size_t magic_bytes_pfc_start_I = 0;
size_t magic_bytes_code_I = 0;
@@ -443,6 +457,18 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
templateI += 8;
memcpy(&runtime_code[rcI], code_init, code_init_length);
rcI += code_init_length;
if (local_loop_count > 0) {
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7';
*(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count
}
size_t dist = get_distance_to_code(measurement_template, templateI);
size_t nFill = (64 - ((uintptr_t)&runtime_code[rcI+dist] % 64)) % 64;
nFill += alignment_offset;
for (size_t i=0; i<nFill; i++) {
runtime_code[rcI++] = '\x90';
}
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_START)) {
magic_bytes_pfc_start_I = templateI;
templateI += 8;
@@ -451,11 +477,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
templateI += 8;
if (unrollI == 0 && codeI == 0) {
if (local_loop_count > 0) {
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7';
*(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count
rcI_loop_start = rcI;
}
rcI_code_start = rcI;
}
if (!code_contains_magic_bytes) {
@@ -488,7 +510,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
if (local_loop_count > 0) {
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xFF'; runtime_code[rcI++] = '\xCF'; // dec R15
runtime_code[rcI++] = '\x0F'; runtime_code[rcI++] = '\x85';
*(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_loop_start-rcI-4); rcI += 4; // jnz loop_start
*(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_code_start-rcI-4); rcI += 4; // jnz loop_start
}
if (debug) {

View File

@@ -1,5 +1,5 @@
// nanoBench
//
//
// Copyright (C) 2019 Andreas Abel
//
// This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License.
@@ -12,7 +12,7 @@
#ifndef NANOBENCH_H
#define NANOBENCH_H
#ifdef __KERNEL__
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/sort.h>
#else
@@ -21,7 +21,7 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string.h>
#endif
#include <cpuid.h>
@@ -91,12 +91,16 @@ extern long warm_up_count;
extern long initial_warm_up_count;
#define INITIAL_WARM_UP_COUNT_DEFAULT 0;
// By default, the code to be benchmarked is aligned to 64 bytes. This parameter specifies an additional offset (in bytes) relative to this alignment.
extern size_t alignment_offset;
#define ALIGNMENT_OFFSET_DEFAULT 0;
// If enabled, the temporary performance counter values are stored in registers instead of in memory;
// the code to be measured must then not use registers R8-R13
extern int no_mem;
#define NO_MEM_DEFAULT 0;
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
// measurements.
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference
// between the two measurements.
@@ -129,7 +133,7 @@ struct pfc_config {
unsigned long cmask;
unsigned int any;
unsigned int edge;
unsigned int inv;
unsigned int inv;
unsigned long msr_3f6h;
unsigned long msr_pf;
unsigned long msr_rsp0;
@@ -290,7 +294,7 @@ void one_time_init_template(void);
"mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \
".att_syntax noprefix");
#define RESTORE_REGS_FLAGS() \
asm volatile( \
".intel_syntax noprefix\n" \

View File

@@ -83,6 +83,9 @@ while [ "$1" ]; do
elif [[ "$1" == -initial* ]]; then
echo "$2" > /sys/nb/initial_warm_up
shift 2
elif [[ "$1" == -al* ]]; then
echo "$2" > /sys/nb/alignment_offset
shift 2
elif [[ "$1" == -min* ]]; then
echo "min" > /sys/nb/agg
shift
@@ -105,6 +108,7 @@ while [ "$1" ]; do
echo " -loop_count <n>: Number of iterations of the inner loop."
echo " -warm_up_count <n>: Number of runs before the first measurement gets recorded."
echo " -initial_warm_up_count <n>: Number of runs before any measurement is performed."
echo " -alignment_offset <n>: Alignment offset."
echo " -avg: Selects the arithmetic mean as the aggregate function."
echo " -median: Selects the median as the aggregate function."
echo " -min: Selects the minimum as the aggregate function."

View File

@@ -233,6 +233,15 @@ static ssize_t initial_warm_up_store(struct kobject *kobj, struct kobj_attribute
}
static struct kobj_attribute initial_warm_up_attribute =__ATTR(initial_warm_up, 0660, initial_warm_up_show, initial_warm_up_store);
// sysfs show handler: writes the current value of alignment_offset to buf as an unsigned decimal
// number followed by a newline, and returns the number of characters written.
static ssize_t alignment_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    const int n_written = sprintf(buf, "%zu\n", alignment_offset);
    return n_written;
}
// sysfs store handler: parses an unsigned decimal number from the start of buf and assigns it to
// alignment_offset.
// Returns count if a value was parsed. If buf does not start with a number, alignment_offset is
// left unchanged and -EINVAL is returned, so that the failed write is visible to the writer
// instead of being reported as a success.
static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
    size_t new_offset = 0;
    if (sscanf(buf, "%zu", &new_offset) != 1) {
        return -EINVAL;
    }
    alignment_offset = new_offset;
    return count;
}
// sysfs attribute named "alignment_offset" (mode 0660) that uses alignment_offset_show for reads
// and alignment_offset_store for writes.
static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store);
static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
return sprintf(buf, "%u\n", basic_mode);
}
@@ -618,6 +627,7 @@ static int __init nb_init(void) {
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);

View File

@@ -53,7 +53,8 @@ paramDict = dict()
# Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
# Otherwise, reset() needs to be called first.
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None,
warmUpCount=None, initialWarmUpCount=None, aggregateFunction=None, basicMode=None, noMem=None, codeOffset=0, verbose=None):
warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None,
verbose=None):
if not ramdiskCreated: createRamdisk()
if config is not None:
@@ -97,6 +98,16 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
writeFile('/sys/nb/initial_warm_up', str(initialWarmUpCount))
paramDict['initialWarmUpCount'] = initialWarmUpCount
if alignmentOffset is not None:
if paramDict.get('alignmentOffset', None) != alignmentOffset:
writeFile('/sys/nb/alignment_offset', str(alignmentOffset))
paramDict['alignmentOffset'] = alignmentOffset
if codeOffset is not None:
if paramDict.get('codeOffset', None) != codeOffset:
writeFile('/sys/nb/code_offset', str(codeOffset))
paramDict['codeOffset'] = codeOffset
if aggregateFunction is not None:
if paramDict.get('aggregateFunction', None) != aggregateFunction:
writeFile('/sys/nb/agg', aggregateFunction)
@@ -112,11 +123,6 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
writeFile('/sys/nb/no_mem', str(int(noMem)))
paramDict['noMem'] = noMem
if codeOffset is not None:
if paramDict.get('codeOffset', None) != codeOffset:
writeFile('/sys/nb/code_offset', str(codeOffset))
paramDict['codeOffset'] = codeOffset
if verbose is not None:
if paramDict.get('verbose', None) != verbose:
writeFile('/sys/nb/verbose', str(int(verbose)))

View File

@@ -33,6 +33,7 @@ void print_usage() {
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
printf(" -warm_up_count <n>: Number of runs before the first measurement gets recorded.\n");
printf(" -initial_warm_up_count <n>: Number of runs before any measurement is performed.\n");
printf(" -alignment_offset <n>: Alignment offset.\n");
printf(" -avg: Selects the arithmetic mean as the aggregate function.\n");
printf(" -median: Selects the median as the aggregate function.\n");
printf(" -min: Selects the minimum as the aggregate function.\n");
@@ -75,6 +76,7 @@ int main(int argc, char **argv) {
{"loop_count", required_argument, 0, 'l'},
{"warm_up_count", required_argument, 0, 'w'},
{"initial_warm_up_count", required_argument, 0, 'a'},
{"alignment_offset", required_argument, 0, 'm'},
{"avg", no_argument, &aggregate_function, AVG_20_80},
{"median", no_argument, &aggregate_function, MED},
{"min", no_argument, &aggregate_function, MIN},
@@ -125,6 +127,9 @@ int main(int argc, char **argv) {
case 'a':
initial_warm_up_count = atol(optarg);
break;
case 'm':
alignment_offset = (size_t)atol(optarg);
break;
case 'p':
cpu = atol(optarg);
break;