mirror of
https://github.com/andreas-abel/nanoBench.git
synced 2025-12-16 03:20:08 +01:00
code alignment
This commit is contained in:
@@ -140,6 +140,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
|
||||
| `-loop_count <n>` | Number of iterations of the inner loop. If n>0, the code to be benchmarked **must not modify R15**, as this register contains the loop counter. If n=0, the instructions for the loop are omitted; the loop body is then executed once. `[Default: n=0]` |
|
||||
| `-warm_up_count <n>` | Number of runs of the generated benchmark code sequence (in each invocation of `run(...)`) before the first measurement result gets recorded. This can, for example, be useful for excluding outliers due to cold caches. `[Default: n=5]` |
|
||||
| `-initial_warm_up_count <n>` | Number of runs of the benchmark code sequence before the first invocation of `run(...)`. This can be useful for benchmarking instructions that require a warm-up period before they can execute at full speed, like [AVX2 instructions on some microarchitectures](https://www.agner.org/optimize/blog/read.php?i=415). `[Default: n=0]` |
|
||||
| `-alignment_offset <n>` | By default, the code to be benchmarked is aligned to 64 bytes. This parameter can be used to specify an additional offset (in bytes). `[Default: n=0]` |
|
||||
| `-avg` | Selects the arithmetic mean (excluding the top and bottom 20% of the values) as the aggregate function. `[This is the default]` |
|
||||
| `-median` | Selects the median as the aggregate function. |
|
||||
| `-min` | Selects the minimum as the aggregate function. |
|
||||
|
||||
@@ -16,6 +16,7 @@ long unroll_count = UNROLL_COUNT_DEFAULT;
|
||||
long loop_count = LOOP_COUNT_DEFAULT;
|
||||
long warm_up_count = WARM_UP_COUNT_DEFAULT;
|
||||
long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT;
|
||||
size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
|
||||
|
||||
int no_mem = NO_MEM_DEFAULT;
|
||||
int basic_mode = BASIC_MODE_DEFAULT;
|
||||
@@ -408,7 +409,7 @@ void configure_MSRs(struct msr_config config) {
|
||||
}
|
||||
|
||||
size_t get_required_runtime_code_length() {
|
||||
size_t req_code_length = code_length;
|
||||
size_t req_code_length = code_length + alignment_offset + 64;
|
||||
for (size_t i=0; i+7<code_length; i++) {
|
||||
if (starts_with_magic_bytes(&code[i], MAGIC_BYTES_CODE_PFC_START) || starts_with_magic_bytes(&code[i], MAGIC_BYTES_CODE_PFC_STOP)) {
|
||||
req_code_length += 100;
|
||||
@@ -417,12 +418,25 @@ size_t get_required_runtime_code_length() {
|
||||
return code_init_length + 2*unroll_count*req_code_length + 10000;
|
||||
}
|
||||
|
||||
size_t get_distance_to_code(char* measurement_template, size_t templateI) {
|
||||
size_t dist = 0;
|
||||
while (!starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_CODE)) {
|
||||
if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_START)) {
|
||||
templateI += 8;
|
||||
} else {
|
||||
templateI++;
|
||||
dist++;
|
||||
}
|
||||
}
|
||||
return dist;
|
||||
}
|
||||
|
||||
void create_runtime_code(char* measurement_template, long local_unroll_count, long local_loop_count) {
|
||||
size_t templateI = 0;
|
||||
size_t codeI = 0;
|
||||
long unrollI = 0;
|
||||
size_t rcI = 0;
|
||||
size_t rcI_loop_start = 0;
|
||||
size_t rcI_code_start = 0;
|
||||
size_t magic_bytes_pfc_start_I = 0;
|
||||
size_t magic_bytes_code_I = 0;
|
||||
|
||||
@@ -443,6 +457,18 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
templateI += 8;
|
||||
memcpy(&runtime_code[rcI], code_init, code_init_length);
|
||||
rcI += code_init_length;
|
||||
|
||||
if (local_loop_count > 0) {
|
||||
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7';
|
||||
*(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count
|
||||
}
|
||||
|
||||
size_t dist = get_distance_to_code(measurement_template, templateI);
|
||||
size_t nFill = (64 - ((uintptr_t)&runtime_code[rcI+dist] % 64)) % 64;
|
||||
nFill += alignment_offset;
|
||||
for (size_t i=0; i<nFill; i++) {
|
||||
runtime_code[rcI++] = '\x90';
|
||||
}
|
||||
} else if (starts_with_magic_bytes(&measurement_template[templateI], MAGIC_BYTES_PFC_START)) {
|
||||
magic_bytes_pfc_start_I = templateI;
|
||||
templateI += 8;
|
||||
@@ -451,11 +477,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
templateI += 8;
|
||||
|
||||
if (unrollI == 0 && codeI == 0) {
|
||||
if (local_loop_count > 0) {
|
||||
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xC7'; runtime_code[rcI++] = '\xC7';
|
||||
*(int32_t*)(&runtime_code[rcI]) = (int32_t)local_loop_count; rcI += 4; // mov R15, local_loop_count
|
||||
rcI_loop_start = rcI;
|
||||
}
|
||||
rcI_code_start = rcI;
|
||||
}
|
||||
|
||||
if (!code_contains_magic_bytes) {
|
||||
@@ -488,7 +510,7 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
|
||||
if (local_loop_count > 0) {
|
||||
runtime_code[rcI++] = '\x49'; runtime_code[rcI++] = '\xFF'; runtime_code[rcI++] = '\xCF'; // dec R15
|
||||
runtime_code[rcI++] = '\x0F'; runtime_code[rcI++] = '\x85';
|
||||
*(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_loop_start-rcI-4); rcI += 4; // jnz loop_start
|
||||
*(int32_t*)(&runtime_code[rcI]) = (int32_t)(rcI_code_start-rcI-4); rcI += 4; // jnz loop_start
|
||||
}
|
||||
|
||||
if (debug) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// nanoBench
|
||||
//
|
||||
//
|
||||
// Copyright (C) 2019 Andreas Abel
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify it under the terms of version 3 of the GNU Affero General Public License.
|
||||
@@ -12,7 +12,7 @@
|
||||
#ifndef NANOBENCH_H
|
||||
#define NANOBENCH_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#ifdef __KERNEL__
|
||||
#include <linux/module.h>
|
||||
#include <linux/sort.h>
|
||||
#else
|
||||
@@ -21,7 +21,7 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include <cpuid.h>
|
||||
@@ -91,12 +91,16 @@ extern long warm_up_count;
|
||||
extern long initial_warm_up_count;
|
||||
#define INITIAL_WARM_UP_COUNT_DEFAULT 0;
|
||||
|
||||
// By default, the code to be benchmarked is aligned to 64 bytes. This parameter specifies an additional offset (in bytes) relative to this alignment.
|
||||
extern size_t alignment_offset;
|
||||
#define ALIGNMENT_OFFSET_DEFAULT 0;
|
||||
|
||||
// If enabled, the temporary performance counter values are stored in registers instead of in memory;
|
||||
// the code to be measured must then not use registers R8-R13
|
||||
extern int no_mem;
|
||||
#define NO_MEM_DEFAULT 0;
|
||||
|
||||
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
|
||||
// If disabled, the first measurement is performed with 2*unroll_count and the second with unroll_count; the reported result is the difference between the two
|
||||
// measurements.
|
||||
// If enabled, the first measurement is performed with unroll_count and the second with an empty measurement body; the reported result is the difference
|
||||
// between the two measurements.
|
||||
@@ -129,7 +133,7 @@ struct pfc_config {
|
||||
unsigned long cmask;
|
||||
unsigned int any;
|
||||
unsigned int edge;
|
||||
unsigned int inv;
|
||||
unsigned int inv;
|
||||
unsigned long msr_3f6h;
|
||||
unsigned long msr_pf;
|
||||
unsigned long msr_rsp0;
|
||||
@@ -290,7 +294,7 @@ void one_time_init_template(void);
|
||||
"mov rsi, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSI)"\n" \
|
||||
"mov rsp, "STRINGIFY(MAGIC_BYTES_RUNTIME_RSP)"\n" \
|
||||
".att_syntax noprefix");
|
||||
|
||||
|
||||
#define RESTORE_REGS_FLAGS() \
|
||||
asm volatile( \
|
||||
".intel_syntax noprefix\n" \
|
||||
|
||||
@@ -83,6 +83,9 @@ while [ "$1" ]; do
|
||||
elif [[ "$1" == -initial* ]]; then
|
||||
echo "$2" > /sys/nb/initial_warm_up
|
||||
shift 2
|
||||
elif [[ "$1" == -al* ]]; then
|
||||
echo "$2" > /sys/nb/alignment_offset
|
||||
shift 2
|
||||
elif [[ "$1" == -min* ]]; then
|
||||
echo "min" > /sys/nb/agg
|
||||
shift
|
||||
@@ -105,6 +108,7 @@ while [ "$1" ]; do
|
||||
echo " -loop_count <n>: Number of iterations of the inner loop."
|
||||
echo " -warm_up_count <n>: Number of runs before the first measurement gets recorded."
|
||||
echo " -initial_warm_up_count <n>: Number of runs before any measurement is performed."
|
||||
echo " -alignment_offset <n>: Alignment offset."
|
||||
echo " -avg: Selects the arithmetic mean as the aggregate function."
|
||||
echo " -median: Selects the median as the aggregate function."
|
||||
echo " -min: Selects the minimum as the aggregate function."
|
||||
|
||||
@@ -233,6 +233,15 @@ static ssize_t initial_warm_up_store(struct kobject *kobj, struct kobj_attribute
|
||||
}
|
||||
static struct kobj_attribute initial_warm_up_attribute =__ATTR(initial_warm_up, 0660, initial_warm_up_show, initial_warm_up_store);
|
||||
|
||||
static ssize_t alignment_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%zu\n", alignment_offset);
|
||||
}
|
||||
static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) {
|
||||
sscanf(buf, "%zu", &alignment_offset);
|
||||
return count;
|
||||
}
|
||||
static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store);
|
||||
|
||||
static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
|
||||
return sprintf(buf, "%u\n", basic_mode);
|
||||
}
|
||||
@@ -618,6 +627,7 @@ static int __init nb_init(void) {
|
||||
error |= sysfs_create_file(nb_kobject, &n_measurements_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
|
||||
error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
|
||||
|
||||
@@ -53,7 +53,8 @@ paramDict = dict()
|
||||
# Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
|
||||
# Otherwise, reset() needs to be called first.
|
||||
def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None,
|
||||
warmUpCount=None, initialWarmUpCount=None, aggregateFunction=None, basicMode=None, noMem=None, codeOffset=0, verbose=None):
|
||||
warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None,
|
||||
verbose=None):
|
||||
if not ramdiskCreated: createRamdisk()
|
||||
|
||||
if config is not None:
|
||||
@@ -97,6 +98,16 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
|
||||
writeFile('/sys/nb/initial_warm_up', str(initialWarmUpCount))
|
||||
paramDict['initialWarmUpCount'] = initialWarmUpCount
|
||||
|
||||
if alignmentOffset is not None:
|
||||
if paramDict.get('alignmentOffset', None) != alignmentOffset:
|
||||
writeFile('/sys/nb/alignment_offset', str(alignmentOffset))
|
||||
paramDict['alignmentOffset'] = alignmentOffset
|
||||
|
||||
if codeOffset is not None:
|
||||
if paramDict.get('codeOffset', None) != codeOffset:
|
||||
writeFile('/sys/nb/code_offset', str(codeOffset))
|
||||
paramDict['codeOffset'] = codeOffset
|
||||
|
||||
if aggregateFunction is not None:
|
||||
if paramDict.get('aggregateFunction', None) != aggregateFunction:
|
||||
writeFile('/sys/nb/agg', aggregateFunction)
|
||||
@@ -112,11 +123,6 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
|
||||
writeFile('/sys/nb/no_mem', str(int(noMem)))
|
||||
paramDict['noMem'] = noMem
|
||||
|
||||
if codeOffset is not None:
|
||||
if paramDict.get('codeOffset', None) != codeOffset:
|
||||
writeFile('/sys/nb/code_offset', str(codeOffset))
|
||||
paramDict['codeOffset'] = codeOffset
|
||||
|
||||
if verbose is not None:
|
||||
if paramDict.get('verbose', None) != verbose:
|
||||
writeFile('/sys/nb/verbose', str(int(verbose)))
|
||||
|
||||
@@ -33,6 +33,7 @@ void print_usage() {
|
||||
printf(" -loop_count <n>: Number of iterations of the inner loop.\n");
|
||||
printf(" -warm_up_count <n>: Number of runs before the first measurement gets recorded.\n");
|
||||
printf(" -initial_warm_up_count <n>: Number of runs before any measurement is performed.\n");
|
||||
printf(" -alignment_offset <n>: Alignment offset.\n");
|
||||
printf(" -avg: Selects the arithmetic mean as the aggregate function.\n");
|
||||
printf(" -median: Selects the median as the aggregate function.\n");
|
||||
printf(" -min: Selects the minimum as the aggregate function.\n");
|
||||
@@ -75,6 +76,7 @@ int main(int argc, char **argv) {
|
||||
{"loop_count", required_argument, 0, 'l'},
|
||||
{"warm_up_count", required_argument, 0, 'w'},
|
||||
{"initial_warm_up_count", required_argument, 0, 'a'},
|
||||
{"alignment_offset", required_argument, 0, 'm'},
|
||||
{"avg", no_argument, &aggregate_function, AVG_20_80},
|
||||
{"median", no_argument, &aggregate_function, MED},
|
||||
{"min", no_argument, &aggregate_function, MIN},
|
||||
@@ -125,6 +127,9 @@ int main(int argc, char **argv) {
|
||||
case 'a':
|
||||
initial_warm_up_count = atol(optarg);
|
||||
break;
|
||||
case 'm':
|
||||
alignment_offset = (size_t)atol(optarg);
|
||||
break;
|
||||
case 'p':
|
||||
cpu = atol(optarg);
|
||||
break;
|
||||
|
||||
Reference in New Issue
Block a user