diff --git a/README.md b/README.md index e54f54e..fd6f07b 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line | `-basic_mode` | The effect of this option is described in the [Generated Code](#generated-code) section. | | `-no_mem` | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. | | `-no_normalization` | If this option is enabled, the measurement results are not divided by the number of repetitions. | +| `-df` | If this option is enabled, the front-end buffers are drained after `code_late_init` by executing a sequence of 128 15-Byte `NOP` instructions. | | `-cpu <n>` | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` | | `-verbose` | Outputs the results of all performance counter readings. In the user-space version, the results are printed to stdout. The output of the kernel module can be accessed using `dmesg`. 
| diff --git a/common/nanoBench.c b/common/nanoBench.c index 1d2a846..109aad9 100644 --- a/common/nanoBench.c +++ b/common/nanoBench.c @@ -17,7 +17,7 @@ long loop_count = LOOP_COUNT_DEFAULT; long warm_up_count = WARM_UP_COUNT_DEFAULT; long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT; size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT; - +int drain_frontend = DRAIN_FRONTEND_DEFAULT; int no_mem = NO_MEM_DEFAULT; int no_normalization = NO_NORMALIZATION_DEFAULT; int basic_mode = BASIC_MODE_DEFAULT; @@ -420,7 +420,7 @@ size_t get_required_runtime_code_length() { req_code_length += 100; } } - return code_init_length + code_late_init_length + 2*unroll_count*req_code_length + 10000; + return code_init_length + code_late_init_length + (drain_frontend?128*15:0) + 2*unroll_count*req_code_length + 10000; } size_t get_distance_to_code(char* measurement_template, size_t templateI) { @@ -486,12 +486,20 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo rcI += code_late_init_length; } + if (drain_frontend) { + // the length of the following code sequence is a multiple of 64, and thus doesn't affect the alignment + for (size_t i=0; i<128; i++) { + memcpy(&runtime_code[rcI], "\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", 15); /* memcpy, not strncpy: the 15-byte NOP encoding contains NUL bytes and is raw machine code, not a C string */ + rcI += 15; + } + } + if (unrollI == 0 && codeI == 0) { rcI_code_start = rcI; } if (!code_contains_magic_bytes) { - // in this case, we can use a memcpy, which is faster + // in this case, we can use memcpy, which is faster for (unrollI=0; unrollI /sys/nb/alignment_offset shift 2 + elif [[ "$1" == -df* ]]; then + echo "1" > /sys/nb/drain_frontend + shift elif [[ "$1" == -min* ]]; then echo "min" > /sys/nb/agg shift @@ -115,6 +118,7 @@ while [ "$1" ]; do echo " -warm_up_count : Number of runs before the first measurement gets recorded." echo " -initial_warm_up_count : Number of runs before any measurement is performed." echo " -alignment_offset : Alignment offset." 
+ echo " -df: Drains front-end buffers between executing code_late_init and code." echo " -avg: Selects the arithmetic mean as the aggregate function." echo " -median: Selects the median as the aggregate function." echo " -min: Selects the minimum as the aggregate function." diff --git a/kernel/Makefile b/kernel/Makefile index b6e03f5..07b503d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,7 +18,7 @@ obj-m += $(MODULE_NAME).o CFLAGS_nb_km.o := -DDEBUG CFLAGS_nanoBench.o := -DDEBUG -ccflags-y+=-std=gnu99 -Wno-declaration-after-statement +ccflags-y+=-std=gnu99 -Wno-declaration-after-statement -DDEBUG all: make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules diff --git a/kernel/nb_km.c b/kernel/nb_km.c index 911dde1..3a7bae0 100644 --- a/kernel/nb_km.c +++ b/kernel/nb_km.c @@ -256,6 +256,15 @@ static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribut } static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store); +static ssize_t drain_frontend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + return sprintf(buf, "%u\n", drain_frontend); +} +static ssize_t drain_frontend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + sscanf(buf, "%u", &drain_frontend); + return count; +} +static struct kobj_attribute drain_frontend_attribute =__ATTR(drain_frontend, 0660, drain_frontend_show, drain_frontend_store); + static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", basic_mode); } @@ -416,6 +425,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha aggregate_function = AGGREGATE_FUNCTION_DEFAULT; verbose = VERBOSE_DEFAULT; alignment_offset = ALIGNMENT_OFFSET_DEFAULT; + drain_frontend = DRAIN_FRONTEND_DEFAULT; code_init_length = 0; code_late_init_length = 0; @@ -663,6 +673,7 @@ static int __init 
nb_init(void) { error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr); error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr); error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr); + error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr); error |= sysfs_create_file(nb_kobject, &agg_attribute.attr); error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr); error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr); diff --git a/kernelNanoBench.py b/kernelNanoBench.py index 8ae82d4..e7e3beb 100644 --- a/kernelNanoBench.py +++ b/kernelNanoBench.py @@ -54,8 +54,8 @@ paramDict = dict() # Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters(). # Otherwise, reset() needs to be called first. def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None, - warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None, - noNormalization=None, verbose=None): + warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, aggregateFunction=None, + basicMode=None, noMem=None, noNormalization=None, verbose=None): if not ramdiskCreated: createRamdisk() if config is not None: @@ -109,6 +109,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf writeFile('/sys/nb/code_offset', str(codeOffset)) paramDict['codeOffset'] = codeOffset + if drainFrontend is not None: + if paramDict.get('drainFrontend', None) != drainFrontend: + writeFile('/sys/nb/drain_frontend', str(int(drainFrontend))) + paramDict['drainFrontend'] = drainFrontend + if aggregateFunction is not None: if paramDict.get('aggregateFunction', None) != aggregateFunction: writeFile('/sys/nb/agg', aggregateFunction) diff --git a/nanoBench.sh b/nanoBench.sh index 
5d05306..b2a1362 100755 --- a/nanoBench.sh +++ b/nanoBench.sh @@ -16,7 +16,7 @@ fi debug=false for p in "$@"; do - if [[ "$p" == -d* ]]; then + if [[ "$p" == -de* ]]; then debug=true fi done diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py index e5c8637..af39291 100755 --- a/tools/cpuBench/cpuBench.py +++ b/tools/cpuBench/cpuBench.py @@ -224,7 +224,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, if maxRepeat>0: if any(v<-0.05 for v in ret.values()): print 'Repeating experiment because there was a value < 0' - return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1) + return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1) #sumPortUops = sum(v for e,v in ret.items() if 'PORT' in e and not '4' in e) #if (sumPortUops % 1) > .2 and (sumPortUops % 1) < .8: @@ -236,7 +236,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, maxPortUops = max(v/(len(e)-9) for e,v in ret.items() if 'PORT' in e) if maxPortUops * .98 > ret['Core cycles']: print 'Repeating experiment because there were more uops on a port than core cycles' - return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1) + return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1) if htmlReports is not None: htmlReports.extend(localHtmlReports) @@ -1220,12 +1220,8 @@ def getBasicLatencies(instrNodeList): sys.exit() basicLatency['MOVSX'] = movsxCycles - movsxR8hResult = runExperiment(None, 'MOVSX EAX, AH; MOV AH, AL') - movsxR8hCycles = int(round(movsxR8hResult['Core cycles'])) - if movsxR8hCycles 
!= 2: - print 'Latency of "MOVSX EAX, AH; MOV AH, AL" must be 2' - sys.exit() - basicLatency['MOV_R8h_R8l'] = 1 + movsxR8hResult = runExperiment(None, 'MOV AH, AL') + basicLatency['MOV_R8h_R8l'] = max(1, int(round(movsxR8hResult['Core cycles']))) movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH') basicLatency['MOV_R8h_R8h'] = int(round(movR8hR8hResult['Core cycles'])) @@ -1836,11 +1832,12 @@ def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType): class LatConfig: - def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, notes=None): + def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, basicMode=False, notes=None): self.instrI = instrI self.chainInstrs = chainInstrs self.chainLatency = chainLatency self.init = ([] if init is None else init) + self.basicMode = basicMode self.notes = ([] if notes is None else notes) class LatConfigList: @@ -2268,7 +2265,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '') chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR'] chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding - configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True)) + # on some microarch. (e.g., HSW), an additional nop instr. 
can sometimes lead to a better port scheduling + configList.append(LatConfig(instrI, chainInstrs=chainInstrs + 'nop;', chainLatency=chainLatency, basicMode=True, notes=['with additional nop'])) else: # mem -> mem if startNode.attrib.get('r','0')=='1': @@ -2280,7 +2280,8 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';')*10 chainInstrs += ('MOV [' + addrReg + '], ' + regToSize('R12', min(64, memWidth))) chainLatency = basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] - configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True)) else: # ToDo pass @@ -2513,7 +2514,8 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports): configHtmlReports.append('
  • Chain latency: ' + ('≥' if latConfigList.isUpperBound else '') + str(latConfig.chainLatency) + '
  • \n') init = latConfig.instrI.regMemInit + latConfig.init - measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init, htmlReports=configHtmlReports, unrollCount=100) + measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init, + basicMode=latConfig.basicMode, htmlReports=configHtmlReports, unrollCount=100) configHtmlReports.append('\n') if not measurementResult: diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c index 1ccc544..7fbe763 100644 --- a/user/nanoBench_main.c +++ b/user/nanoBench_main.c @@ -35,6 +35,7 @@ void print_usage() { printf(" -warm_up_count : Number of runs before the first measurement gets recorded.\n"); printf(" -initial_warm_up_count : Number of runs before any measurement is performed.\n"); printf(" -alignment_offset : Alignment offset.\n"); + printf(" -df: Drains front-end buffers between executing code_late_init and code.\n"); printf(" -avg: Selects the arithmetic mean as the aggregate function.\n"); printf(" -median: Selects the median as the aggregate function.\n"); printf(" -min: Selects the minimum as the aggregate function.\n"); @@ -80,6 +81,7 @@ int main(int argc, char **argv) { {"warm_up_count", required_argument, 0, 'w'}, {"initial_warm_up_count", required_argument, 0, 'a'}, {"alignment_offset", required_argument, 0, 'm'}, + {"df", no_argument, &drain_frontend, 1}, {"avg", no_argument, &aggregate_function, AVG_20_80}, {"median", no_argument, &aggregate_function, MED}, {"min", no_argument, &aggregate_function, MIN},