added -df option

2025-12-15 19:10:08 +01:00 · 2021-03-03 15:47:23 +01:00
parent 1d4f3a458a
commit 4971d6c23a
10 changed files with 57 additions and 21 deletions
--- a/README.md
+++ b/README.md
@@ -151,6 +151,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
 | `-basic_mode`                | The effect of this option is described in the [Generated Code](#generated-code) section. |
 | `-no_mem`                    | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
 | `-no_normalization`          | If this option is enabled, the measurement results are not divided by the number of repetitions. |
+| `-df`                        | If this option is enabled, the front-end buffers are drained after `code_late_init` by executing a sequence of 128 15-Byte `NOP` instructions. |
 | `-cpu <n>`                   | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` |
 | `-verbose`                   | Outputs the results of all performance counter readings. In the user-space version, the results are printed to stdout. The output of the kernel module can be accessed using `dmesg`. |

--- a/common/nanoBench.c
+++ b/common/nanoBench.c
@@ -17,7 +17,7 @@ long loop_count = LOOP_COUNT_DEFAULT;
 long warm_up_count = WARM_UP_COUNT_DEFAULT;
 long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT;
 size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
-
+int drain_frontend = DRAIN_FRONTEND_DEFAULT;
 int no_mem = NO_MEM_DEFAULT;
 int no_normalization = NO_NORMALIZATION_DEFAULT;
 int basic_mode = BASIC_MODE_DEFAULT;
@@ -420,7 +420,7 @@ size_t get_required_runtime_code_length() {
            req_code_length += 100;
        }
    }
-    return code_init_length + code_late_init_length + 2*unroll_count*req_code_length + 10000;
+    return code_init_length + code_late_init_length + (drain_frontend?128*15:0) + 2*unroll_count*req_code_length + 10000;
 }

 size_t get_distance_to_code(char* measurement_template, size_t templateI) {
@@ -486,12 +486,20 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
                rcI += code_late_init_length;
            }

+            if (drain_frontend) {
+                // 128 15-byte NOPs occupy 1920 bytes (30*64), a multiple of 64, so the alignment of the code that follows is unaffected
+                for (size_t i=0; i<128; i++) {
+                    memcpy(&runtime_code[rcI], "\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", 15); // raw byte copy: the NOP encoding contains NUL bytes
+                    rcI += 15;
+                }
+            }
+
            if (unrollI == 0 && codeI == 0) {
                rcI_code_start = rcI;
            }

            if (!code_contains_magic_bytes) {
-                // in this case, we can use a memcpy, which is faster
+                // in this case, we can use memcpy, which is faster
                for (unrollI=0; unrollI<local_unroll_count; unrollI++) {
                    memcpy(&runtime_code[rcI], code, code_length);
                    rcI += code_length;
@@ -711,8 +719,7 @@ long long ll_abs(long long val) {
 void print_all_measurement_results(int64_t* results[], int n_counters) {
    int run_padding = (n_measurements<=10?1:(n_measurements<=100?2:(n_measurements<=1000?3:4)));

-    size_t size = 120;
-    char buf[size];
+    char buf[120];

    sprintf(buf, "\t%*s      ", run_padding, "");
    for (int c=0; c<n_counters; c++) {
--- a/common/nanoBench.h
+++ b/common/nanoBench.h
@@ -95,6 +95,10 @@ extern long initial_warm_up_count;
 extern size_t alignment_offset;
 #define ALIGNMENT_OFFSET_DEFAULT 0;

+// If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions.
+extern int drain_frontend;
+#define DRAIN_FRONTEND_DEFAULT 0;
+
 // If enabled, the temporary performance counter values are stored in registers instead of in memory;
 // the code to be measured must then not use registers R8-R13
 extern int no_mem;
--- a/kernel-nanoBench.sh
+++ b/kernel-nanoBench.sh
@@ -87,6 +87,9 @@ while [ "$1" ]; do
    elif [[ "$1" == -al* ]]; then
        echo "$2" > /sys/nb/alignment_offset
        shift 2
+    elif [[ "$1" == -df* ]]; then
+        echo "1" > /sys/nb/drain_frontend
+        shift
    elif [[ "$1" == -min* ]]; then
        echo "min" > /sys/nb/agg
        shift
@@ -115,6 +118,7 @@ while [ "$1" ]; do
        echo "  -warm_up_count <n>:         Number of runs before the first measurement gets recorded."
        echo "  -initial_warm_up_count <n>: Number of runs before any measurement is performed."
        echo "  -alignment_offset <n>:      Alignment offset."
+        echo "  -df:                        Drains front-end buffers between executing code_late_init and code."
        echo "  -avg:                       Selects the arithmetic mean as the aggregate function."
        echo "  -median:                    Selects the median as the aggregate function."
        echo "  -min:                       Selects the minimum as the aggregate function."
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,7 +18,7 @@ obj-m += $(MODULE_NAME).o
 CFLAGS_nb_km.o := -DDEBUG
 CFLAGS_nanoBench.o := -DDEBUG

-ccflags-y+=-std=gnu99 -Wno-declaration-after-statement
+ccflags-y+=-std=gnu99 -Wno-declaration-after-statement -DDEBUG

 all: 
 	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
--- a/kernel/nb_km.c
+++ b/kernel/nb_km.c
@@ -256,6 +256,15 @@ static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribut
 }
 static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store);

+static ssize_t drain_frontend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { // sysfs read handler for the drain_frontend attribute
+    return sprintf(buf, "%u\n", drain_frontend); // writes the current flag value plus a newline into buf; returns the number of characters written
+}
+static ssize_t drain_frontend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { // sysfs write handler for the drain_frontend attribute
+    sscanf(buf, "%u", &drain_frontend); // result is not checked: if buf does not start with a number, drain_frontend is left unchanged
+    return count; // always reports the whole input as consumed
+}
+static struct kobj_attribute drain_frontend_attribute =__ATTR(drain_frontend, 0660, drain_frontend_show, drain_frontend_store); // attribute named "drain_frontend" (mode 0660) wired to the show/store handlers above
+
 static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
    return sprintf(buf, "%u\n", basic_mode);
 }
@@ -416,6 +425,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
    aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
    verbose = VERBOSE_DEFAULT;
    alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
+    drain_frontend = DRAIN_FRONTEND_DEFAULT;

    code_init_length = 0;
    code_late_init_length = 0;
@@ -663,6 +673,7 @@ static int __init nb_init(void) {
    error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
+    error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
    error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
--- a/kernelNanoBench.py
+++ b/kernelNanoBench.py
@@ -54,8 +54,8 @@ paramDict = dict()
 # Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
 # Otherwise, reset() needs to be called first.
 def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None,
-                           warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None,
-                           noNormalization=None, verbose=None):
+                           warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, aggregateFunction=None,
+                           basicMode=None, noMem=None, noNormalization=None, verbose=None):
   if not ramdiskCreated: createRamdisk()

   if config is not None:
@@ -109,6 +109,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
         writeFile('/sys/nb/code_offset', str(codeOffset))
         paramDict['codeOffset'] = codeOffset

+   if drainFrontend is not None:
+      if paramDict.get('drainFrontend', None) != drainFrontend:
+         writeFile('/sys/nb/drain_frontend', str(int(drainFrontend)))
+         paramDict['drainFrontend'] = drainFrontend
+
   if aggregateFunction is not None:
      if paramDict.get('aggregateFunction', None) != aggregateFunction:
         writeFile('/sys/nb/agg', aggregateFunction)
--- a/nanoBench.sh
+++ b/nanoBench.sh
@@ -16,7 +16,7 @@ fi

 debug=false
 for p in "$@"; do
-    if [[ "$p" == -d* ]]; then
+    if [[ "$p" == -de* ]]; then
        debug=true
    fi
 done
--- a/tools/cpuBench/cpuBench.py
+++ b/tools/cpuBench/cpuBench.py
@@ -224,7 +224,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
   if maxRepeat>0:
      if any(v<-0.05 for v in ret.values()):
         print 'Repeating experiment because there was a value < 0'
-         return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
+         return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1)

      #sumPortUops = sum(v for e,v in ret.items() if 'PORT' in e and not '4' in e)
      #if (sumPortUops % 1) > .2 and (sumPortUops % 1) < .8:
@@ -236,7 +236,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
         maxPortUops = max(v/(len(e)-9) for e,v in ret.items() if 'PORT' in e)
         if maxPortUops * .98 > ret['Core cycles']:
            print 'Repeating experiment because there were more uops on a port than core cycles'
-            return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
+            return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1)

   if htmlReports is not None:
      htmlReports.extend(localHtmlReports)
@@ -1220,12 +1220,8 @@ def getBasicLatencies(instrNodeList):
      sys.exit()
   basicLatency['MOVSX'] = movsxCycles

-   movsxR8hResult = runExperiment(None, 'MOVSX EAX, AH; MOV AH, AL')
-   movsxR8hCycles = int(round(movsxR8hResult['Core cycles']))
-   if movsxR8hCycles != 2:
-      print 'Latency of "MOVSX EAX, AH; MOV AH, AL" must be 2'
-      sys.exit()
-   basicLatency['MOV_R8h_R8l'] = 1
+   movsxR8hResult = runExperiment(None, 'MOV AH, AL')
+   basicLatency['MOV_R8h_R8l'] = max(1, int(round(movsxR8hResult['Core cycles'])))

   movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH')
   basicLatency['MOV_R8h_R8h'] = int(round(movR8hR8hResult['Core cycles']))
@@ -1836,11 +1832,12 @@ def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType):


 class LatConfig:
-   def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, notes=None):
+   def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, basicMode=False, notes=None):
       self.instrI = instrI
       self.chainInstrs = chainInstrs
       self.chainLatency = chainLatency
       self.init = ([] if init is None else init)
+      self.basicMode = basicMode  # passed as basicMode= to runExperiment when this config is measured
       self.notes = ([] if notes is None else notes)

 class LatConfigList:
@@ -2268,7 +2265,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
               chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '')
               chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR']
               chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
-               configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
+               # we use basicMode, as the measurements for these benchmarks are often not very stable, particularly on some microarchitectures (e.g., HSW)
+               configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
+               # on some microarchitectures (e.g., HSW), an additional NOP instruction can sometimes lead to better port scheduling
+               configList.append(LatConfig(instrI, chainInstrs=chainInstrs + 'nop;', chainLatency=chainLatency, basicMode=True, notes=['with additional nop']))
            else:
               # mem -> mem
               if startNode.attrib.get('r','0')=='1':
@@ -2280,7 +2280,8 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
                     chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';')*10
                     chainInstrs += ('MOV [' + addrReg + '], ' + regToSize('R12', min(64, memWidth)))
                     chainLatency = basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))]
-                     configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
+                     # we use basicMode, as the measurements for these benchmarks are often not very stable, particularly on some microarchitectures (e.g., HSW)
+                     configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
                  else:
                     # ToDo
                     pass
@@ -2513,7 +2514,8 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
                           configHtmlReports.append('<li>Chain latency: ' + ('&ge;' if latConfigList.isUpperBound else '') + str(latConfig.chainLatency) + '</li>\n')

                        init = latConfig.instrI.regMemInit + latConfig.init
-                        measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init, htmlReports=configHtmlReports, unrollCount=100)
+                        measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init,
+                                                          basicMode=latConfig.basicMode, htmlReports=configHtmlReports, unrollCount=100)
                        configHtmlReports.append('</ul>\n')

                        if not measurementResult:
--- a/user/nanoBench_main.c
+++ b/user/nanoBench_main.c
@@ -35,6 +35,7 @@ void print_usage() {
    printf("  -warm_up_count <n>:             Number of runs before the first measurement gets recorded.\n");
    printf("  -initial_warm_up_count <n>:     Number of runs before any measurement is performed.\n");
    printf("  -alignment_offset <n>:          Alignment offset.\n");
+    printf("  -df:                            Drains front-end buffers between executing code_late_init and code.\n");
    printf("  -avg:                           Selects the arithmetic mean as the aggregate function.\n");
    printf("  -median:                        Selects the median as the aggregate function.\n");
    printf("  -min:                           Selects the minimum as the aggregate function.\n");
@@ -80,6 +81,7 @@ int main(int argc, char **argv) {
        {"warm_up_count", required_argument, 0, 'w'},
        {"initial_warm_up_count", required_argument, 0, 'a'},
        {"alignment_offset", required_argument, 0, 'm'},
+        {"df", no_argument, &drain_frontend, 1},
        {"avg", no_argument, &aggregate_function, AVG_20_80},
        {"median", no_argument, &aggregate_function, MED},
        {"min", no_argument, &aggregate_function, MIN},