diff --git a/README.md b/README.md
index e54f54e..fd6f07b 100644
--- a/README.md
+++ b/README.md
@@ -151,6 +151,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
 | `-basic_mode`                | The effect of this option is described in the [Generated Code](#generated-code) section. |
 | `-no_mem`                    | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
 | `-no_normalization`          | If this option is enabled, the measurement results are not divided by the number of repetitions. |
+| `-df`                        | If this option is enabled, the front-end buffers are drained between `code_late_init` and `code` by executing a sequence of 128 15-byte `NOP` instructions. |
 | `-cpu <n>`                   | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` |
 | `-verbose`                   | Outputs the results of all performance counter readings. In the user-space version, the results are printed to stdout. The output of the kernel module can be accessed using `dmesg`. |
 
diff --git a/common/nanoBench.c b/common/nanoBench.c
index 1d2a846..109aad9 100644
--- a/common/nanoBench.c
+++ b/common/nanoBench.c
@@ -17,7 +17,7 @@ long loop_count = LOOP_COUNT_DEFAULT;
 long warm_up_count = WARM_UP_COUNT_DEFAULT;
 long initial_warm_up_count = INITIAL_WARM_UP_COUNT_DEFAULT;
 size_t alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
-
+int drain_frontend = DRAIN_FRONTEND_DEFAULT;
 int no_mem = NO_MEM_DEFAULT;
 int no_normalization = NO_NORMALIZATION_DEFAULT;
 int basic_mode = BASIC_MODE_DEFAULT;
@@ -420,7 +420,7 @@ size_t get_required_runtime_code_length() {
             req_code_length += 100;
         }
     }
-    return code_init_length + code_late_init_length + 2*unroll_count*req_code_length + 10000;
+    return code_init_length + code_late_init_length + (drain_frontend?128*15:0) + 2*unroll_count*req_code_length + 10000;
 }
 
 size_t get_distance_to_code(char* measurement_template, size_t templateI) {
@@ -486,12 +486,20 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
                 rcI += code_late_init_length;
             }
 
+            if (drain_frontend) {
+                // 128 * 15 bytes = 1920 bytes is a multiple of 64, so the following code sequence doesn't affect the alignment
+                for (size_t i=0; i<128; i++) {
+                    memcpy(&runtime_code[rcI], "\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", 15); // 15-byte NOP; raw machine code with embedded 0x00 bytes, so memcpy rather than a string function
+                    rcI += 15;
+                }
+            }
+
             if (unrollI == 0 && codeI == 0) {
                 rcI_code_start = rcI;
             }
 
             if (!code_contains_magic_bytes) {
-                // in this case, we can use a memcpy, which is faster
+                // in this case, we can use memcpy, which is faster
                 for (unrollI=0; unrollI<local_unroll_count; unrollI++) {
                     memcpy(&runtime_code[rcI], code, code_length);
                     rcI += code_length;
@@ -711,8 +719,7 @@ long long ll_abs(long long val) {
 void print_all_measurement_results(int64_t* results[], int n_counters) {
     int run_padding = (n_measurements<=10?1:(n_measurements<=100?2:(n_measurements<=1000?3:4)));
 
-    size_t size = 120;
-    char buf[size];
+    char buf[120];
 
     sprintf(buf, "\t%*s      ", run_padding, "");
     for (int c=0; c<n_counters; c++) {
diff --git a/common/nanoBench.h b/common/nanoBench.h
index 7368c9a..e28bc59 100644
--- a/common/nanoBench.h
+++ b/common/nanoBench.h
@@ -95,6 +95,10 @@ extern long initial_warm_up_count;
 extern size_t alignment_offset;
 #define ALIGNMENT_OFFSET_DEFAULT 0;
 
+// If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions.
+extern int drain_frontend;
+#define DRAIN_FRONTEND_DEFAULT 0;
+
 // If enabled, the temporary performance counter values are stored in registers instead of in memory;
 // the code to be measured must then not use registers R8-R13
 extern int no_mem;
diff --git a/kernel-nanoBench.sh b/kernel-nanoBench.sh
index 86952c1..daa10d5 100755
--- a/kernel-nanoBench.sh
+++ b/kernel-nanoBench.sh
@@ -87,6 +87,9 @@ while [ "$1" ]; do
     elif [[ "$1" == -al* ]]; then
         echo "$2" > /sys/nb/alignment_offset
         shift 2
+    elif [[ "$1" == -df* ]]; then
+        echo "1" > /sys/nb/drain_frontend
+        shift
     elif [[ "$1" == -min* ]]; then
         echo "min" > /sys/nb/agg
         shift
@@ -115,6 +118,7 @@ while [ "$1" ]; do
         echo "  -warm_up_count <n>:         Number of runs before the first measurement gets recorded."
         echo "  -initial_warm_up_count <n>: Number of runs before any measurement is performed."
         echo "  -alignment_offset <n>:      Alignment offset."
+        echo "  -df:                        Drains front-end buffers between executing code_late_init and code."
         echo "  -avg:                       Selects the arithmetic mean as the aggregate function."
         echo "  -median:                    Selects the median as the aggregate function."
         echo "  -min:                       Selects the minimum as the aggregate function."
diff --git a/kernel/Makefile b/kernel/Makefile
index b6e03f5..07b503d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,7 +18,7 @@ obj-m += $(MODULE_NAME).o
 CFLAGS_nb_km.o := -DDEBUG
 CFLAGS_nanoBench.o := -DDEBUG
 
-ccflags-y+=-std=gnu99 -Wno-declaration-after-statement
+ccflags-y+=-std=gnu99 -Wno-declaration-after-statement -DDEBUG
 
 all: 
 	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
diff --git a/kernel/nb_km.c b/kernel/nb_km.c
index 911dde1..3a7bae0 100644
--- a/kernel/nb_km.c
+++ b/kernel/nb_km.c
@@ -256,6 +256,15 @@ static ssize_t alignment_offset_store(struct kobject *kobj, struct kobj_attribut
 }
 static struct kobj_attribute alignment_offset_attribute =__ATTR(alignment_offset, 0660, alignment_offset_show, alignment_offset_store);
 
+static ssize_t drain_frontend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { // show handler of the drain_frontend sysfs attribute
+    return sprintf(buf, "%u\n", drain_frontend); // writes the current value of drain_frontend followed by a newline into buf and returns sprintf's character count
+}
+static ssize_t drain_frontend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { // store handler of the drain_frontend sysfs attribute
+    sscanf(buf, "%u", &drain_frontend); // parses a decimal number from buf into drain_frontend; NOTE(review): drain_frontend is declared int while %u expects unsigned int*, and the sscanf result is not checked
+    return count; // always reports the whole input as consumed, even if nothing could be parsed
+}
+static struct kobj_attribute drain_frontend_attribute =__ATTR(drain_frontend, 0660, drain_frontend_show, drain_frontend_store); // attribute named "drain_frontend", mode 0660, bound to the show/store handlers above
+
 static ssize_t basic_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) {
     return sprintf(buf, "%u\n", basic_mode);
 }
@@ -416,6 +425,7 @@ static ssize_t reset_show(struct kobject *kobj, struct kobj_attribute *attr, cha
     aggregate_function = AGGREGATE_FUNCTION_DEFAULT;
     verbose = VERBOSE_DEFAULT;
     alignment_offset = ALIGNMENT_OFFSET_DEFAULT;
+    drain_frontend = DRAIN_FRONTEND_DEFAULT;
 
     code_init_length = 0;
     code_late_init_length = 0;
@@ -663,6 +673,7 @@ static int __init nb_init(void) {
     error |= sysfs_create_file(nb_kobject, &warm_up_attribute.attr);
     error |= sysfs_create_file(nb_kobject, &initial_warm_up_attribute.attr);
     error |= sysfs_create_file(nb_kobject, &alignment_offset_attribute.attr);
+    error |= sysfs_create_file(nb_kobject, &drain_frontend_attribute.attr);
     error |= sysfs_create_file(nb_kobject, &agg_attribute.attr);
     error |= sysfs_create_file(nb_kobject, &basic_mode_attribute.attr);
     error |= sysfs_create_file(nb_kobject, &no_mem_attribute.attr);
diff --git a/kernelNanoBench.py b/kernelNanoBench.py
index 8ae82d4..e7e3beb 100644
--- a/kernelNanoBench.py
+++ b/kernelNanoBench.py
@@ -54,8 +54,8 @@ paramDict = dict()
 # Assumes that no changes to the corresponding files in /sys/nb/ were made since the last call to setNanoBenchParameters().
 # Otherwise, reset() needs to be called first.
 def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConfigFile=None, nMeasurements=None, unrollCount=None, loopCount=None,
-                           warmUpCount=None, initialWarmUpCount=None, alignmentOffset=0, codeOffset=0, aggregateFunction=None, basicMode=None, noMem=None,
-                           noNormalization=None, verbose=None):
+                           warmUpCount=None, initialWarmUpCount=None, alignmentOffset=None, codeOffset=None, drainFrontend=None, aggregateFunction=None,
+                           basicMode=None, noMem=None, noNormalization=None, verbose=None):
    if not ramdiskCreated: createRamdisk()
 
    if config is not None:
@@ -109,6 +109,11 @@ def setNanoBenchParameters(config=None, configFile=None, msrConfig=None, msrConf
          writeFile('/sys/nb/code_offset', str(codeOffset))
          paramDict['codeOffset'] = codeOffset
 
+   if drainFrontend is not None:
+      if paramDict.get('drainFrontend', None) != drainFrontend:
+         writeFile('/sys/nb/drain_frontend', str(int(drainFrontend)))
+         paramDict['drainFrontend'] = drainFrontend
+
    if aggregateFunction is not None:
       if paramDict.get('aggregateFunction', None) != aggregateFunction:
          writeFile('/sys/nb/agg', aggregateFunction)
diff --git a/nanoBench.sh b/nanoBench.sh
index 5d05306..b2a1362 100755
--- a/nanoBench.sh
+++ b/nanoBench.sh
@@ -16,7 +16,7 @@ fi
 
 debug=false
 for p in "$@"; do
-    if [[ "$p" == -d* ]]; then
+    if [[ "$p" == -de* ]]; then
         debug=true
     fi
 done
diff --git a/tools/cpuBench/cpuBench.py b/tools/cpuBench/cpuBench.py
index e5c8637..af39291 100755
--- a/tools/cpuBench/cpuBench.py
+++ b/tools/cpuBench/cpuBench.py
@@ -224,7 +224,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
    if maxRepeat>0:
       if any(v<-0.05 for v in ret.values()):
          print 'Repeating experiment because there was a value < 0'
-         return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
+         return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
 
       #sumPortUops = sum(v for e,v in ret.items() if 'PORT' in e and not '4' in e)
       #if (sumPortUops % 1) > .2 and (sumPortUops % 1) < .8:
@@ -236,7 +236,7 @@ def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0,
          maxPortUops = max(v/(len(e)-9) for e,v in ret.items() if 'PORT' in e)
          if maxPortUops * .98 > ret['Core cycles']:
             print 'Repeating experiment because there were more uops on a port than core cycles'
-            return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
+            return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=True, htmlReports=htmlReports, maxRepeat=maxRepeat-1)
 
    if htmlReports is not None:
       htmlReports.extend(localHtmlReports)
@@ -1220,12 +1220,8 @@ def getBasicLatencies(instrNodeList):
       sys.exit()
    basicLatency['MOVSX'] = movsxCycles
 
-   movsxR8hResult = runExperiment(None, 'MOVSX EAX, AH; MOV AH, AL')
-   movsxR8hCycles = int(round(movsxR8hResult['Core cycles']))
-   if movsxR8hCycles != 2:
-      print 'Latency of "MOVSX EAX, AH; MOV AH, AL" must be 2'
-      sys.exit()
-   basicLatency['MOV_R8h_R8l'] = 1
+   movsxR8hResult = runExperiment(None, 'MOV AH, AL')
+   basicLatency['MOV_R8h_R8l'] = max(1, int(round(movsxR8hResult['Core cycles'])))
 
    movR8hR8hResult = runExperiment(instrNodeDict['MOV_88 (R8h, R8h)'], 'MOV AH, AH')
    basicLatency['MOV_R8h_R8h'] = int(round(movR8hR8hResult['Core cycles']))
@@ -1836,11 +1832,12 @@ def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType):
 
 
 class LatConfig:
-   def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, notes=None):
+   def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, basicMode=False, notes=None):
       self.instrI = instrI
       self.chainInstrs = chainInstrs
       self.chainLatency = chainLatency
       self.init = ([] if init is None else init)
+      self.basicMode = basicMode
       self.notes = ([] if notes is None else notes)
 
 class LatConfigList:
@@ -2268,7 +2265,10 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
                chainInstrs += 'XOR ' + chainReg + ', R12; XOR ' + chainReg + ', R12;' + ('TEST R15, R15;' if instrReadsFlags else '')
                chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR']
                chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding
-               configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
+               # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW
+               configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
+               # on some microarch. (e.g., HSW), an additional nop instr. can sometimes lead to a better port scheduling
+               configList.append(LatConfig(instrI, chainInstrs=chainInstrs + 'nop;', chainLatency=chainLatency, basicMode=True, notes=['with additional nop']))
             else:
                # mem -> mem
                if startNode.attrib.get('r','0')=='1':
@@ -2280,7 +2280,8 @@ def getLatConfigLists(instrNode, startNode, targetNode, useDistinctRegs, addrMem
                      chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';')*10
                      chainInstrs += ('MOV [' + addrReg + '], ' + regToSize('R12', min(64, memWidth)))
                      chainLatency = basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))]
-                     configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency))
+                     # we use basicMode, as the measurements for these benchmarks are often not very stable, in particular on, e.g., HSW
+                     configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency, basicMode=True))
                   else:
                      # ToDo
                      pass
@@ -2513,7 +2514,8 @@ def getLatencies(instrNode, instrNodeList, tpDict, tpDictSameReg, htmlReports):
                            configHtmlReports.append('<li>Chain latency: ' + ('&ge;' if latConfigList.isUpperBound else '') + str(latConfig.chainLatency) + '</li>\n')
 
                         init = latConfig.instrI.regMemInit + latConfig.init
-                        measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init, htmlReports=configHtmlReports, unrollCount=100)
+                        measurementResult = runExperiment(instrNode, latConfig.instrI.asm + ';' + latConfig.chainInstrs, init=init,
+                                                          basicMode=latConfig.basicMode, htmlReports=configHtmlReports, unrollCount=100)
                         configHtmlReports.append('</ul>\n')
 
                         if not measurementResult:
diff --git a/user/nanoBench_main.c b/user/nanoBench_main.c
index 1ccc544..7fbe763 100644
--- a/user/nanoBench_main.c
+++ b/user/nanoBench_main.c
@@ -35,6 +35,7 @@ void print_usage() {
     printf("  -warm_up_count <n>:             Number of runs before the first measurement gets recorded.\n");
     printf("  -initial_warm_up_count <n>:     Number of runs before any measurement is performed.\n");
     printf("  -alignment_offset <n>:          Alignment offset.\n");
+    printf("  -df:                            Drains front-end buffers between executing code_late_init and code.\n");
     printf("  -avg:                           Selects the arithmetic mean as the aggregate function.\n");
     printf("  -median:                        Selects the median as the aggregate function.\n");
     printf("  -min:                           Selects the minimum as the aggregate function.\n");
@@ -80,6 +81,7 @@ int main(int argc, char **argv) {
         {"warm_up_count", required_argument, 0, 'w'},
         {"initial_warm_up_count", required_argument, 0, 'a'},
         {"alignment_offset", required_argument, 0, 'm'},
+        {"df", no_argument, &drain_frontend, 1},
         {"avg", no_argument, &aggregate_function, AVG_20_80},
         {"median", no_argument, &aggregate_function, MED},
         {"min", no_argument, &aggregate_function, MIN},