improved drain_frontend

2025-12-13 10:10:04 +01:00 · 2022-01-12 20:33:09 +01:00
parent a1ad0fbf47
commit 39f97457c6
3 changed files with 20 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -152,7 +152,7 @@ Both `nanoBench.sh` and `kernel-nanoBench.sh` support the following command-line
 | `-basic_mode`                | The effect of this option is described in the [Generated Code](#generated-code) section. |
 | `-no_mem`                    | If this option is enabled, the code for `read_perf_ctrs` does not make any memory accesses and stores all performance counter values in registers. This can, for example, be useful for benchmarks that require that the state of the data caches does not change after the execution of `code_init`. *If this option is used, the code to be benchmarked must not modify registers* ***R8-R11 (Intel)*** *and* ***R8-R13 (AMD).*** *Furthermore, `read_perf_ctrs` will modify* ***RAX, RCX, and RDX***. |
 | `-no_normalization`          | If this option is enabled, the measurement results are not divided by the number of repetitions. |
-| `-df`                        | If this option is enabled, the front-end buffers are drained after `code_init`, after `code_late_init`, and after the last instance of `code` by executing a long sequence of 15-Byte `NOP` instructions. |
+| `-df`                        | If this option is enabled, the front-end buffers are drained after `code_init`, after `code_late_init`, and after the last instance of `code` by executing an `LFENCE` instruction, followed by a long sequence of 1-Byte `NOP` instructions and then a long sequence of 15-Byte `NOP` instructions. |
 | `-cpu <n>`                   | Pins the measurement thread to CPU n. `[Default: Pin the thread to the CPU it is currently running on.]` |
 | `-verbose`                   | Outputs the results of all performance counter readings. In the user-space version, the results are printed to stdout. The output of the kernel module can be accessed using `dmesg`. |

--- a/common/nanoBench.c
+++ b/common/nanoBench.c
@@ -453,7 +453,6 @@ size_t configure_perf_ctrs_programmable(size_t next_pfc_config, bool usr, bool o
                continue;
            }
            if (((avoid_counters >> i) & 1) && (config.ctr != i)) {
-                print_error("avoiding %d", i);
                continue;
            }
            next_pfc_config++;
@@ -491,7 +490,7 @@ size_t get_required_runtime_code_length() {
            req_code_length += 100;
        }
    }
-    return code_init_length + code_late_init_length + (drain_frontend?3*64*15:0) + 2*unroll_count*req_code_length + 10000;
+    return code_init_length + code_late_init_length + (drain_frontend?3*(192+64*15):0) + 2*unroll_count*req_code_length + 10000;
 }

 int get_distance_to_code(char* measurement_template, size_t templateI) {
@@ -541,6 +540,9 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo

            if (drain_frontend) {
                strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
+                for (int i=0; i<192; i++) {
+                    strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
+                }
                for (int i=0; i<64; i++) {
                    strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
                }
@@ -562,19 +564,25 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
            templateI += 8;

            if (unrollI == 0 && codeI == 0) {
-                rcI_code_start = rcI;
-
                if (code_late_init_length > 0) {
                    memcpy(&runtime_code[rcI], code_late_init, code_late_init_length);
                    rcI += code_late_init_length;
                }

                if (drain_frontend) {
-                    // the length of the following code sequence is a multiple of 64, and thus doesn't affect the alignment
+                    // We first execute an lfence instruction; we then fill the front-end buffers with 1-Byte NOPs, and finally drain the buffers using
+                    // 15-Byte NOPs. This ensures that, before the first 15-Byte NOP is predecoded, the front-end buffers contain only NOPs that can be
+                    // issued at the maximum rate. The added instructions total 3 + 189 + 64*15 = 1152 bytes, a multiple of 64, and thus don't affect the alignment.
+                    strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
+                    for (int i=0; i<189; i++) {
+                        strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
+                    }
                    for (int i=0; i<64; i++) {
                        strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
                    }
                }
+
+                rcI_code_start = rcI;
            }

            if (!code_contains_magic_bytes) {
@@ -611,8 +619,11 @@ void create_runtime_code(char* measurement_template, long local_unroll_count, lo
                }

                if (drain_frontend) {
-                    // add an lfence and 64 nops s.t. the front end gets drained and the following instruction begins on a 32-byte boundary.
+                    // We add an lfence followed by NOP instructions such that the front end gets drained and the following instruction begins on a 32-byte boundary.
                    strcpy(&runtime_code[rcI], "\x0F\xAE\xE8"); rcI += 3; // lfence
+                    for (int i=0; i<189; i++) {
+                        strcpy(&runtime_code[rcI], NOPS[1]); rcI += 1;
+                    }

                    for (int i=0; i<61; i++) {
                        strcpy(&runtime_code[rcI], NOPS[15]); rcI += 15;
--- a/common/nanoBench.h
+++ b/common/nanoBench.h
@@ -105,7 +105,8 @@ extern long initial_warm_up_count;
 extern size_t alignment_offset;
 #define ALIGNMENT_OFFSET_DEFAULT 0;

-// If enabled, the front-end buffers are drained between code_late_init and code by executing a sequence of 128 15-Byte NOP instructions.
+// If enabled, the front-end buffers are drained after code_init, after code_late_init, and after the last instance of code by executing an lfence, followed
+// by a long sequence of 1-Byte NOPs, followed by a long sequence of 15-Byte NOPs.
 extern int drain_frontend;
 #define DRAIN_FRONTEND_DEFAULT false;