From 4052187660cacf69612098a4e9aed67bf0df4e8b Mon Sep 17 00:00:00 2001 From: Jan Laukemann Date: Sat, 22 Jul 2017 18:33:58 +0200 Subject: [PATCH] added some new testcases --- testcases/inc-r64-TP.S | 111 +++++++++++++++++++++++++++ testcases/inc-r64.S | 111 +++++++++++++++++++++++++++ testcases/lea-r32_mem-TP.S | 102 ++++++++++++++++++++++++ testcases/lea-r32_mem.S | 102 ++++++++++++++++++++++++ testcases/vaddss-xmm_xmm_mem-TP.S | 76 ++++++++++++++++++ testcases/vaddss-xmm_xmm_mem.S | 76 ++++++++++++++++++ testcases/vaddss-xmm_xmm_xmm-TP.S | 76 ++++++++++++++++++ testcases/vaddss-xmm_xmm_xmm.S | 76 ++++++++++++++++++ testcases/vcvtsi2ss-xmm_xmm_r32-TP.S | 109 ++++++++++++++++++++++++++ testcases/vcvtsi2ss-xmm_xmm_r32.S | 109 ++++++++++++++++++++++++++ testcases/vmovss-mem_xmm-TP.S | 76 ++++++++++++++++++ testcases/vmovss-mem_xmm.S | 76 ++++++++++++++++++ testcases/vmovss-xmm_mem-TP.S | 69 +++++++++++++++++ testcases/vmovss-xmm_mem.S | 69 +++++++++++++++++ testcases/vmulss-xmm_xmm_xmm-TP.S | 76 ++++++++++++++++++ testcases/vmulss-xmm_xmm_xmm.S | 76 ++++++++++++++++++ testcases/vxorps-xmm_xmm_xmm-TP.S | 76 ++++++++++++++++++ testcases/vxorps-xmm_xmm_xmm.S | 76 ++++++++++++++++++ testcases/xor-r32_r32-TP.S | 111 +++++++++++++++++++++++++++ testcases/xor-r32_r32.S | 111 +++++++++++++++++++++++++++ 20 files changed, 1764 insertions(+) create mode 100644 testcases/inc-r64-TP.S create mode 100644 testcases/inc-r64.S create mode 100644 testcases/lea-r32_mem-TP.S create mode 100644 testcases/lea-r32_mem.S create mode 100644 testcases/vaddss-xmm_xmm_mem-TP.S create mode 100644 testcases/vaddss-xmm_xmm_mem.S create mode 100644 testcases/vaddss-xmm_xmm_xmm-TP.S create mode 100644 testcases/vaddss-xmm_xmm_xmm.S create mode 100644 testcases/vcvtsi2ss-xmm_xmm_r32-TP.S create mode 100644 testcases/vcvtsi2ss-xmm_xmm_r32.S create mode 100644 testcases/vmovss-mem_xmm-TP.S create mode 100644 testcases/vmovss-mem_xmm.S create mode 100644 testcases/vmovss-xmm_mem-TP.S create mode 100644 
testcases/vmovss-xmm_mem.S create mode 100644 testcases/vmulss-xmm_xmm_xmm-TP.S create mode 100644 testcases/vmulss-xmm_xmm_xmm.S create mode 100644 testcases/vxorps-xmm_xmm_xmm-TP.S create mode 100644 testcases/vxorps-xmm_xmm_xmm.S create mode 100644 testcases/xor-r32_r32-TP.S create mode 100644 testcases/xor-r32_r32.S diff --git a/testcases/inc-r64-TP.S b/testcases/inc-r64-TP.S new file mode 100644 index 0000000..62b649d --- /dev/null +++ b/testcases/inc-r64-TP.S @@ -0,0 +1,111 @@ +#define INSTR inc +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy the DP 1.0 bit pattern into rax and rbx + vmovq rax, xmm0 + vmovq rbx, xmm0 + # integer add: rbx = 2 * 0x3FF0000000000000 (this is not the encoding of DP 2.0) + add rbx, rax + # integer divide of rdx:rax by rax: rax = 1, rdx = 0 (this is not DP 0.5) + div rax + movq rcx, rax + vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax +loop: + inc i + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR
r14 + INSTR r15 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r64.S b/testcases/inc-r64.S new file mode 100644 index 0000000..095248a --- /dev/null +++ b/testcases/inc-r64.S @@ -0,0 +1,111 @@ +#define INSTR inc +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + 
pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/lea-r32_mem-TP.S b/testcases/lea-r32_mem-TP.S new file mode 100644 index 0000000..c5becb3 --- /dev/null +++ b/testcases/lea-r32_mem-TP.S @@ -0,0 +1,102 @@ +#define INSTR lea +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d,
[rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/lea-r32_mem.S b/testcases/lea-r32_mem.S new file mode 100644 index 0000000..1a7bf5e --- /dev/null +++ b/testcases/lea-r32_mem.S @@ -0,0 +1,102 @@ +#define INSTR lea +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] +
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_mem-TP.S b/testcases/vaddss-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..7b00116 --- /dev/null +++ b/testcases/vaddss-xmm_xmm_mem-TP.S @@ -0,0 +1,76 @@ +#define INSTR vaddss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # NOTE: the low 32 bits of DP 1.0, 2.0 and 0.5 are all zero, so a scalar single-precision INSTR reads 0.0 from xmm0..xmm2 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] +
INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_mem.S b/testcases/vaddss-xmm_xmm_mem.S new file mode 100644 index 0000000..5a7d8b1 --- /dev/null +++ b/testcases/vaddss-xmm_xmm_mem.S @@ -0,0 +1,76 @@ +#define INSTR vaddss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # NOTE: the low 32 bits of DP 1.0, 2.0 and 0.5 are all zero, so a scalar single-precision INSTR reads 0.0 from xmm0..xmm2 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1,
xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_xmm-TP.S b/testcases/vaddss-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..00ffeb6 --- /dev/null +++ b/testcases/vaddss-xmm_xmm_xmm-TP.S @@ -0,0 +1,76 @@ +#define INSTR vaddss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # NOTE: the low 32 bits of DP 1.0, 2.0 and 0.5 are all zero, so a scalar single-precision INSTR reads 0.0 from xmm0..xmm2 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR
xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_xmm.S b/testcases/vaddss-xmm_xmm_xmm.S new file mode 100644 index 0000000..550fc3e --- /dev/null +++ b/testcases/vaddss-xmm_xmm_xmm.S @@ -0,0 +1,76 @@ +#define INSTR vaddss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # NOTE: the low 32 bits of DP 1.0, 2.0 and 0.5 are all zero, so a scalar single-precision INSTR reads 0.0 from xmm0..xmm2 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S b/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S new file mode 100644 index 0000000..dd8bece --- /dev/null +++ b/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S @@ -0,0 +1,109 @@ +#define INSTR vcvtsi2ss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, eax + INSTR xmm4, xmm1, ebx + INSTR xmm5, xmm2, ecx + INSTR
xmm6, xmm0, eax + INSTR xmm7, xmm1, ebx + INSTR xmm8, xmm2, ecx + INSTR xmm9, xmm0, eax + INSTR xmm10, xmm1, ebx + INSTR xmm11, xmm2, ecx + INSTR xmm12, xmm0, eax + INSTR xmm13, xmm1, ebx + INSTR xmm14, xmm2, ecx + INSTR xmm15, xmm0, eax + INSTR xmm3, xmm1, ebx + INSTR xmm4, xmm2, ecx + INSTR xmm5, xmm0, eax + INSTR xmm6, xmm1, ebx + INSTR xmm7, xmm2, ecx + INSTR xmm8, xmm0, eax + INSTR xmm9, xmm1, ebx + INSTR xmm10, xmm2, ecx + INSTR xmm11, xmm0, eax + INSTR xmm12, xmm1, ebx + INSTR xmm13, xmm2, ecx + INSTR xmm14, xmm0, eax + INSTR xmm15, xmm1, ebx + INSTR xmm3, xmm2, ecx + INSTR xmm4, xmm0, eax + INSTR xmm5, xmm1, ebx + INSTR xmm6, xmm2, ecx + INSTR xmm7, xmm0, eax + INSTR xmm8, xmm1, ebx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vcvtsi2ss-xmm_xmm_r32.S b/testcases/vcvtsi2ss-xmm_xmm_r32.S new file mode 100644 index 0000000..862f951 --- /dev/null +++ b/testcases/vcvtsi2ss-xmm_xmm_r32.S @@ -0,0 +1,109 @@ +#define INSTR vcvtsi2ss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor
rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-mem_xmm-TP.S b/testcases/vmovss-mem_xmm-TP.S new file mode 100644 index 0000000..57a23e2 --- /dev/null +++ b/testcases/vmovss-mem_xmm-TP.S @@ -0,0 +1,76 @@ +#define INSTR vmovss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq
xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-mem_xmm.S b/testcases/vmovss-mem_xmm.S new file mode 100644 index 0000000..c99d6cc --- /dev/null +++ b/testcases/vmovss-mem_xmm.S @@ -0,0 +1,76 @@ +#define INSTR vmovss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0)
+ # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-xmm_mem-TP.S b/testcases/vmovss-xmm_mem-TP.S new file mode 100644 index 0000000..74d3d83 --- /dev/null +++ b/testcases/vmovss-xmm_mem-TP.S @@ -0,0 +1,69 @@ +#define INSTR vmovss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR
xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-xmm_mem.S b/testcases/vmovss-xmm_mem.S new file mode 100644 index 0000000..f553695 --- /dev/null +++ b/testcases/vmovss-xmm_mem.S @@ -0,0 +1,69 @@ +#define INSTR vmovss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0,
[rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulss-xmm_xmm_xmm-TP.S b/testcases/vmulss-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..e865ed8 --- /dev/null +++ b/testcases/vmulss-xmm_xmm_xmm-TP.S @@ -0,0 +1,76 @@ +#define INSTR vmulss +#define NINST 32 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: # runs the loop body (NINST copies of INSTR) until i reaches N; returns immediately when N is not positive + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift by 54: only the top 10 bits of each qword stay set (54 = 64 - 10) + vpsrlq xmm0, xmm0, 2 # logical right shift by 2: clears the sign bit and the top exponent bit, leaving 0x3FF0000000000000 (DP 1.0) + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 # NOTE: the low 32 bits of DP 1.0, 2.0 and 0.5 are all zero, so a scalar single-precision INSTR reads 0.0 from xmm0..xmm2 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1,
xmm1
+    INSTR xmm4, xmm2, xmm2
+    INSTR xmm5, xmm0, xmm0
+    INSTR xmm6, xmm1, xmm1
+    INSTR xmm7, xmm2, xmm2
+    INSTR xmm8, xmm0, xmm0
+    INSTR xmm9, xmm1, xmm1
+    INSTR xmm10, xmm2, xmm2
+    INSTR xmm11, xmm0, xmm0
+    INSTR xmm12, xmm1, xmm1
+    INSTR xmm13, xmm2, xmm2
+    INSTR xmm14, xmm0, xmm0
+    INSTR xmm15, xmm1, xmm1
+    INSTR xmm3, xmm2, xmm2
+    INSTR xmm4, xmm0, xmm0
+    INSTR xmm5, xmm1, xmm1
+    INSTR xmm6, xmm2, xmm2
+    INSTR xmm7, xmm0, xmm0
+    INSTR xmm8, xmm1, xmm1
+    cmp i, N
+    jl loop
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/testcases/vmulss-xmm_xmm_xmm.S b/testcases/vmulss-xmm_xmm_xmm.S
new file mode 100644
index 0000000..f91adc3
--- /dev/null
+++ b/testcases/vmulss-xmm_xmm_xmm.S
@@ -0,0 +1,76 @@
+#define INSTR vmulss    /* instruction under test */
+#define NINST 32        /* matches the 32 INSTR lines in the loop body */
+#define N edi           /* loop bound; NOTE(review): presumably the first integer argument - confirm against the caller */
+#define i r8d           /* loop counter */
+/* Latency kernel: latency() runs the dependent INSTR chain below once per loop pass, */
+/* until the counter reaches the bound; the loop is skipped entirely when the bound is <= 0. */
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:                  # exported 32-bit word holding the unrolled instruction count
+.long NINST
+.align 32
+PI:                     # not referenced by the code in this file
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i                        # counter starts at zero
+    test N, N
+    jle done                        # signed check: nothing to do for a bound <= 0
+    # build the double-precision bit pattern of 1.0 in both 64-bit lanes of xmm0
+    vpcmpeqw xmm0, xmm0, xmm0       # all ones
+    vpsllq xmm0, xmm0, 54           # keep only the top 10 bits of each lane: 0xFFC0000000000000
+    vpsrlq xmm0, xmm0, 2            # 0x3FF0000000000000: sign bit 0, biased exponent 0x3FF, mantissa 0
+    # copy 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself and changes nothing)
+    vmovaps xmm0, xmm0
+    vmovaps xmm1, xmm0
+    # xmm1 = 1.0 + 1.0 = 2.0 (packed double)
+    vaddpd xmm1, xmm1, xmm1
+    # xmm2 = 1.0 / 2.0 = 0.5 (packed double); xmm2 is not read by the loop below
+    # NOTE(review): these are double-precision patterns whose low 32 bits are zero, while the
+    # instruction under test is named as a scalar single-precision op - confirm the operands are intended.
+    vdivpd xmm2, xmm0, xmm1
+loop:
+    inc i
+    INSTR xmm0, xmm1, xmm0          # writes xmm0; reads xmm1 and xmm0
+    INSTR xmm1, xmm0, xmm0          # writes xmm1; reads the xmm0 written just above
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm0
+    cmp i, N
+    jl loop                         # signed compare: repeat while counter < bound
+done:
+    mov rsp, rbp                    # undo the frame set up at entry
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/testcases/vxorps-xmm_xmm_xmm-TP.S b/testcases/vxorps-xmm_xmm_xmm-TP.S
new file mode 100644
index 0000000..d71e189
--- /dev/null
+++ b/testcases/vxorps-xmm_xmm_xmm-TP.S
@@ -0,0 +1,76 @@
+#define INSTR vxorps
+#define NINST 32
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i
+    test N, N
+    jle done
+    # create DP 1.0
+    vpcmpeqw xmm0, xmm0, xmm0 # all ones
+    vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
+    vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
+    # copy DP 1.0
+    vmovaps xmm0, xmm0
+    vmovaps xmm1, xmm0
+    # Create DP 2.0
+    vaddpd xmm1, xmm1, xmm1
+    # Create DP 0.5
+    vdivpd xmm2, xmm0, xmm1
+loop:
+    inc i
+    INSTR xmm3, xmm0, xmm0
+    INSTR xmm4, xmm1, xmm1
+    INSTR xmm5, xmm2, xmm2
+    INSTR xmm6, xmm0, xmm0
+    INSTR xmm7, xmm1, xmm1
+    INSTR xmm8, xmm2, xmm2
+    INSTR xmm9, xmm0, xmm0
+    INSTR xmm10, xmm1, xmm1
+    INSTR xmm11, xmm2, xmm2
+    INSTR xmm12, xmm0, xmm0
+    INSTR xmm13, xmm1, xmm1
+    INSTR xmm14, xmm2, xmm2
+    INSTR xmm15, xmm0, xmm0
+    INSTR xmm3, xmm1, xmm1
+    INSTR xmm4, xmm2, xmm2
+    INSTR
xmm5, xmm0, xmm0
+    INSTR xmm6, xmm1, xmm1
+    INSTR xmm7, xmm2, xmm2
+    INSTR xmm8, xmm0, xmm0
+    INSTR xmm9, xmm1, xmm1
+    INSTR xmm10, xmm2, xmm2
+    INSTR xmm11, xmm0, xmm0
+    INSTR xmm12, xmm1, xmm1
+    INSTR xmm13, xmm2, xmm2
+    INSTR xmm14, xmm0, xmm0
+    INSTR xmm15, xmm1, xmm1
+    INSTR xmm3, xmm2, xmm2
+    INSTR xmm4, xmm0, xmm0
+    INSTR xmm5, xmm1, xmm1
+    INSTR xmm6, xmm2, xmm2
+    INSTR xmm7, xmm0, xmm0
+    INSTR xmm8, xmm1, xmm1
+    cmp i, N
+    jl loop
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/testcases/vxorps-xmm_xmm_xmm.S b/testcases/vxorps-xmm_xmm_xmm.S
new file mode 100644
index 0000000..a8314c5
--- /dev/null
+++ b/testcases/vxorps-xmm_xmm_xmm.S
@@ -0,0 +1,76 @@
+#define INSTR vxorps    /* instruction under test */
+#define NINST 32        /* matches the 32 INSTR lines in the loop body */
+#define N edi           /* loop bound; NOTE(review): presumably the first integer argument - confirm against the caller */
+#define i r8d           /* loop counter */
+/* Latency kernel: the loop is one dependent chain; each INSTR reads the result of the previous one. */
+/* Sources always differ: vxorps with identical sources is a zeroing idiom and would break the chain. */
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:                  # exported 32-bit word holding the unrolled instruction count
+.long NINST
+.align 32
+PI:                     # not referenced by the code in this file
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i                        # counter starts at zero
+    test N, N
+    jle done                        # signed check: nothing to do for a bound <= 0
+    # build the double-precision bit pattern of 1.0 in both 64-bit lanes of xmm0
+    vpcmpeqw xmm0, xmm0, xmm0       # all ones
+    vpsllq xmm0, xmm0, 54           # keep only the top 10 bits of each lane: 0xFFC0000000000000
+    vpsrlq xmm0, xmm0, 2            # 0x3FF0000000000000: sign bit 0, biased exponent 0x3FF, mantissa 0
+    # copy 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself and changes nothing)
+    vmovaps xmm0, xmm0
+    vmovaps xmm1, xmm0
+    # xmm1 = 1.0 + 1.0 = 2.0 (packed double)
+    vaddpd xmm1, xmm1, xmm1
+    # xmm2 = 1.0 / 2.0 = 0.5 (packed double); xmm2 is not read by the loop below
+    vdivpd xmm2, xmm0, xmm1
+loop:
+    inc i
+    INSTR xmm0, xmm1, xmm0          # xmm0 = xmm1 ^ xmm0; waits for the xmm1 written by the previous INSTR
+    INSTR xmm1, xmm0, xmm1          # xmm1 = xmm0 ^ xmm1; waits for the xmm0 written just above
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    INSTR xmm0, xmm1, xmm0
+    INSTR xmm1, xmm0, xmm1
+    cmp i, N
+    jl loop                         # signed compare: repeat while counter < bound
+done:
+    mov rsp, rbp                    # undo the frame set up at entry
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/testcases/xor-r32_r32-TP.S b/testcases/xor-r32_r32-TP.S
new file mode 100644
index 0000000..5e122f7
--- /dev/null
+++ b/testcases/xor-r32_r32-TP.S
@@ -0,0 +1,111 @@
+#define INSTR xor
+#define NINST 32
+#define N edi
+#define i r8d
+
+
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:
+.long NINST
+.align 32
+PI:
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i
+    test N, N
+    jle done
+    # create DP 1.0
+    vpcmpeqw xmm0, xmm0, xmm0 # all ones
+    vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
+    vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
+    push rax
+    push rbx
+    push rcx
+    push rdx
+    push r9
+    push r10
+    push r11
+    push r12
+    push r13
+    push r14
+    push r15
+    xor rax, rax
+    xor rbx, rbx
+    xor rcx, rcx
+    xor rdx, rdx
+    xor r9, r9
+    xor r10, r10
+    xor r11, r11
+    xor r12, r12
+    xor r13, r13
+    xor r14, r14
+    xor r15, r15
+    # copy DP 1.0
+    vmovq rax, xmm0
+    vmovq rbx, xmm0
+    # Create DP 2.0
+    add rbx, rax
+    # Create DP 0.5
+    div rax
+    movq rcx, rax
+    vmovq rax, xmm0
+loop:
+    inc i
+    INSTR edx, eax
+    INSTR r9d, ebx
+    INSTR r10d, ecx
+    INSTR r11d, eax
+    INSTR r12d, ebx
+    INSTR r13d, ecx
+    INSTR r14d, eax
+    INSTR r15d, ebx
+    INSTR edx, ecx
+    INSTR
r9d, eax
+    INSTR r10d, ebx
+    INSTR r11d, ecx
+    INSTR r12d, eax
+    INSTR r13d, ebx
+    INSTR r14d, ecx
+    INSTR r15d, eax
+    INSTR edx, ebx
+    INSTR r9d, ecx
+    INSTR r10d, eax
+    INSTR r11d, ebx
+    INSTR r12d, ecx
+    INSTR r13d, eax
+    INSTR r14d, ebx
+    INSTR r15d, ecx
+    INSTR edx, eax
+    INSTR r9d, ebx
+    INSTR r10d, ecx
+    INSTR r11d, eax
+    INSTR r12d, ebx
+    INSTR r13d, ecx
+    INSTR r14d, eax
+    INSTR r15d, ebx
+    cmp i, N
+    jl loop
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    pop r9
+    pop rdx
+    pop rcx
+    pop rbx
+    pop rax
+done:
+    mov rsp, rbp
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file
diff --git a/testcases/xor-r32_r32.S b/testcases/xor-r32_r32.S
new file mode 100644
index 0000000..b1e71b7
--- /dev/null
+++ b/testcases/xor-r32_r32.S
@@ -0,0 +1,111 @@
+#define INSTR xor       /* instruction under test */
+#define NINST 32        /* matches the 32 INSTR lines in the loop body */
+#define N edi           /* loop bound; NOTE(review): presumably the first integer argument - confirm against the caller */
+#define i r8d           /* loop counter */
+/* Latency kernel: the loop alternates INSTR eax, ebx and INSTR ebx, eax, so every INSTR */
+/* reads the register the previous one wrote. The loop is skipped when the bound is <= 0. */
+.intel_syntax noprefix
+.globl ninst
+.data
+ninst:                  # exported 32-bit word holding the unrolled instruction count
+.long NINST
+.align 32
+PI:                     # not referenced by the code in this file
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
+.text
+.globl latency
+.type latency, @function
+.align 32
+latency:
+    push rbp
+    mov rbp, rsp
+    xor i, i                        # counter starts at zero
+    test N, N
+    jle done                        # taken before any register is pushed, so done: has nothing to pop
+    # build the bit pattern 0x3FF0000000000000 (double-precision 1.0) in both 64-bit lanes of xmm0
+    vpcmpeqw xmm0, xmm0, xmm0       # all ones
+    vpsllq xmm0, xmm0, 54           # keep only the top 10 bits of each lane: 0xFFC0000000000000
+    vpsrlq xmm0, xmm0, 2            # 0x3FF0000000000000
+    push rax                        # save every general-purpose register the setup below overwrites
+    push rbx
+    push rcx
+    push rdx
+    push r9
+    push r10
+    push r11
+    push r12
+    push r13
+    push r14
+    push r15
+    xor rax, rax                    # zero all of them; only eax and ebx are used by the loop in this file
+    xor rbx, rbx
+    xor rcx, rcx
+    xor rdx, rdx
+    xor r9, r9
+    xor r10, r10
+    xor r11, r11
+    xor r12, r12
+    xor r13, r13
+    xor r14, r14
+    xor r15, r15
+    # load the low 64 bits of xmm0 (0x3FF0000000000000) into rax and rbx
+    vmovq rax, xmm0
+    vmovq rbx, xmm0
+    # integer add of the two bit patterns: rbx = 0x7FE0000000000000 (this is not a floating-point 2.0)
+    add rbx, rax
+    # unsigned divide of rdx:rax by rax with rdx zeroed above: quotient rax = 1, remainder rdx = 0
+    div rax
+    movq rcx, rax                   # rcx = 1; not read by the loop below
+    vmovq rax, xmm0                 # reload rax, which the divide overwrote
+loop:
+    inc i
+    INSTR eax, ebx                  # eax ^= ebx; reads the ebx written by the previous INSTR
+    INSTR ebx, eax                  # ebx ^= eax; reads the eax written just above
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    INSTR eax, ebx
+    INSTR ebx, eax
+    cmp i, N
+    jl loop                         # signed compare: repeat while counter < bound
+    pop r15                         # restore in reverse push order
+    pop r14
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    pop r9
+    pop rdx
+    pop rcx
+    pop rbx
+    pop rax
+done:
+    mov rsp, rbp                    # undo the frame set up at entry
+    pop rbp
+    ret
+.size latency, .-latency
\ No newline at end of file