diff --git a/testcases/add-rr-TP.S b/testcases/add-rr-TP.S deleted file mode 100644 index 4f40830..0000000 --- a/testcases/add-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR add -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR edx, eax - INSTR r9d, ebx - INSTR r10d, ecx - INSTR edx, eax - INSTR r9d, ebx - INSTR r10d, ecx - INSTR r11d, eax - INSTR r12d, ebx - INSTR r13d, ecx - INSTR r14d, eax - INSTR r15d, ebx - INSTR eax, ecx - INSTR ebx, eax - INSTR ecx, ebx - INSTR edx, ecx - INSTR r9d, eax - INSTR r10d, ebx - INSTR r11d, ecx - INSTR r12d, eax - INSTR r13d, ebx - INSTR r14d, ecx - INSTR r15d, eax - INSTR eax, ebx - INSTR ebx, ecx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-rr.S b/testcases/add-rr.S deleted file mode 100644 index 1dc4adf..0000000 --- a/testcases/add-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR add -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - INSTR eax, ebx - INSTR ebx, eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-rr-TP.S b/testcases/cmp-rr-TP.S deleted file mode 100644 index d2b943a..0000000 --- a/testcases/cmp-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR cmp -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR r11, rax - INSTR r12, rbx - INSTR r13, rcx - INSTR r14, rax - INSTR r15, rbx - INSTR rax, rcx - INSTR rbx, rax - INSTR rcx, rbx - INSTR rdx, rcx - INSTR r9, rax - INSTR r10, rbx - INSTR r11, rcx - INSTR r12, rax - INSTR r13, rbx - INSTR r14, rcx - INSTR r15, rax - INSTR rax, rbx - INSTR rbx, rcx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-rr.S b/testcases/cmp-rr.S deleted file mode 100644 index 7e5ee2c..0000000 --- a/testcases/cmp-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR cmp -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/dec-r-TP.S b/testcases/dec-r-TP.S deleted file mode 100644 index a281110..0000000 --- a/testcases/dec-r-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR dec -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR edx - INSTR r9d - INSTR r10d - INSTR edx - INSTR r9d - INSTR r10d - INSTR r11d - INSTR r12d - INSTR r13d - INSTR r14d - INSTR r15d - INSTR eax - INSTR ebx - INSTR ecx - INSTR edx - INSTR r9d - INSTR r10d - INSTR r11d - INSTR r12d - INSTR r13d - INSTR r14d - INSTR r15d - INSTR eax - INSTR ebx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/dec-r.S b/testcases/dec-r.S deleted file mode 100644 index 53cf598..0000000 --- a/testcases/dec-r.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR dec -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r-TP.S b/testcases/inc-r-TP.S deleted file mode 100644 index 8c57e5e..0000000 --- a/testcases/inc-r-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR inc -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx - INSTR r9 - INSTR r10 - INSTR rdx - INSTR r9 - INSTR r10 - INSTR r11 - INSTR r12 - INSTR r13 - INSTR r14 - INSTR r15 - INSTR rax - INSTR rbx - INSTR rcx - INSTR rdx - INSTR r9 - INSTR r10 - INSTR r11 - INSTR r12 - INSTR r13 - INSTR r14 - INSTR r15 - INSTR rax - INSTR rbx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r.S b/testcases/inc-r.S deleted file mode 100644 index 4f918c7..0000000 --- a/testcases/inc-r.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR inc -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-r64r32-TP.S b/testcases/janadd-r64r32-TP.S deleted file mode 100644 index f6fd008..0000000 --- a/testcases/janadd-r64r32-TP.S +++ /dev/null @@ -1,82 +0,0 @@ -#define INSTR janadd -#define NINST 6 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-r64r32.S b/testcases/janadd-r64r32.S deleted file mode 100644 index 5568030..0000000 --- a/testcases/janadd-r64r32.S +++ /dev/null @@ -1,82 +0,0 @@ -#define INSTR janadd -#define NINST 6 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-rr-TP.S b/testcases/janadd-rr-TP.S deleted file mode 100644 index f6fd008..0000000 --- a/testcases/janadd-rr-TP.S +++ /dev/null @@ -1,82 +0,0 @@ -#define INSTR janadd -#define NINST 6 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-rr.S b/testcases/janadd-rr.S deleted file mode 100644 index 5568030..0000000 --- a/testcases/janadd-rr.S +++ /dev/null @@ -1,82 +0,0 @@ -#define INSTR janadd -#define NINST 6 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-rr-TP.S b/testcases/mov-rr-TP.S deleted file mode 100644 index 72872bc..0000000 --- a/testcases/mov-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR mov -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR r11, rax - INSTR r12, rbx - INSTR r13, rcx - INSTR r14, rax - INSTR r15, rbx - INSTR rax, rcx - INSTR rbx, rax - INSTR rcx, rbx - INSTR rdx, rcx - INSTR r9, rax - INSTR r10, rbx - INSTR r11, rcx - INSTR r12, rax - INSTR r13, rbx - INSTR r14, rcx - INSTR r15, rax - INSTR rax, rbx - INSTR rbx, rcx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-rr.S b/testcases/mov-rr.S deleted file mode 100644 index b15c313..0000000 --- a/testcases/mov-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR mov -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-rr-TP.S b/testcases/movslq-rr-TP.S deleted file mode 100644 index 5ee7352..0000000 --- a/testcases/movslq-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR movslq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - INSTR rdx, eax - INSTR r9, ebx - INSTR r10, ecx - INSTR r11, eax - INSTR r12, ebx - INSTR r13, ecx - INSTR r14, eax - INSTR r15, ebx - INSTR rax, ecx - INSTR rbx, eax - INSTR rcx, ebx - INSTR rdx, ecx - INSTR r9, eax - INSTR r10, ebx - INSTR r11, ecx - INSTR r12, eax - INSTR r13, ebx - INSTR r14, ecx - INSTR r15, eax - INSTR rax, ebx - INSTR rbx, ecx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-rr.S b/testcases/movslq-rr.S deleted file mode 100644 index b7f3825..0000000 --- a/testcases/movslq-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR movslq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - INSTR rax, eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-rr-TP.S b/testcases/movzbl-rr-TP.S deleted file mode 100644 index f883521..0000000 --- a/testcases/movzbl-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR movzbl -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR edx, al - INSTR r9d, bl - INSTR r10d, cl - INSTR edx, al - INSTR r9d, bl - INSTR r10d, cl - INSTR r11d, al - INSTR r12d, bl - INSTR r13d, cl - INSTR r14d, al - INSTR r15d, bl - INSTR eax, cl - INSTR ebx, al - INSTR ecx, bl - INSTR edx, cl - INSTR r9d, al - INSTR r10d, bl - INSTR r11d, cl - INSTR r12d, al - INSTR r13d, bl - INSTR r14d, cl - INSTR r15d, al - INSTR eax, bl - INSTR ebx, cl - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-rr.S b/testcases/movzbl-rr.S deleted file mode 100644 index 0028005..0000000 --- a/testcases/movzbl-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR movzbl -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - INSTR eax, al - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r-TP.S b/testcases/neg-r-TP.S deleted file mode 100644 index b93faeb..0000000 --- a/testcases/neg-r-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR neg -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR edx - INSTR r9d - INSTR r10d - INSTR edx - INSTR r9d - INSTR r10d - INSTR r11d - INSTR r12d - INSTR r13d - INSTR r14d - INSTR r15d - INSTR eax - INSTR ebx - INSTR ecx - INSTR edx - INSTR r9d - INSTR r10d - INSTR r11d - INSTR r12d - INSTR r13d - INSTR r14d - INSTR r15d - INSTR eax - INSTR ebx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r.S b/testcases/neg-r.S deleted file mode 100644 index 88c7f6d..0000000 --- a/testcases/neg-r.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR neg -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - INSTR eax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r-TP.S b/testcases/pop-r-TP.S deleted file mode 100644 index 9f676b6..0000000 --- a/testcases/pop-r-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR pop -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx - INSTR r9 - INSTR r10 - INSTR rdx - INSTR r9 - INSTR r10 - INSTR r11 - INSTR r12 - INSTR r13 - INSTR r14 - INSTR r15 - INSTR rax - INSTR rbx - INSTR rcx - INSTR rdx - INSTR r9 - INSTR r10 - INSTR r11 - INSTR r12 - INSTR r13 - INSTR r14 - INSTR r15 - INSTR rax - INSTR rbx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r.S b/testcases/pop-r.S deleted file mode 100644 index 73fcb9a..0000000 --- a/testcases/pop-r.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR pop -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - INSTR rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-rr-TP.S b/testcases/sub-rr-TP.S deleted file mode 100644 index 28fbfc6..0000000 --- a/testcases/sub-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR sub -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR r11, rax - INSTR r12, rbx - INSTR r13, rcx - INSTR r14, rax - INSTR r15, rbx - INSTR rax, rcx - INSTR rbx, rax - INSTR rcx, rbx - INSTR rdx, rcx - INSTR r9, rax - INSTR r10, rbx - INSTR r11, rcx - INSTR r12, rax - INSTR r13, rbx - INSTR r14, rcx - INSTR r15, rax - INSTR rax, rbx - INSTR rbx, rcx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-rr.S b/testcases/sub-rr.S deleted file mode 100644 index 0eb2c63..0000000 --- a/testcases/sub-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR sub -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-rr-TP.S b/testcases/test-rr-TP.S deleted file mode 100644 index 0a7515d..0000000 --- a/testcases/test-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR test -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR rdx, rax - INSTR r9, rbx - INSTR r10, rcx - INSTR r11, rax - INSTR r12, rbx - INSTR r13, rcx - INSTR r14, rax - INSTR r15, rbx - INSTR rax, rcx - INSTR rbx, rax - INSTR rcx, rbx - INSTR rdx, rcx - INSTR r9, rax - INSTR r10, rbx - INSTR r11, rcx - INSTR r12, rax - INSTR r13, rbx - INSTR r14, rcx - INSTR r15, rax - INSTR rax, rbx - INSTR rbx, rcx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-rr.S b/testcases/test-rr.S deleted file mode 100644 index 4a1aa46..0000000 --- a/testcases/test-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR test -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - INSTR rax, rbx - INSTR rbx, rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-avx-ymmymmymm-TP.S b/testcases/vaddpd-avx-ymmymmymm-TP.S deleted file mode 100644 index 88a5fdb..0000000 --- a/testcases/vaddpd-avx-ymmymmymm-TP.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vaddpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm6, ymm0, ymm0 - INSTR ymm7, ymm1, ymm1 - INSTR ymm8, ymm2, ymm2 - INSTR ymm9, ymm0, ymm0 - INSTR ymm10, ymm1, ymm1 - INSTR ymm11, ymm2, ymm2 - INSTR ymm12, ymm0, ymm0 - INSTR ymm13, ymm1, ymm1 - INSTR ymm14, ymm2, ymm2 - INSTR ymm15, ymm0, ymm0 - INSTR ymm16, ymm1, ymm1 - INSTR ymm17, ymm2, ymm2 - INSTR ymm18, ymm0, ymm0 - INSTR ymm19, ymm1, ymm1 - INSTR ymm20, ymm2, ymm2 - INSTR ymm21, ymm0, ymm0 - INSTR ymm22, ymm1, ymm1 - INSTR ymm23, ymm2, ymm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-avx-ymmymmymm.S b/testcases/vaddpd-avx-ymmymmymm.S deleted file mode 100644 index d032dd2..0000000 --- a/testcases/vaddpd-avx-ymmymmymm.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vaddpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmmxmmxmm-TP.S b/testcases/vaddpd-xmmxmmxmm-TP.S deleted file mode 100644 index 45bee34..0000000 --- a/testcases/vaddpd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vaddpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmmxmmxmm.S b/testcases/vaddpd-xmmxmmxmm.S deleted file mode 100644 index bea987d..0000000 --- a/testcases/vaddpd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vaddpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmmxmmxmm-TP.S b/testcases/vaddsd-xmmxmmxmm-TP.S deleted file mode 100644 index 3d04147..0000000 --- a/testcases/vaddsd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vaddsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmmxmmxmm.S b/testcases/vaddsd-xmmxmmxmm.S deleted file mode 100644 index 2090c03..0000000 --- a/testcases/vaddsd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vaddsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-avx-ymmymm-TP.S b/testcases/vmovapd-avx-ymmymm-TP.S deleted file mode 100644 index ff74ba3..0000000 --- a/testcases/vmovapd-avx-ymmymm-TP.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vmovapd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm3, ymm0 - INSTR ymm4, ymm1 - INSTR ymm5, ymm2 - INSTR ymm3, ymm0 - INSTR ymm4, ymm1 - INSTR ymm5, ymm2 - INSTR ymm6, ymm0 - INSTR ymm7, ymm1 - INSTR ymm8, ymm2 - INSTR ymm9, ymm0 - INSTR ymm10, ymm1 - INSTR ymm11, ymm2 - INSTR ymm12, ymm0 - INSTR ymm13, ymm1 - INSTR ymm14, ymm2 - INSTR ymm15, ymm0 - INSTR ymm16, ymm1 - INSTR ymm17, ymm2 - INSTR ymm18, ymm0 - INSTR ymm19, ymm1 - INSTR ymm20, ymm2 - INSTR ymm21, ymm0 - INSTR ymm22, ymm1 - INSTR ymm23, ymm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-avx-ymmymm.S b/testcases/vmovapd-avx-ymmymm.S deleted file mode 100644 index 0396e83..0000000 --- a/testcases/vmovapd-avx-ymmymm.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vmovapd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - INSTR ymm0, ymm1 - INSTR ymm1, ymm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmmxmm-TP.S b/testcases/vmovapd-xmmxmm-TP.S deleted file mode 100644 index acd24a8..0000000 --- a/testcases/vmovapd-xmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovapd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0 - INSTR xmm4, xmm1 - INSTR xmm5, xmm2 - INSTR xmm3, xmm0 - INSTR xmm4, xmm1 - INSTR xmm5, xmm2 - INSTR xmm6, xmm0 - INSTR xmm7, xmm1 - INSTR xmm8, xmm2 - INSTR xmm9, xmm0 - INSTR xmm10, xmm1 - INSTR xmm11, xmm2 - INSTR xmm12, xmm0 - INSTR xmm13, xmm1 - INSTR xmm14, xmm2 - INSTR xmm15, xmm0 - INSTR xmm16, xmm1 - INSTR xmm17, xmm2 - INSTR xmm18, xmm0 - INSTR xmm19, xmm1 - INSTR xmm20, xmm2 - INSTR xmm21, xmm0 - INSTR xmm22, xmm1 - INSTR xmm23, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmmxmm.S b/testcases/vmovapd-xmmxmm.S deleted file mode 100644 index 89b66d6..0000000 --- a/testcases/vmovapd-xmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovapd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmmxmm-TP.S b/testcases/vmovaps-xmmxmm-TP.S deleted file mode 100644 index 959363a..0000000 --- a/testcases/vmovaps-xmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovaps -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0 - INSTR xmm4, xmm1 - INSTR xmm5, xmm2 - INSTR xmm3, xmm0 - INSTR xmm4, xmm1 - INSTR xmm5, xmm2 - INSTR xmm6, xmm0 - INSTR xmm7, xmm1 - INSTR xmm8, xmm2 - INSTR xmm9, xmm0 - INSTR xmm10, xmm1 - INSTR xmm11, xmm2 - INSTR xmm12, xmm0 - INSTR xmm13, xmm1 - INSTR xmm14, xmm2 - INSTR xmm15, xmm0 - INSTR xmm16, xmm1 - INSTR xmm17, xmm2 - INSTR xmm18, xmm0 - INSTR xmm19, xmm1 - INSTR xmm20, xmm2 - INSTR xmm21, xmm0 - INSTR xmm22, xmm1 - INSTR xmm23, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmmxmm.S b/testcases/vmovaps-xmmxmm.S deleted file mode 100644 index 9559f9f..0000000 --- a/testcases/vmovaps-xmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovaps -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - INSTR xmm0, xmm1 - INSTR xmm1, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-rxmm-TP.S b/testcases/vmovq-rxmm-TP.S deleted file mode 100644 index 3b80f46..0000000 --- a/testcases/vmovq-rxmm-TP.S +++ /dev/null @@ -1,98 +0,0 @@ -#define INSTR vmovq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR rdx, xmm0 - INSTR r9, xmm1 - INSTR r10, xmm2 - INSTR rdx, xmm0 - INSTR r9, xmm1 - INSTR r10, xmm2 - INSTR r11, xmm0 - INSTR r12, xmm1 - INSTR r13, xmm2 - INSTR r14, xmm0 - INSTR r15, xmm1 - INSTR rax, xmm2 - INSTR rbx, xmm0 - INSTR rcx, xmm1 - INSTR rdx, xmm2 - INSTR r9, xmm0 - INSTR r10, xmm1 - INSTR r11, xmm2 - INSTR r12, xmm0 - INSTR r13, xmm1 - INSTR r14, xmm2 - INSTR r15, xmm0 - INSTR rax, xmm1 - INSTR rbx, xmm2 - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-rxmm.S b/testcases/vmovq-rxmm.S deleted file mode 100644 index a1d5c05..0000000 --- a/testcases/vmovq-rxmm.S +++ /dev/null @@ -1,98 +0,0 @@ -#define INSTR vmovq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - INSTR rax, xmm0 - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmmr-TP.S b/testcases/vmovq-xmmr-TP.S deleted file mode 100644 index c84e892..0000000 --- a/testcases/vmovq-xmmr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR vmovq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR xmm3, rax - INSTR xmm4, rbx - INSTR xmm5, rcx - INSTR xmm3, rax - INSTR xmm4, rbx - INSTR xmm5, rcx - INSTR xmm6, rax - INSTR xmm7, rbx - INSTR xmm8, rcx - INSTR xmm9, rax - INSTR xmm10, rbx - INSTR xmm11, rcx - INSTR xmm12, rax - INSTR xmm13, rbx - INSTR xmm14, rcx - INSTR xmm15, rax - INSTR xmm16, rbx - INSTR xmm17, rcx - INSTR xmm18, rax - INSTR xmm19, rbx - INSTR xmm20, rcx - INSTR xmm21, rax - INSTR xmm22, rbx - INSTR xmm23, rcx - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmmr.S b/testcases/vmovq-xmmr.S deleted file mode 100644 index 1bfd1ea..0000000 --- a/testcases/vmovq-xmmr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR vmovq -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - INSTR xmm0, rax - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmmxmmxmm-TP.S b/testcases/vmovsd-xmmxmmxmm-TP.S deleted file mode 100644 index cad7071..0000000 --- a/testcases/vmovsd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmmxmmxmm.S b/testcases/vmovsd-xmmxmmxmm.S deleted file mode 100644 index 2bac0f2..0000000 --- a/testcases/vmovsd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmovsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-avx-ymmymmymm-TP.S b/testcases/vmulpd-avx-ymmymmymm-TP.S deleted file mode 100644 index 0b3b1ad..0000000 --- a/testcases/vmulpd-avx-ymmymmymm-TP.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vmulpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm6, ymm0, ymm0 - INSTR ymm7, ymm1, ymm1 - INSTR ymm8, ymm2, ymm2 - INSTR ymm9, ymm0, ymm0 - INSTR ymm10, ymm1, ymm1 - INSTR ymm11, ymm2, ymm2 - INSTR ymm12, ymm0, ymm0 - INSTR ymm13, ymm1, ymm1 - INSTR ymm14, ymm2, ymm2 - INSTR ymm15, ymm0, ymm0 - INSTR ymm16, ymm1, ymm1 - INSTR ymm17, ymm2, ymm2 - INSTR ymm18, ymm0, ymm0 - INSTR ymm19, ymm1, ymm1 - INSTR ymm20, ymm2, ymm2 - INSTR ymm21, ymm0, ymm0 - INSTR ymm22, ymm1, ymm1 - INSTR ymm23, ymm2, ymm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-avx-ymmymmymm.S b/testcases/vmulpd-avx-ymmymmymm.S deleted file mode 100644 index 00279d3..0000000 --- a/testcases/vmulpd-avx-ymmymmymm.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vmulpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmmxmmxmm-TP.S b/testcases/vmulsd-xmmxmmxmm-TP.S deleted file mode 100644 index 144dce2..0000000 --- a/testcases/vmulsd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmulsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmmxmmxmm.S b/testcases/vmulsd-xmmxmmxmm.S deleted file mode 100644 index 191e7b4..0000000 --- a/testcases/vmulsd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vmulsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubpd-avx-ymmymmymm-TP.S b/testcases/vsubpd-avx-ymmymmymm-TP.S deleted file mode 100644 index 2ec8183..0000000 --- a/testcases/vsubpd-avx-ymmymmymm-TP.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vsubpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm6, ymm0, ymm0 - INSTR ymm7, ymm1, ymm1 - INSTR ymm8, ymm2, ymm2 - INSTR ymm9, ymm0, ymm0 - INSTR ymm10, ymm1, ymm1 - INSTR ymm11, ymm2, ymm2 - INSTR ymm12, ymm0, ymm0 - INSTR ymm13, ymm1, ymm1 - INSTR ymm14, ymm2, ymm2 - INSTR ymm15, ymm0, ymm0 - INSTR ymm16, ymm1, ymm1 - INSTR ymm17, ymm2, ymm2 - INSTR ymm18, ymm0, ymm0 - INSTR ymm19, ymm1, ymm1 - INSTR ymm20, ymm2, ymm2 - INSTR ymm21, ymm0, ymm0 - INSTR ymm22, ymm1, ymm1 - INSTR ymm23, ymm2, ymm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubpd-avx-ymmymmymm.S b/testcases/vsubpd-avx-ymmymmymm.S deleted file mode 100644 index 4c803bd..0000000 --- a/testcases/vsubpd-avx-ymmymmymm.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vsubpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubsd-xmmxmmxmm-TP.S b/testcases/vsubsd-xmmxmmxmm-TP.S deleted file mode 100644 index c14a8fb..0000000 --- a/testcases/vsubsd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vsubsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubsd-xmmxmmxmm.S b/testcases/vsubsd-xmmxmmxmm.S deleted file mode 100644 index e9dad4c..0000000 --- a/testcases/vsubsd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vsubsd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmmxmmxmm-TP.S b/testcases/vunpckhpd-xmmxmmxmm-TP.S deleted file mode 100644 index 1f5cafe..0000000 --- a/testcases/vunpckhpd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vunpckhpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmmxmmxmm.S b/testcases/vunpckhpd-xmmxmmxmm.S deleted file mode 100644 index 7b4a197..0000000 --- a/testcases/vunpckhpd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vunpckhpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-avx-ymmymmymm-TP.S b/testcases/vxorpd-avx-ymmymmymm-TP.S deleted file mode 100644 index 9e7b830..0000000 --- a/testcases/vxorpd-avx-ymmymmymm-TP.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vxorpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm3, ymm0, ymm0 - INSTR ymm4, ymm1, ymm1 - INSTR ymm5, ymm2, ymm2 - INSTR ymm6, ymm0, ymm0 - INSTR ymm7, ymm1, ymm1 - INSTR ymm8, ymm2, ymm2 - INSTR ymm9, ymm0, ymm0 - INSTR ymm10, ymm1, ymm1 - INSTR ymm11, ymm2, ymm2 - INSTR ymm12, ymm0, ymm0 - INSTR ymm13, ymm1, ymm1 - INSTR ymm14, ymm2, ymm2 - INSTR ymm15, ymm0, ymm0 - INSTR ymm16, ymm1, ymm1 - INSTR ymm17, ymm2, ymm2 - INSTR ymm18, ymm0, ymm0 - INSTR ymm19, ymm1, ymm1 - INSTR ymm20, ymm2, ymm2 - INSTR ymm21, ymm0, ymm0 - INSTR ymm22, ymm1, ymm1 - INSTR ymm23, ymm2, ymm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-avx-ymmymmymm.S b/testcases/vxorpd-avx-ymmymmymm.S deleted file mode 100644 index a1f370d..0000000 --- a/testcases/vxorpd-avx-ymmymmymm.S +++ /dev/null @@ -1,67 +0,0 @@ -#define INSTR vxorpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # expand from SSE to AVX - vinsertf128 ymm0, ymm0, xmm0, 0x1 - # copy DP 1.0 - vmovaps ymm0, ymm0 - vmovaps ymm1, ymm0 - # Create DP 2.0 - vaddpd ymm1, ymm1, ymm1 - # Create DP 0.5 - vdivpd ymm2, ymm0, ymm1 -loop: - inc i - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - INSTR ymm0, ymm1, ymm0 - INSTR ymm1, ymm0, ymm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmmxmmxmm-TP.S b/testcases/vxorpd-xmmxmmxmm-TP.S deleted file mode 100644 index bbacc19..0000000 --- a/testcases/vxorpd-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vxorpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmmxmmxmm.S b/testcases/vxorpd-xmmxmmxmm.S deleted file mode 100644 index 8783f3c..0000000 --- a/testcases/vxorpd-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vxorpd -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorps-xmmxmmxmm-TP.S b/testcases/vxorps-xmmxmmxmm-TP.S deleted file mode 100644 index d8b097b..0000000 --- a/testcases/vxorps-xmmxmmxmm-TP.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vxorps -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm3, xmm0, xmm0 - INSTR xmm4, xmm1, xmm1 - INSTR xmm5, xmm2, xmm2 - INSTR xmm6, xmm0, xmm0 - INSTR xmm7, xmm1, xmm1 - INSTR xmm8, xmm2, xmm2 - INSTR xmm9, xmm0, xmm0 - INSTR xmm10, xmm1, xmm1 - INSTR xmm11, xmm2, xmm2 - INSTR xmm12, xmm0, xmm0 - INSTR xmm13, xmm1, xmm1 - INSTR xmm14, xmm2, xmm2 - INSTR xmm15, xmm0, xmm0 - INSTR xmm16, xmm1, xmm1 - INSTR xmm17, xmm2, xmm2 - INSTR xmm18, xmm0, xmm0 - INSTR xmm19, xmm1, xmm1 - INSTR xmm20, xmm2, xmm2 - INSTR xmm21, xmm0, xmm0 - INSTR xmm22, xmm1, xmm1 - INSTR xmm23, xmm2, xmm2 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorps-xmmxmmxmm.S b/testcases/vxorps-xmmxmmxmm.S deleted file mode 100644 index 2309d0c..0000000 --- a/testcases/vxorps-xmmxmmxmm.S +++ /dev/null @@ -1,65 +0,0 @@ -#define INSTR vxorps -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - # copy DP 1.0 - vmovaps xmm0, xmm0 - vmovaps xmm1, xmm0 - # Create DP 2.0 - vaddpd xmm1, xmm1, xmm1 - # Create DP 0.5 - vdivpd xmm2, xmm0, xmm1 -loop: - inc i - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - INSTR xmm0, xmm1, xmm0 - INSTR xmm1, xmm0, xmm0 - cmp i, N - jl loop -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/xor-rr-TP.S b/testcases/xor-rr-TP.S deleted file mode 100644 index caf6b9d..0000000 --- a/testcases/xor-rr-TP.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR xor -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR dl, al - INSTR r9l, bl - INSTR r10l, cl - INSTR dl, al - INSTR r9l, bl - INSTR r10l, cl - INSTR r11l, al - INSTR r12l, bl - INSTR r13l, cl - INSTR r14l, al - INSTR r15l, bl - INSTR al, cl - INSTR bl, al - INSTR cl, bl - INSTR dl, cl - INSTR r9l, al - INSTR r10l, bl - INSTR r11l, cl - INSTR r12l, al - INSTR r13l, bl - INSTR r14l, cl - INSTR r15l, al - INSTR al, bl - INSTR bl, cl - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file diff --git a/testcases/xor-rr.S b/testcases/xor-rr.S deleted file mode 100644 index 04f32e7..0000000 --- a/testcases/xor-rr.S +++ /dev/null @@ -1,100 +0,0 @@ -#define INSTR xor -#define NINST 24 -#define N edi -#define i r8d - - -.intel_syntax noprefix -.globl ninst -.data -ninst: -.long NINST -.text -.globl latency -.type latency, @function -.align 32 -latency: - push rbp - mov rbp, rsp - xor i, i - test N, N - jle done - # create DP 1.0 - vpcmpeqw xmm0, xmm0, xmm0 # all ones - vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) - vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero - push rax - push rbx - push rcx - push rdx - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - # copy DP 1.0 - vmovq rax, xmm0 - vmovq rbx, xmm0 - # Create DP 2.0 - add rbx, rax - # Create DP 0.5 - div rax - movq rcx, rax - vmovq rax, xmm0 -loop: - inc i - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - INSTR al, bl - INSTR bl, al - cmp i, N - jl loop - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop rdx - pop rcx - pop rbx - pop rax -done: - mov rsp, rbp - pop rbp - ret -.size latency, .-latency \ No newline at end of file