updated testcases

This commit is contained in:
Jan Laukemann
2017-09-23 17:52:10 +02:00
parent aa857af4e3
commit ca22c02691
90 changed files with 0 additions and 12018 deletions

View File

@@ -1,134 +0,0 @@
/*
 * Kernel: INSTR = add, form "add r32, [rip+PI]", with eight destination
 * registers (edx, r9d..r15d) used in rotation.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR add
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* 8 rounds x 8 destination registers = 64 INSTRs per pass */
        .rept   8
        .irp    dst, edx, r9d, r10d, r11d, r12d, r13d, r14d, r15d
        INSTR   \dst, [rip+PI]
        .endr
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,134 +0,0 @@
/*
 * Kernel: INSTR = add, form "add eax, [rip+PI]" repeated back to back, so
 * every add reads the eax value written by the one before it.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR add
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* NINST dependent adds into eax */
        .rept   NINST
        INSTR   eax, [rip+PI]
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = add, form "add r64, r64".  Destinations rotate through
 * rdx, r9..r15 while the source rotates through rax, rbx, rcx.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR add
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

/*
 * Emits eight INSTRs: destinations rdx, r9..r15 in order, sources cycling
 * through the three macro arguments.
 */
        .macro  group8 s0, s1, s2
        INSTR   rdx, \s0
        INSTR   r9, \s1
        INSTR   r10, \s2
        INSTR   r11, \s0
        INSTR   r12, \s1
        INSTR   r13, \s2
        INSTR   r14, \s0
        INSTR   r15, \s1
        .endm

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed the three source registers.  These are integer operations on
         * the 1.0 bit pattern, so the results are not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 2 x 24 + 16 = 64 INSTRs; the source rotation has period 24 */
        .rept   2
        group8  rax, rbx, rcx
        group8  rcx, rax, rbx
        group8  rbx, rcx, rax
        .endr
        group8  rax, rbx, rcx
        group8  rcx, rax, rbx
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = add, alternating "add rax, rbx" and "add rbx, rax", so
 * every add reads the register written by the one before it.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR add
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx.  These are integer operations on the 1.0
         * bit pattern, so the results are not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 32 pairs = 64 INSTRs per pass */
        .rept   32
        INSTR   rax, rbx
        INSTR   rbx, rax
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,134 +0,0 @@
/*
 * Kernel: INSTR = cmp, form "cmp r32, [rip+PI]", with eight first operands
 * (edx, r9d..r15d) used in rotation.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* 8 rounds x 8 registers = 64 INSTRs per pass */
        .rept   8
        .irp    dst, edx, r9d, r10d, r11d, r12d, r13d, r14d, r15d
        INSTR   \dst, [rip+PI]
        .endr
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,134 +0,0 @@
/*
 * Kernel: INSTR = cmp, form "cmp eax, [rip+PI]" repeated back to back with
 * identical operands.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* NINST identical compares */
        .rept   NINST
        INSTR   eax, [rip+PI]
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = cmp, form "cmp r32, r32".  First operands rotate through
 * edx, r9d..r15d while the second rotates through eax, ebx, ecx.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

/*
 * Emits eight INSTRs: first operands edx, r9d..r15d in order, second
 * operands cycling through the three macro arguments.
 */
        .macro  group8 s0, s1, s2
        INSTR   edx, \s0
        INSTR   r9d, \s1
        INSTR   r10d, \s2
        INSTR   r11d, \s0
        INSTR   r12d, \s1
        INSTR   r13d, \s2
        INSTR   r14d, \s0
        INSTR   r15d, \s1
        .endm

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx.  These are integer operations on the 1.0
         * bit pattern, so the results are not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 2 x 24 + 16 = 64 INSTRs; the operand rotation has period 24 */
        .rept   2
        group8  eax, ebx, ecx
        group8  ecx, eax, ebx
        group8  ebx, ecx, eax
        .endr
        group8  eax, ebx, ecx
        group8  ecx, eax, ebx
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = cmp, alternating "cmp eax, ebx" and "cmp ebx, eax".
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx.  These are integer operations on the 1.0
         * bit pattern, so the results are not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 32 pairs = 64 INSTRs per pass */
        .rept   32
        INSTR   eax, ebx
        INSTR   ebx, eax
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,170 +0,0 @@
/*
 * Kernel: INSTR = cmp, form "cmp r64, imm".  First operands rotate through
 * rdx, r9..r15 while the immediate rotates through 1, 2, 13.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 100 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 100
#define N edi
#define i r8d
.intel_syntax noprefix

/*
 * Emits eight INSTRs: first operands rdx, r9..r15 in order, immediates
 * cycling through the three macro arguments.
 */
        .macro  group8 v0, v1, v2
        INSTR   rdx, \v0
        INSTR   r9, \v1
        INSTR   r10, \v2
        INSTR   r11, \v0
        INSTR   r12, \v1
        INSTR   r13, \v2
        INSTR   r14, \v0
        INSTR   r15, \v1
        .endm

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* 4 x 24 + 4 = 100 INSTRs; the operand rotation has period 24 */
        .rept   4
        group8  1, 2, 13
        group8  13, 1, 2
        group8  2, 13, 1
        .endr
        INSTR   rdx, 1
        INSTR   r9, 2
        INSTR   r10, 13
        INSTR   r11, 1
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,170 +0,0 @@
/*
 * Kernel: INSTR = cmp, form "cmp rax, 1" repeated back to back with
 * identical operands.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 100 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR cmp
#define NINST 100
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).  Nothing below reads
         * xmm0 again.
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

loop:
        inc     i
        /* NINST identical compares */
        .rept   NINST
        INSTR   rax, 1
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = dec, form "dec r32", with eight operand registers
 * (edx, r9d..r15d) used in rotation.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR dec
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx (the loop below uses none of them).  These
         * are integer operations on the 1.0 bit pattern, so the results are
         * not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 8 rounds x 8 registers = 64 INSTRs per pass */
        .rept   8
        .irp    dst, edx, r9d, r10d, r11d, r12d, r13d, r14d, r15d
        INSTR   \dst
        .endr
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = dec, form "dec eax" repeated back to back, so every dec
 * reads the eax value written by the one before it.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR dec
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx (the loop below only touches eax).  These
         * are integer operations on the 1.0 bit pattern, so the results are
         * not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* NINST dependent decrements of eax */
        .rept   NINST
        INSTR   eax
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = inc, form "inc r32", with eight operand registers
 * (edx, r9d..r15d) used in rotation.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR inc
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx (the loop below uses none of them).  These
         * are integer operations on the 1.0 bit pattern, so the results are
         * not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* 8 rounds x 8 registers = 64 INSTRs per pass */
        .rept   8
        .irp    dst, edx, r9d, r10d, r11d, r12d, r13d, r14d, r15d
        INSTR   \dst
        .endr
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
/*
 * Kernel: INSTR = inc, form "inc eax" repeated back to back, so every inc
 * reads the eax value written by the one before it.
 *
 * latency takes the pass count N in edi (compared as signed).  For N <= 0 it
 * returns straight away; otherwise it executes the unrolled block N times,
 * counting passes in r8d (i).  One pass issues NINST = 64 INSTRs, and the
 * exported 32-bit word ninst holds that number.
 *
 * rbp, rax, rbx, rcx, rdx and r9..r15 are saved and restored; r8d, xmm0 and
 * the flags are left modified.
 */
#define INSTR inc
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

        .data
        .globl  ninst
ninst:
        .long   NINST
        .align  32
/* Eight copies of the qword 0x400921f9f01b866e (low dword stored first). */
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    /* pass counter starts at zero */
        test    N, N
        jle     done                    /* N <= 0: skip the whole kernel */

        /*
         * xmm0 = bit pattern of double 1.0 in each qword: all ones, shifted
         * left by 54 (ten one bits stay on top), then right by 2, which
         * leaves bits 61..52 set (0x3FF0000000000000).
         */
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        /* Save the working registers, then clear them. */
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        push    \reg
        .endr
        .irp    reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
        xor     \reg, \reg
        .endr

        /*
         * Seed rax, rbx and rcx (the loop below only touches eax).  These
         * are integer operations on the 1.0 bit pattern, so the results are
         * not the doubles 2.0 / 0.5.
         */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 */
        vmovq   rbx, xmm0
        add     rbx, rax                /* rbx = 0x7FE0000000000000 */
        div     rax                     /* rdx is 0 here: rax = 1, rdx = 0 */
        movq    rcx, rax                /* rcx = 1 */
        vmovq   rax, xmm0               /* rax = 0x3FF0000000000000 again */

loop:
        inc     i
        /* NINST dependent increments of eax */
        .rept   NINST
        INSTR   eax
        .endr
        cmp     i, N
        jl      loop

        /* Restore in reverse push order. */
        .irp    reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
        pop     \reg
        .endr
done:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (lea), each
# taking the rip-relative address of PI into a 32-bit register. Destinations
# rotate over edx and r9d-r15d, and lea reads none of them, so the copies
# are independent of one another.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR lea
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        # Eight rounds over eight destination registers.
        .rept   8
        INSTR   edx, [rip+PI]
        INSTR   r9d, [rip+PI]
        INSTR   r10d, [rip+PI]
        INSTR   r11d, [rip+PI]
        INSTR   r12d, [rip+PI]
        INSTR   r13d, [rip+PI]
        INSTR   r14d, [rip+PI]
        INSTR   r15d, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (lea), all
# taking the rip-relative address of PI into eax. lea reads no
# general-purpose register here, so no copy consumes an earlier result.
# NOTE(review): the routine is named latency although the copies are not
# data-dependent on each other -- confirm that this is intended.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR lea
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        .rept   NINST
        INSTR   eax, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (lea), each
# taking the rip-relative address of PI into a 64-bit register. Destinations
# rotate over rdx and r9-r15, and lea reads none of them, so the copies are
# independent of one another.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR lea
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        # Eight rounds over eight destination registers.
        .rept   8
        INSTR   rdx, [rip+PI]
        INSTR   r9, [rip+PI]
        INSTR   r10, [rip+PI]
        INSTR   r11, [rip+PI]
        INSTR   r12, [rip+PI]
        INSTR   r13, [rip+PI]
        INSTR   r14, [rip+PI]
        INSTR   r15, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (lea), all
# taking the rip-relative address of PI into rax. lea reads no
# general-purpose register here, so no copy consumes an earlier result.
# NOTE(review): the routine is named latency although the copies are not
# data-dependent on each other -- confirm that this is intended.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR lea
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        .rept   NINST
        INSTR   rax, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), each
# a 4-byte store to the first dword of PI. The source register rotates over
# eax, ebx and ecx.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags, and the first dword of PI. rax, rbx, rcx, rdx
#           and r9-r15 are pushed before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. The kernel overwrites the first dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        # Three sources per round: 21 rounds plus one extra store = 64 copies.
        .rept   21
        INSTR   [rip+PI], eax
        INSTR   [rip+PI], ebx
        INSTR   [rip+PI], ecx
        .endr
        INSTR   [rip+PI], eax
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), each
# a 4-byte store of eax to the first dword of PI.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags, and the first dword of PI. rax, rbx, rcx, rdx
#           and r9-r15 are pushed before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. The kernel overwrites the first dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        .rept   NINST
        INSTR   [rip+PI], eax
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), each
# an 8-byte store to the first qword of PI. The source register rotates over
# rax, rbx and rcx.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags, and the first qword of PI. rax, rbx, rcx, rdx
#           and r9-r15 are pushed before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. The kernel overwrites the first qword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        # Three sources per round: 21 rounds plus one extra store = 64 copies.
        .rept   21
        INSTR   [rip+PI], rax
        INSTR   [rip+PI], rbx
        INSTR   [rip+PI], rcx
        .endr
        INSTR   [rip+PI], rax
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), each
# an 8-byte store of rax to the first qword of PI.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags, and the first qword of PI. rax, rbx, rcx, rdx
#           and r9-r15 are pushed before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. The kernel overwrites the first qword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        .rept   NINST
        INSTR   [rip+PI], rax
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), each
# loading the first dword of PI into a 32-bit register. Destinations rotate
# over edx and r9d-r15d; a load reads none of them, so the copies are
# independent of one another.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        # Eight rounds over eight destination registers.
        .rept   8
        INSTR   edx, [rip+PI]
        INSTR   r9d, [rip+PI]
        INSTR   r10d, [rip+PI]
        INSTR   r11d, [rip+PI]
        INSTR   r12d, [rip+PI]
        INSTR   r13d, [rip+PI]
        INSTR   r14d, [rip+PI]
        INSTR   r15d, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov), all
# loading the first dword of PI into eax. The load address is rip-relative,
# so no copy consumes the result of an earlier one.
# NOTE(review): the routine is named latency although the copies are not
# data-dependent on each other -- confirm that this is intended.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        # xmm0 is not read again in this routine.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
.Lloop:
        inc     i
        .rept   NINST
        INSTR   eax, [rip+PI]
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,207 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov)
# between 32-bit registers. Destinations rotate over edx and r9d-r15d,
# sources rotate over eax, ebx and ecx. No destination is ever used as a
# source, so the copies are independent of one another.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR mov
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. Not referenced by this kernel.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        # Eight destinations against three sources repeat every 24 copies.
        # Five full periods plus the first eight copies of a sixth give 128.
        .rept   5
        INSTR   edx, eax
        INSTR   r9d, ebx
        INSTR   r10d, ecx
        INSTR   r11d, eax
        INSTR   r12d, ebx
        INSTR   r13d, ecx
        INSTR   r14d, eax
        INSTR   r15d, ebx
        INSTR   edx, ecx
        INSTR   r9d, eax
        INSTR   r10d, ebx
        INSTR   r11d, ecx
        INSTR   r12d, eax
        INSTR   r13d, ebx
        INSTR   r14d, ecx
        INSTR   r15d, eax
        INSTR   edx, ebx
        INSTR   r9d, ecx
        INSTR   r10d, eax
        INSTR   r11d, ebx
        INSTR   r12d, ecx
        INSTR   r13d, eax
        INSTR   r14d, ebx
        INSTR   r15d, ecx
        .endr
        INSTR   edx, eax
        INSTR   r9d, ebx
        INSTR   r10d, ecx
        INSTR   r11d, eax
        INSTR   r12d, ebx
        INSTR   r13d, ecx
        INSTR   r14d, eax
        INSTR   r15d, ebx
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,207 +0,0 @@
# Benchmark kernel: each loop pass executes NINST copies of INSTR (mov)
# that alternate between eax <- ebx and ebx <- eax. Each copy reads the
# register written by the copy before it, so the copies form one serial
# dependency chain.
#
# C view:   extern int ninst;  void latency(int N);
# ABI:      N is read from edi, which matches System V AMD64 -- TODO confirm.
# Input:    N = number of loop passes; N <= 0 returns without looping.
# Output:   none; rax is restored to its entry value.
# Clobbers: r8, xmm0, flags. rax, rbx, rcx, rdx and r9-r15 are pushed
#           before the loop and popped after it.
#define INSTR mov
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
        .long   NINST                   # instruction copies per loop pass
        .align  32
# Eight copies of the double 0x400921f9f01b866e (about 3.14159), stored as
# low dword then high dword. Not referenced by this kernel.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr
.text
.globl latency
.type latency, @function
.align 32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                    # loop counter = 0
        test    N, N
        jle     .Ldone                  # nothing to do for a count <= 0
        # Build the bit pattern of the IEEE-754 double 1.0 in xmm0.
        vpcmpeqw xmm0, xmm0, xmm0       # all bits set
        vpsllq  xmm0, xmm0, 54          # ten one-bits remain on top (64 - 54 = 10)
        vpsrlq  xmm0, xmm0, 2           # each qword = 0x3FF0000000000000
        # Preserve the general-purpose registers used as operands.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        # Clear them.
        xor     rax, rax
        xor     rbx, rbx
        xor     rcx, rcx
        xor     rdx, rdx
        xor     r9, r9
        xor     r10, r10
        xor     r11, r11
        xor     r12, r12
        xor     r13, r13
        xor     r14, r14
        xor     r15, r15
        # Seed the operand registers with integer operations on the bit pattern
        # of 1.0 (this is not floating-point arithmetic).
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000
        vmovq   rbx, xmm0               # rbx = 0x3FF0000000000000
        add     rbx, rax                # rbx = 0x7FE0000000000000
        div     rax                     # rdx:rax / rax with rdx = 0: rax = 1, rdx = 0
        movq    rcx, rax                # rcx = 1
        vmovq   rax, xmm0               # rax = 0x3FF0000000000000 again
.Lloop:
        inc     i
        # Sixty-four ping-pong pairs = 128 copies.
        .rept   64
        INSTR   eax, ebx
        INSTR   ebx, eax
        .endr
        cmp     i, N
        jl      .Lloop
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
.size latency, .-latency
        # Tell the linker this object does not need an executable stack.
        .section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass, each loading a
# qword from the PI table into one of eight rotating 64-bit destinations
# (rdx, r9-r15). A plain load does not read its destination, so adjacent
# INSTRs carry no data dependency on each other.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST % 8
#error NINST must be a multiple of 8 because the body is emitted in groups of 8
#endif
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    # This kernel never reads xmm0 afterwards.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
loop:
    inc i
    .rept NINST >> 3              # one group = eight INSTRs, one per destination
    .irp dst, rdx, r9, r10, r11, r12, r13, r14, r15
    INSTR \dst, [rip+PI]
    .endr
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
#define INSTR mov
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST back-to-back copies of INSTR per loop pass,
# each loading a qword from the PI table into rax.
# NOTE(review): a load only writes rax, so successive INSTRs are not linked by
# a true data dependency - confirm this is the intended measurement.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    # This kernel never reads xmm0 afterwards.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
loop:
    inc i
    .rept NINST                   # unroll count follows the exported ninst
    INSTR rax, [rip+PI]
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
#define INSTR movslq
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass, each reading the
# PI table through a RIP-relative operand and writing one of eight rotating
# 64-bit destinations (rdx, r9-r15), so adjacent INSTRs target different
# registers.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST % 8
#error NINST must be a multiple of 8 because the body is emitted in groups of 8
#endif
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    # This kernel never reads xmm0 afterwards.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
loop:
    inc i
    .rept NINST >> 3              # one group = eight INSTRs, one per destination
    .irp dst, rdx, r9, r10, r11, r12, r13, r14, r15
    INSTR \dst, [rip+PI]
    .endr
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,134 +0,0 @@
#define INSTR movslq
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST back-to-back copies of INSTR per loop pass,
# each reading the PI table through a RIP-relative operand and writing rax.
# NOTE(review): the memory form only writes rax, so successive INSTRs are not
# linked by a true data dependency - confirm this is the intended measurement.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    # This kernel never reads xmm0 afterwards.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
loop:
    inc i
    .rept NINST                   # unroll count follows the exported ninst
    INSTR rax, [rip+PI]
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR movslq
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass. Destinations
# rotate over eight 64-bit registers (rdx, r9-r15) and sources rotate over
# eax, ebx, ecx, so adjacent INSTRs write different registers and the sources
# are never modified inside the loop.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST != 64
#error the unrolled body emits exactly 64 INSTRs and must be updated together with NINST
#endif
# Eight INSTRs, one per destination; the three sources repeat s0, s1, s2.
.macro TP_GROUP s0, s1, s2
    INSTR rdx, \s0
    INSTR r9, \s1
    INSTR r10, \s2
    INSTR r11, \s0
    INSTR r12, \s1
    INSTR r13, \s2
    INSTR r14, \s0
    INSTR r15, \s1
.endm
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    # Source rotation has period 3 and destination rotation period 8, so the
    # pattern repeats every 24 INSTRs: 64 = 2 * 24 + 16.
    .rept 2
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    TP_GROUP ebx, ecx, eax
    .endr
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR movslq
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST back-to-back copies of INSTR rax, eax per loop
# pass. Each INSTR reads the register the previous one wrote, so the body is
# one serial dependency chain.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST                   # unroll count follows the exported ninst
    INSTR rax, eax
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,207 +0,0 @@
#define INSTR movzbl
#define NINST 128
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass. Destinations
# rotate over eight 32-bit registers (edx, r9d-r15d) and sources rotate over
# the byte registers al, bl, cl, so adjacent INSTRs write different registers
# and the sources are never modified inside the loop.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST != 128
#error the unrolled body emits exactly 128 INSTRs and must be updated together with NINST
#endif
# Eight INSTRs, one per destination; the three sources repeat s0, s1, s2.
.macro TP_GROUP s0, s1, s2
    INSTR edx, \s0
    INSTR r9d, \s1
    INSTR r10d, \s2
    INSTR r11d, \s0
    INSTR r12d, \s1
    INSTR r13d, \s2
    INSTR r14d, \s0
    INSTR r15d, \s1
.endm
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    # Source rotation has period 3 and destination rotation period 8, so the
    # pattern repeats every 24 INSTRs: 128 = 5 * 24 + 8.
    .rept 5
    TP_GROUP al, bl, cl
    TP_GROUP cl, al, bl
    TP_GROUP bl, cl, al
    .endr
    TP_GROUP al, bl, cl
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,207 +0,0 @@
#define INSTR movzbl
#define NINST 128
#define N edi
#define i r8d
# Microbenchmark kernel: NINST back-to-back copies of INSTR eax, al per loop
# pass. Each INSTR reads the register the previous one wrote, so the body is
# one serial dependency chain.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST                   # unroll count follows the exported ninst
    INSTR eax, al
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR neg
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of the one-operand INSTR per loop pass,
# rotating over eight 32-bit registers (edx, r9d-r15d). Each register forms
# its own chain, so adjacent INSTRs do not depend on each other.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST % 8
#error NINST must be a multiple of 8 because the body is emitted in groups of 8
#endif
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed rax, rbx, rcx from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic. The loop
    # body below does not touch these three registers.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST >> 3              # one group = eight INSTRs, one per register
    .irp dst, edx, r9d, r10d, r11d, r12d, r13d, r14d, r15d
    INSTR \dst
    .endr
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR neg
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST back-to-back copies of INSTR eax per loop pass.
# Each INSTR reads and rewrites eax, so the body is one serial dependency
# chain.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed rax, rbx, rcx from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST                   # unroll count follows the exported ninst
    INSTR eax
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR sub
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass. Destinations
# rotate over eight 32-bit registers (edx, r9d-r15d) and sources rotate over
# eax, ebx, ecx. Each destination forms its own chain, so adjacent INSTRs do
# not depend on each other; the sources are never modified inside the loop.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST != 64
#error the unrolled body emits exactly 64 INSTRs and must be updated together with NINST
#endif
# Eight INSTRs, one per destination; the three sources repeat s0, s1, s2.
.macro TP_GROUP s0, s1, s2
    INSTR edx, \s0
    INSTR r9d, \s1
    INSTR r10d, \s2
    INSTR r11d, \s0
    INSTR r12d, \s1
    INSTR r13d, \s2
    INSTR r14d, \s0
    INSTR r15d, \s1
.endm
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    # Source rotation has period 3 and destination rotation period 8, so the
    # pattern repeats every 24 INSTRs: 64 = 2 * 24 + 16.
    .rept 2
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    TP_GROUP ebx, ecx, eax
    .endr
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR sub
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass, alternating
# INSTR eax, ebx and INSTR ebx, eax. Each INSTR consumes the register the
# previous one wrote, so the body is one serial dependency chain.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST % 2
#error NINST must be even because the body is emitted in pairs
#endif
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed rax, rbx, rcx from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST >> 1              # one pair per repetition
    INSTR eax, ebx
    INSTR ebx, eax
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR test
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass. The first
# operand rotates over eight 32-bit registers (edx, r9d-r15d) and the second
# over eax, ebx, ecx. INSTR only updates the flags, so no general-purpose
# register changes inside the loop body.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST != 64
#error the unrolled body emits exactly 64 INSTRs and must be updated together with NINST
#endif
# Eight INSTRs, one per first operand; the second operands repeat s0, s1, s2.
.macro TP_GROUP s0, s1, s2
    INSTR edx, \s0
    INSTR r9d, \s1
    INSTR r10d, \s2
    INSTR r11d, \s0
    INSTR r12d, \s1
    INSTR r13d, \s2
    INSTR r14d, \s0
    INSTR r15d, \s1
.endm
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed the source registers from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    # Second-operand rotation has period 3 and first-operand rotation period 8,
    # so the pattern repeats every 24 INSTRs: 64 = 2 * 24 + 16.
    .rept 2
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    TP_GROUP ebx, ecx, eax
    .endr
    TP_GROUP eax, ebx, ecx
    TP_GROUP ecx, eax, ebx
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,143 +0,0 @@
#define INSTR test
#define NINST 64
#define N edi
#define i r8d
# Microbenchmark kernel: NINST copies of INSTR per loop pass, alternating
# INSTR eax, ebx and INSTR ebx, eax.
# NOTE(review): INSTR only updates the flags, so eax and ebx never change and
# successive INSTRs are not chained through a register - confirm this is the
# intended measurement.
#
# Exported symbols:
#   ninst   - 32-bit word holding NINST (INSTR count per loop pass)
#   latency - runs the unrolled body N times; N is read from edi
#             (presumably the first integer argument - confirm with the caller).
#             Returns at once when N <= 0. Every pushed register is restored;
#             r8, xmm0 and the flags are overwritten.
#if NINST % 2
#error NINST must be even because the body is emitted in pairs
#endif
.intel_syntax noprefix
.globl ninst
.data
ninst:
    .long NINST
.align 32
# PI table: the qword 0x400921f9f01b866e stored eight times (low dword first).
# Not referenced by this kernel.
PI:
    .long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
    push rbp
    mov rbp, rsp
    xor i, i                      # pass counter starts at zero
    test N, N
    jle done                      # N <= 0: nothing to run and nothing pushed yet
    # Build DP 1.0 (0x3FF0000000000000) in both qword lanes of xmm0.
    vpcmpeqw xmm0, xmm0, xmm0     # all ones
    vpsllq xmm0, xmm0, 54         # keep the top 10 bits: 0xFFC0... (54 = 64 - 10)
    vpsrlq xmm0, xmm0, 2          # clear sign and top exponent bit: 0x3FF0...
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    push \reg                     # restored in reverse order after the loop
    .endr
    .irp reg, rax, rbx, rcx, rdx, r9, r10, r11, r12, r13, r14, r15
    xor \reg, \reg                # every working register starts at zero
    .endr
    # Seed rax, rbx, rcx from the DP 1.0 bit pattern. These are integer
    # operations on that pattern, not floating-point arithmetic.
    vmovq rax, xmm0               # rax = 0x3FF0000000000000
    vmovq rbx, xmm0
    add rbx, rax                  # rbx = 0x7FE0000000000000 (pattern added to itself)
    div rax                       # rdx:rax / rax with rdx zeroed above: rax = 1, rdx = 0
    movq rcx, rax                 # rcx = 1
    vmovq rax, xmm0               # rax = 0x3FF0000000000000 again
loop:
    inc i
    .rept NINST >> 1              # one pair per repetition
    INSTR eax, ebx
    INSTR ebx, eax
    .endr
    cmp i, N
    jl loop                       # signed compare, matching the jle guard above
    .irp reg, r15, r14, r13, r12, r11, r10, r9, rdx, rcx, rbx, rax
    pop \reg
    .endr
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency
# Declare that this object does not need an executable stack.
.section .note.GNU-stack,"",@progbits

View File

@@ -1,108 +0,0 @@
# Independent-destination kernel for the three-operand xmm form of vaddpd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations, reading xmm0, xmm1
# and xmm2 in turn.
.macro tp3 da, db, dc
INSTR \da, xmm0, xmm0
INSTR \db, xmm1, xmm1
INSTR \dc, xmm2, xmm2
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# xmm3..xmm15 (period 13) and the source rotation xmm0..xmm2 (period 3) both
# start over.
.macro rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
tp3 xmm14, xmm15, xmm3
tp3 xmm4, xmm5, xmm6
tp3 xmm7, xmm8, xmm9
tp3 xmm10, xmm11, xmm12
tp3 xmm13, xmm14, xmm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (sources are
# xmm0..xmm2, destinations are xmm3..xmm15), so the copies are independent
# of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 39 + 8 * 3 + 1 = 64 copies, in the same order as the fully unrolled listing.
rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
INSTR xmm14, xmm0, xmm0
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Dependency-chain kernel for the three-operand xmm form of vaddpd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The two forms in the body alternate, each one reading the
# register the previous one wrote, so all copies form one serial chain.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
.endr
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
# Independent-destination kernel for the three-operand ymm form of vaddpd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations, reading ymm0, ymm1
# and ymm2 in turn.
.macro tp3 da, db, dc
INSTR \da, ymm0, ymm0
INSTR \db, ymm1, ymm1
INSTR \dc, ymm2, ymm2
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# ymm3..ymm15 (period 13) and the source rotation ymm0..ymm2 (period 3) both
# start over.
.macro rot39
tp3 ymm3, ymm4, ymm5
tp3 ymm6, ymm7, ymm8
tp3 ymm9, ymm10, ymm11
tp3 ymm12, ymm13, ymm14
tp3 ymm15, ymm3, ymm4
tp3 ymm5, ymm6, ymm7
tp3 ymm8, ymm9, ymm10
tp3 ymm11, ymm12, ymm13
tp3 ymm14, ymm15, ymm3
tp3 ymm4, ymm5, ymm6
tp3 ymm7, ymm8, ymm9
tp3 ymm10, ymm11, ymm12
tp3 ymm13, ymm14, ymm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (sources are
# ymm0..ymm2, destinations are ymm3..ymm15), so the copies are independent
# of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
# Copy the low 128 bits into the high half: ymm0 = four lanes of 1.0.
vinsertf128 ymm0, ymm0, xmm0, 0x1
vmovaps ymm0, ymm0 # self-move, value unchanged
vmovaps ymm1, ymm0
vaddpd ymm1, ymm1, ymm1 # ymm1 = 1.0 + 1.0 = 2.0
vdivpd ymm2, ymm0, ymm1 # ymm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 39 + 8 * 3 + 1 = 64 copies, in the same order as the fully unrolled listing.
rot39
tp3 ymm3, ymm4, ymm5
tp3 ymm6, ymm7, ymm8
tp3 ymm9, ymm10, ymm11
tp3 ymm12, ymm13, ymm14
tp3 ymm15, ymm3, ymm4
tp3 ymm5, ymm6, ymm7
tp3 ymm8, ymm9, ymm10
tp3 ymm11, ymm12, ymm13
INSTR ymm14, ymm0, ymm0
cmp i, N
jl loop
# Clear the upper ymm halves before returning, so that legacy SSE code in the
# caller does not pay an AVX to SSE transition penalty. Placed before the done
# label so the early-exit path, which ran no AVX code, is left unchanged.
vzeroupper
done:
leave
ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
# Dependency-chain kernel for the three-operand ymm form of vaddpd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The two forms in the body alternate, each one reading the
# register the previous one wrote, so all copies form one serial chain.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
# Copy the low 128 bits into the high half: ymm0 = four lanes of 1.0.
vinsertf128 ymm0, ymm0, xmm0, 0x1
vmovaps ymm0, ymm0 # self-move, value unchanged
vmovaps ymm1, ymm0
vaddpd ymm1, ymm1, ymm1 # ymm1 = 1.0 + 1.0 = 2.0
vdivpd ymm2, ymm0, ymm1 # ymm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
.endr
cmp i, N
jl loop
# Clear the upper ymm halves before returning, so that legacy SSE code in the
# caller does not pay an AVX to SSE transition penalty. Placed before the done
# label so the early-exit path, which ran no AVX code, is left unchanged.
vzeroupper
done:
leave
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Independent-destination kernel for vaddsd with a memory second source.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# The loop reads the data at this label as its memory operand.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations; the register source
# is xmm0, xmm1 and xmm2 in turn, the memory source is always PI.
.macro tp3 da, db, dc
INSTR \da, xmm0, [rip+PI]
INSTR \db, xmm1, [rip+PI]
INSTR \dc, xmm2, [rip+PI]
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# xmm3..xmm15 (period 13) and the source rotation xmm0..xmm2 (period 3) both
# start over.
.macro rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
tp3 xmm14, xmm15, xmm3
tp3 xmm4, xmm5, xmm6
tp3 xmm7, xmm8, xmm9
tp3 xmm10, xmm11, xmm12
tp3 xmm13, xmm14, xmm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (register
# sources are xmm0..xmm2, destinations are xmm3..xmm15), so the copies are
# independent of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 39 + 8 * 3 + 1 = 64 copies, in the same order as the fully unrolled listing.
rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
INSTR xmm14, xmm0, [rip+PI]
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Dependency-chain kernel for vaddsd with a memory second source.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# The loop reads the data at this label as its memory operand.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The two forms in the body alternate, each one taking as its
# register source the register the previous one wrote, so all copies form one
# serial chain through xmm0 and xmm1.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
.endr
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Independent-destination kernel for the register-only form of vaddsd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations, reading xmm0, xmm1
# and xmm2 in turn.
.macro tp3 da, db, dc
INSTR \da, xmm0, xmm0
INSTR \db, xmm1, xmm1
INSTR \dc, xmm2, xmm2
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# xmm3..xmm15 (period 13) and the source rotation xmm0..xmm2 (period 3) both
# start over.
.macro rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
tp3 xmm14, xmm15, xmm3
tp3 xmm4, xmm5, xmm6
tp3 xmm7, xmm8, xmm9
tp3 xmm10, xmm11, xmm12
tp3 xmm13, xmm14, xmm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (sources are
# xmm0..xmm2, destinations are xmm3..xmm15), so the copies are independent
# of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 39 + 8 * 3 + 1 = 64 copies, in the same order as the fully unrolled listing.
rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
INSTR xmm14, xmm0, xmm0
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Dependency-chain kernel for the register-only form of vaddsd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vaddsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The two forms in the body alternate, each one reading the
# register the previous one wrote, so all copies form one serial chain.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
.endr
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,172 +0,0 @@
# Independent-destination kernel for the register-to-register xmm form of vmovapd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vmovapd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations, reading xmm0, xmm1
# and xmm2 in turn.
.macro tp3 da, db, dc
INSTR \da, xmm0
INSTR \db, xmm1
INSTR \dc, xmm2
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# xmm3..xmm15 (period 13) and the source rotation xmm0..xmm2 (period 3) both
# start over.
.macro rot39
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
tp3 xmm12, xmm13, xmm14
tp3 xmm15, xmm3, xmm4
tp3 xmm5, xmm6, xmm7
tp3 xmm8, xmm9, xmm10
tp3 xmm11, xmm12, xmm13
tp3 xmm14, xmm15, xmm3
tp3 xmm4, xmm5, xmm6
tp3 xmm7, xmm8, xmm9
tp3 xmm10, xmm11, xmm12
tp3 xmm13, xmm14, xmm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (sources are
# xmm0..xmm2, destinations are xmm3..xmm15), so the copies are independent
# of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 3 * 39 + 3 * 3 + 2 = 128 copies, in the same order as the fully unrolled listing.
.rept 3
rot39
.endr
tp3 xmm3, xmm4, xmm5
tp3 xmm6, xmm7, xmm8
tp3 xmm9, xmm10, xmm11
INSTR xmm12, xmm0
INSTR xmm13, xmm1
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,172 +0,0 @@
# Dependency-chain kernel for the register-to-register xmm form of vmovapd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vmovapd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body copies xmm1 to xmm0 and xmm0 back to xmm1 over and
# over, so each copy reads the register the previous one wrote.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
vmovaps xmm0, xmm0 # self-move, value unchanged
vmovaps xmm1, xmm0
vaddpd xmm1, xmm1, xmm1 # xmm1 = 1.0 + 1.0 = 2.0
vdivpd xmm2, xmm0, xmm1 # xmm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR xmm0, xmm1
INSTR xmm1, xmm0
.endr
cmp i, N
jl loop
done:
leave
ret
.size latency, .-latency

View File

@@ -1,174 +0,0 @@
# Independent-destination kernel for the register-to-register ymm form of vmovapd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vmovapd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
# tp3: three copies of INSTR with the given destinations, reading ymm0, ymm1
# and ymm2 in turn.
.macro tp3 da, db, dc
INSTR \da, ymm0
INSTR \db, ymm1
INSTR \dc, ymm2
.endm
# rot39: 13 tp3 groups = 39 copies. After 39 copies the destination rotation
# ymm3..ymm15 (period 13) and the source rotation ymm0..ymm2 (period 3) both
# start over.
.macro rot39
tp3 ymm3, ymm4, ymm5
tp3 ymm6, ymm7, ymm8
tp3 ymm9, ymm10, ymm11
tp3 ymm12, ymm13, ymm14
tp3 ymm15, ymm3, ymm4
tp3 ymm5, ymm6, ymm7
tp3 ymm8, ymm9, ymm10
tp3 ymm11, ymm12, ymm13
tp3 ymm14, ymm15, ymm3
tp3 ymm4, ymm5, ymm6
tp3 ymm7, ymm8, ymm9
tp3 ymm10, ymm11, ymm12
tp3 ymm13, ymm14, ymm15
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body never reads a register that it writes (sources are
# ymm0..ymm2, destinations are ymm3..ymm15), so the copies are independent
# of each other.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
# Copy the low 128 bits into the high half: ymm0 = four lanes of 1.0.
vinsertf128 ymm0, ymm0, xmm0, 0x1
vmovaps ymm0, ymm0 # self-move, value unchanged
vmovaps ymm1, ymm0
vaddpd ymm1, ymm1, ymm1 # ymm1 = 1.0 + 1.0 = 2.0
vdivpd ymm2, ymm0, ymm1 # ymm2 = 1.0 / 2.0 = 0.5
loop:
inc i
# 3 * 39 + 3 * 3 + 2 = 128 copies, in the same order as the fully unrolled listing.
.rept 3
rot39
.endr
tp3 ymm3, ymm4, ymm5
tp3 ymm6, ymm7, ymm8
tp3 ymm9, ymm10, ymm11
INSTR ymm12, ymm0
INSTR ymm13, ymm1
cmp i, N
jl loop
# Clear the upper ymm halves before returning, so that legacy SSE code in the
# caller does not pay an AVX to SSE transition penalty. Placed before the done
# label so the early-exit path, which ran no AVX code, is left unchanged.
vzeroupper
done:
leave
ret
.size latency, .-latency

View File

@@ -1,174 +0,0 @@
# Dependency-chain kernel for the register-to-register ymm form of vmovapd.
# Exports ninst (32-bit count of unrolled copies per loop pass) and latency
# (entry point). The pass count is read from edi, the pass counter is r8d.
#define INSTR vmovapd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (approximately 3.14159).
# Nothing in this file reads them.
PI:
.rept 8
.long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the loop body until the counter reaches N; returns at once
# when N <= 0. The body copies ymm1 to ymm0 and ymm0 back to ymm1 over and
# over, so each copy reads the register the previous one wrote.
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# xmm0 = DP 1.0 in both lanes, built without a memory load.
vpcmpeqw xmm0, xmm0, xmm0 # every bit set
vpsllq xmm0, xmm0, 54 # 54 = 64 - 10: only the top ten bits stay set
vpsrlq xmm0, xmm0, 2 # shift them down to bits 61..52 = 0x3FF0000000000000
# Copy the low 128 bits into the high half: ymm0 = four lanes of 1.0.
vinsertf128 ymm0, ymm0, xmm0, 0x1
vmovaps ymm0, ymm0 # self-move, value unchanged
vmovaps ymm1, ymm0
vaddpd ymm1, ymm1, ymm1 # ymm1 = 1.0 + 1.0 = 2.0
vdivpd ymm2, ymm0, ymm1 # ymm2 = 0.5; the loop below does not use it
loop:
inc i
# One pair per repetition, so half of NINST repetitions give NINST copies.
.rept NINST >> 1
INSTR ymm0, ymm1
INSTR ymm1, ymm0
.endr
cmp i, N
jl loop
# Clear the upper ymm halves before returning, so that legacy SSE code in the
# caller does not pay an AVX to SSE transition penalty. Placed before the done
# label so the early-exit path, which ran no AVX code, is left unchanged.
vzeroupper
done:
leave
ret
.size latency, .-latency

View File

@@ -1,172 +0,0 @@
# Microbenchmark kernel for INSTR = vmovaps in its register-to-register form.
# Exports ninst (count of INSTR copies per loop pass, NINST = 128) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovaps
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# This kernel does not reference PI.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. Destinations rotate through xmm3-xmm15 and
# sources through xmm0-xmm2, so no copy reads a register written inside the loop.
loop:
inc i
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
INSTR xmm14, xmm2
INSTR xmm15, xmm0
INSTR xmm3, xmm1
INSTR xmm4, xmm2
INSTR xmm5, xmm0
INSTR xmm6, xmm1
INSTR xmm7, xmm2
INSTR xmm8, xmm0
INSTR xmm9, xmm1
INSTR xmm10, xmm2
INSTR xmm11, xmm0
INSTR xmm12, xmm1
INSTR xmm13, xmm2
INSTR xmm14, xmm0
INSTR xmm15, xmm1
INSTR xmm3, xmm2
INSTR xmm4, xmm0
INSTR xmm5, xmm1
INSTR xmm6, xmm2
INSTR xmm7, xmm0
INSTR xmm8, xmm1
INSTR xmm9, xmm2
INSTR xmm10, xmm0
INSTR xmm11, xmm1
INSTR xmm12, xmm2
INSTR xmm13, xmm0
INSTR xmm14, xmm1
INSTR xmm15, xmm2
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
INSTR xmm14, xmm2
INSTR xmm15, xmm0
INSTR xmm3, xmm1
INSTR xmm4, xmm2
INSTR xmm5, xmm0
INSTR xmm6, xmm1
INSTR xmm7, xmm2
INSTR xmm8, xmm0
INSTR xmm9, xmm1
INSTR xmm10, xmm2
INSTR xmm11, xmm0
INSTR xmm12, xmm1
INSTR xmm13, xmm2
INSTR xmm14, xmm0
INSTR xmm15, xmm1
INSTR xmm3, xmm2
INSTR xmm4, xmm0
INSTR xmm5, xmm1
INSTR xmm6, xmm2
INSTR xmm7, xmm0
INSTR xmm8, xmm1
INSTR xmm9, xmm2
INSTR xmm10, xmm0
INSTR xmm11, xmm1
INSTR xmm12, xmm2
INSTR xmm13, xmm0
INSTR xmm14, xmm1
INSTR xmm15, xmm2
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
INSTR xmm14, xmm2
INSTR xmm15, xmm0
INSTR xmm3, xmm1
INSTR xmm4, xmm2
INSTR xmm5, xmm0
INSTR xmm6, xmm1
INSTR xmm7, xmm2
INSTR xmm8, xmm0
INSTR xmm9, xmm1
INSTR xmm10, xmm2
INSTR xmm11, xmm0
INSTR xmm12, xmm1
INSTR xmm13, xmm2
INSTR xmm14, xmm0
INSTR xmm15, xmm1
INSTR xmm3, xmm2
INSTR xmm4, xmm0
INSTR xmm5, xmm1
INSTR xmm6, xmm2
INSTR xmm7, xmm0
INSTR xmm8, xmm1
INSTR xmm9, xmm2
INSTR xmm10, xmm0
INSTR xmm11, xmm1
INSTR xmm12, xmm2
INSTR xmm13, xmm0
INSTR xmm14, xmm1
INSTR xmm15, xmm2
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,172 +0,0 @@
# Microbenchmark kernel for INSTR = vmovaps in its register-to-register form,
# arranged as a serial dependency chain between xmm0 and xmm1.
# Exports ninst (count of INSTR copies per loop pass, NINST = 128) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovaps
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# This kernel does not reference PI.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. Each copy reads the register written by the
# copy before it (xmm0 <- xmm1, then xmm1 <- xmm0), forming one serial chain.
loop:
inc i
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Microbenchmark kernel for INSTR = vmovhpd in its three-operand form
# (register destination, register source, memory source at PI).
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovhpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every INSTR in the loop uses the start of this table as its memory operand.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. Destinations rotate through xmm3-xmm15 and
# register sources through xmm0-xmm2, so no copy reads a register written inside
# the loop; the memory operand is always RIP-relative PI.
loop:
inc i
INSTR xmm3, xmm0, [rip+PI]
INSTR xmm4, xmm1, [rip+PI]
INSTR xmm5, xmm2, [rip+PI]
INSTR xmm6, xmm0, [rip+PI]
INSTR xmm7, xmm1, [rip+PI]
INSTR xmm8, xmm2, [rip+PI]
INSTR xmm9, xmm0, [rip+PI]
INSTR xmm10, xmm1, [rip+PI]
INSTR xmm11, xmm2, [rip+PI]
INSTR xmm12, xmm0, [rip+PI]
INSTR xmm13, xmm1, [rip+PI]
INSTR xmm14, xmm2, [rip+PI]
INSTR xmm15, xmm0, [rip+PI]
INSTR xmm3, xmm1, [rip+PI]
INSTR xmm4, xmm2, [rip+PI]
INSTR xmm5, xmm0, [rip+PI]
INSTR xmm6, xmm1, [rip+PI]
INSTR xmm7, xmm2, [rip+PI]
INSTR xmm8, xmm0, [rip+PI]
INSTR xmm9, xmm1, [rip+PI]
INSTR xmm10, xmm2, [rip+PI]
INSTR xmm11, xmm0, [rip+PI]
INSTR xmm12, xmm1, [rip+PI]
INSTR xmm13, xmm2, [rip+PI]
INSTR xmm14, xmm0, [rip+PI]
INSTR xmm15, xmm1, [rip+PI]
INSTR xmm3, xmm2, [rip+PI]
INSTR xmm4, xmm0, [rip+PI]
INSTR xmm5, xmm1, [rip+PI]
INSTR xmm6, xmm2, [rip+PI]
INSTR xmm7, xmm0, [rip+PI]
INSTR xmm8, xmm1, [rip+PI]
INSTR xmm9, xmm2, [rip+PI]
INSTR xmm10, xmm0, [rip+PI]
INSTR xmm11, xmm1, [rip+PI]
INSTR xmm12, xmm2, [rip+PI]
INSTR xmm13, xmm0, [rip+PI]
INSTR xmm14, xmm1, [rip+PI]
INSTR xmm15, xmm2, [rip+PI]
INSTR xmm3, xmm0, [rip+PI]
INSTR xmm4, xmm1, [rip+PI]
INSTR xmm5, xmm2, [rip+PI]
INSTR xmm6, xmm0, [rip+PI]
INSTR xmm7, xmm1, [rip+PI]
INSTR xmm8, xmm2, [rip+PI]
INSTR xmm9, xmm0, [rip+PI]
INSTR xmm10, xmm1, [rip+PI]
INSTR xmm11, xmm2, [rip+PI]
INSTR xmm12, xmm0, [rip+PI]
INSTR xmm13, xmm1, [rip+PI]
INSTR xmm14, xmm2, [rip+PI]
INSTR xmm15, xmm0, [rip+PI]
INSTR xmm3, xmm1, [rip+PI]
INSTR xmm4, xmm2, [rip+PI]
INSTR xmm5, xmm0, [rip+PI]
INSTR xmm6, xmm1, [rip+PI]
INSTR xmm7, xmm2, [rip+PI]
INSTR xmm8, xmm0, [rip+PI]
INSTR xmm9, xmm1, [rip+PI]
INSTR xmm10, xmm2, [rip+PI]
INSTR xmm11, xmm0, [rip+PI]
INSTR xmm12, xmm1, [rip+PI]
INSTR xmm13, xmm2, [rip+PI]
INSTR xmm14, xmm0, [rip+PI]
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Microbenchmark kernel for INSTR = vmovhpd in its three-operand form
# (register destination, register source, memory source at PI), arranged as a
# serial dependency chain between xmm0 and xmm1.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovhpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every INSTR in the loop uses the start of this table as its memory operand.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. The register source of each copy is the
# destination of the copy before it (xmm0 <- xmm1, then xmm1 <- xmm0), forming
# one serial chain; the memory operand is always RIP-relative PI.
loop:
inc i
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
INSTR xmm0, xmm1, [rip+PI]
INSTR xmm1, xmm0, [rip+PI]
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,141 +0,0 @@
# Microbenchmark kernel for INSTR = vmovq moving an xmm register into a 64-bit
# general-purpose register.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovq
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# This kernel does not reference PI.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
# The jump lands after the pops below, so the skipped pushes stay balanced.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Save the general-purpose registers that are cleared below; they are popped in
# reverse order after the loop. The set includes rbx and r12-r15, which the
# System V AMD64 ABI treats as callee-saved.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the saved registers. Only rdx and r9-r15 are written by the loop;
# rax, rbx and rcx are not touched again until they are restored.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. Destinations rotate through rdx and r9-r15,
# sources through xmm0-xmm2; no copy reads a register written inside the loop.
loop:
inc i
INSTR rdx, xmm0
INSTR r9, xmm1
INSTR r10, xmm2
INSTR r11, xmm0
INSTR r12, xmm1
INSTR r13, xmm2
INSTR r14, xmm0
INSTR r15, xmm1
INSTR rdx, xmm2
INSTR r9, xmm0
INSTR r10, xmm1
INSTR r11, xmm2
INSTR r12, xmm0
INSTR r13, xmm1
INSTR r14, xmm2
INSTR r15, xmm0
INSTR rdx, xmm1
INSTR r9, xmm2
INSTR r10, xmm0
INSTR r11, xmm1
INSTR r12, xmm2
INSTR r13, xmm0
INSTR r14, xmm1
INSTR r15, xmm2
INSTR rdx, xmm0
INSTR r9, xmm1
INSTR r10, xmm2
INSTR r11, xmm0
INSTR r12, xmm1
INSTR r13, xmm2
INSTR r14, xmm0
INSTR r15, xmm1
INSTR rdx, xmm2
INSTR r9, xmm0
INSTR r10, xmm1
INSTR r11, xmm2
INSTR r12, xmm0
INSTR r13, xmm1
INSTR r14, xmm2
INSTR r15, xmm0
INSTR rdx, xmm1
INSTR r9, xmm2
INSTR r10, xmm0
INSTR r11, xmm1
INSTR r12, xmm2
INSTR r13, xmm0
INSTR r14, xmm1
INSTR r15, xmm2
INSTR rdx, xmm0
INSTR r9, xmm1
INSTR r10, xmm2
INSTR r11, xmm0
INSTR r12, xmm1
INSTR r13, xmm2
INSTR r14, xmm0
INSTR r15, xmm1
INSTR rdx, xmm2
INSTR r9, xmm0
INSTR r10, xmm1
INSTR r11, xmm2
INSTR r12, xmm0
INSTR r13, xmm1
INSTR r14, xmm2
INSTR r15, xmm0
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
# Restore the registers saved before the loop, in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Restore the stack pointer saved in the prologue and return; rax holds the value it had on entry.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,141 +0,0 @@
#define INSTR vmovq
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
# Microbenchmark kernel for INSTR = vmovq moving a 64-bit general-purpose
# register into an xmm register.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovq
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# This kernel does not reference PI.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
# The jump lands after the pops below, so the skipped pushes stay balanced.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Save the general-purpose registers that are cleared below; they are popped in
# reverse order after the loop. The set includes rbx and r12-r15, which the
# System V AMD64 ABI treats as callee-saved.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the saved registers. Clearing rdx also prepares the rdx:rax dividend
# for the div further down.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# Copy the bit pattern of DP 1.0 (0x3FF0000000000000) into rax and rbx.
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this integer sum is not the IEEE-754 encoding of DP 2.0; the
# operand values presumably do not matter for the measurement - confirm.
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the quotient
# left in rax is 1 and the remainder left in rdx is 0; rcx then receives that 1.
# NOTE(review): the result is the integer 1, not the encoding of DP 0.5.
div rax
movq rcx, rax
# Reload the DP 1.0 bit pattern into rax, which div overwrote.
vmovq rax, xmm0
# Unrolled body with NINST copies. Destinations rotate through xmm3-xmm15 and
# sources through rax, rbx and rcx; no copy reads a register written inside the loop.
loop:
inc i
INSTR xmm3, rax
INSTR xmm4, rbx
INSTR xmm5, rcx
INSTR xmm6, rax
INSTR xmm7, rbx
INSTR xmm8, rcx
INSTR xmm9, rax
INSTR xmm10, rbx
INSTR xmm11, rcx
INSTR xmm12, rax
INSTR xmm13, rbx
INSTR xmm14, rcx
INSTR xmm15, rax
INSTR xmm3, rbx
INSTR xmm4, rcx
INSTR xmm5, rax
INSTR xmm6, rbx
INSTR xmm7, rcx
INSTR xmm8, rax
INSTR xmm9, rbx
INSTR xmm10, rcx
INSTR xmm11, rax
INSTR xmm12, rbx
INSTR xmm13, rcx
INSTR xmm14, rax
INSTR xmm15, rbx
INSTR xmm3, rcx
INSTR xmm4, rax
INSTR xmm5, rbx
INSTR xmm6, rcx
INSTR xmm7, rax
INSTR xmm8, rbx
INSTR xmm9, rcx
INSTR xmm10, rax
INSTR xmm11, rbx
INSTR xmm12, rcx
INSTR xmm13, rax
INSTR xmm14, rbx
INSTR xmm15, rcx
INSTR xmm3, rax
INSTR xmm4, rbx
INSTR xmm5, rcx
INSTR xmm6, rax
INSTR xmm7, rbx
INSTR xmm8, rcx
INSTR xmm9, rax
INSTR xmm10, rbx
INSTR xmm11, rcx
INSTR xmm12, rax
INSTR xmm13, rbx
INSTR xmm14, rcx
INSTR xmm15, rax
INSTR xmm3, rbx
INSTR xmm4, rcx
INSTR xmm5, rax
INSTR xmm6, rbx
INSTR xmm7, rcx
INSTR xmm8, rax
INSTR xmm9, rbx
INSTR xmm10, rcx
INSTR xmm11, rax
INSTR xmm12, rbx
INSTR xmm13, rcx
INSTR xmm14, rax
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
# Restore the registers saved before the loop, in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Restore the stack pointer saved in the prologue and return; rax holds the value it had on entry.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,143 +0,0 @@
#define INSTR vmovq
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.align 32
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
loop:
inc i
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Microbenchmark kernel for INSTR = vmovsd in its store form (xmm register to
# memory at PI), with the source register rotating between copies.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every store in the loop targets the start of this table, so its first
# element is overwritten once the kernel runs.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST stores. Sources rotate through xmm0-xmm2 and every
# store writes the same RIP-relative address PI; no register is written in the loop.
loop:
inc i
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm1
INSTR [rip+PI], xmm2
INSTR [rip+PI], xmm0
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Microbenchmark kernel for INSTR = vmovsd in its store form (xmm0 to memory
# at PI), repeated back to back with the same operands.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every store in the loop targets the start of this table, so its first
# element is overwritten once the kernel runs.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0 (xmm1 is not used by the loop below)
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST identical stores of xmm0 to the RIP-relative address
# PI; xmm0 is never written in the loop, so every store writes the same value.
loop:
inc i
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
INSTR [rip+PI], xmm0
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,101 +0,0 @@
# Microbenchmark kernel for INSTR = vmovsd in its load form (memory at PI to
# xmm register), with the destination register rotating between copies.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every load in the loop reads from the start of this table.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading
# from memory. The loop below never reads xmm0.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Unrolled body with NINST loads. Destinations rotate through xmm3-xmm15 and
# every load reads the same RIP-relative address PI.
loop:
inc i
INSTR xmm3, [rip+PI]
INSTR xmm4, [rip+PI]
INSTR xmm5, [rip+PI]
INSTR xmm6, [rip+PI]
INSTR xmm7, [rip+PI]
INSTR xmm8, [rip+PI]
INSTR xmm9, [rip+PI]
INSTR xmm10, [rip+PI]
INSTR xmm11, [rip+PI]
INSTR xmm12, [rip+PI]
INSTR xmm13, [rip+PI]
INSTR xmm14, [rip+PI]
INSTR xmm15, [rip+PI]
INSTR xmm3, [rip+PI]
INSTR xmm4, [rip+PI]
INSTR xmm5, [rip+PI]
INSTR xmm6, [rip+PI]
INSTR xmm7, [rip+PI]
INSTR xmm8, [rip+PI]
INSTR xmm9, [rip+PI]
INSTR xmm10, [rip+PI]
INSTR xmm11, [rip+PI]
INSTR xmm12, [rip+PI]
INSTR xmm13, [rip+PI]
INSTR xmm14, [rip+PI]
INSTR xmm15, [rip+PI]
INSTR xmm3, [rip+PI]
INSTR xmm4, [rip+PI]
INSTR xmm5, [rip+PI]
INSTR xmm6, [rip+PI]
INSTR xmm7, [rip+PI]
INSTR xmm8, [rip+PI]
INSTR xmm9, [rip+PI]
INSTR xmm10, [rip+PI]
INSTR xmm11, [rip+PI]
INSTR xmm12, [rip+PI]
INSTR xmm13, [rip+PI]
INSTR xmm14, [rip+PI]
INSTR xmm15, [rip+PI]
INSTR xmm3, [rip+PI]
INSTR xmm4, [rip+PI]
INSTR xmm5, [rip+PI]
INSTR xmm6, [rip+PI]
INSTR xmm7, [rip+PI]
INSTR xmm8, [rip+PI]
INSTR xmm9, [rip+PI]
INSTR xmm10, [rip+PI]
INSTR xmm11, [rip+PI]
INSTR xmm12, [rip+PI]
INSTR xmm13, [rip+PI]
INSTR xmm14, [rip+PI]
INSTR xmm15, [rip+PI]
INSTR xmm3, [rip+PI]
INSTR xmm4, [rip+PI]
INSTR xmm5, [rip+PI]
INSTR xmm6, [rip+PI]
INSTR xmm7, [rip+PI]
INSTR xmm8, [rip+PI]
INSTR xmm9, [rip+PI]
INSTR xmm10, [rip+PI]
INSTR xmm11, [rip+PI]
INSTR xmm12, [rip+PI]
INSTR xmm13, [rip+PI]
INSTR xmm14, [rip+PI]
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,101 +0,0 @@
# Microbenchmark kernel for INSTR = vmovsd in its load form (memory at PI to
# xmm0), repeated back to back with the same operands.
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# Every load in the loop reads from the start of this table.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading
# from memory. The first load in the loop overwrites xmm0.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Unrolled body with NINST identical loads from the RIP-relative address PI into xmm0.
# NOTE(review): no instruction in the loop takes xmm0 as a source operand, so
# consecutive loads share a destination but have no read-after-write link -
# confirm this is the intended shape for this variant of the test.
loop:
inc i
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
INSTR xmm0, [rip+PI]
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
# Microbenchmark kernel for INSTR = vmovsd in its three-register form
# (destination, first source, second source).
# Exports ninst (count of INSTR copies per loop pass, NINST = 64) and the
# function latency, which runs the unrolled loop body N times.
# Macro N names the iteration count in edi; macro i names the loop counter in r8d.
# NOTE(review): taking N from edi suggests a System V AMD64 caller passing one
# int argument - confirm against the harness that calls this symbol.
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
# Exported 32-bit value: number of INSTR copies in one pass of the loop body.
ninst:
.long NINST
.align 32
# Sixty-four bytes of 32-byte aligned data: eight copies of the double with bit
# pattern 0x400921f9f01b866e (about 3.14159), written as low/high 32-bit halves.
# This kernel does not reference PI.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry stack pointer for the epilogue.
push rbp
mov rbp, rsp
# Loop counter starts at zero.
xor i, i
# Skip the kernel entirely when the requested iteration count is zero or negative.
test N, N
jle done
# Build DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0 without loading from memory.
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each qword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: sign bit and top exponent bit become 0, leaving exponent 0x3FF
# Copy DP 1.0 into xmm1 (the first vmovaps copies xmm0 onto itself).
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = 1.0 / 2.0 = 0.5
vdivpd xmm2, xmm0, xmm1
# Unrolled body with NINST copies. Destinations rotate through xmm3-xmm15; both
# source operands of a copy name the same register, rotating through xmm0-xmm2,
# so no copy reads a register written inside the loop.
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm3, xmm1, xmm1
INSTR xmm4, xmm2, xmm2
INSTR xmm5, xmm0, xmm0
INSTR xmm6, xmm1, xmm1
INSTR xmm7, xmm2, xmm2
INSTR xmm8, xmm0, xmm0
INSTR xmm9, xmm1, xmm1
INSTR xmm10, xmm2, xmm2
INSTR xmm11, xmm0, xmm0
INSTR xmm12, xmm1, xmm1
INSTR xmm13, xmm2, xmm2
INSTR xmm14, xmm0, xmm0
INSTR xmm15, xmm1, xmm1
INSTR xmm3, xmm2, xmm2
INSTR xmm4, xmm0, xmm0
INSTR xmm5, xmm1, xmm1
INSTR xmm6, xmm2, xmm2
INSTR xmm7, xmm0, xmm0
INSTR xmm8, xmm1, xmm1
INSTR xmm9, xmm2, xmm2
INSTR xmm10, xmm0, xmm0
INSTR xmm11, xmm1, xmm1
INSTR xmm12, xmm2, xmm2
INSTR xmm13, xmm0, xmm0
INSTR xmm14, xmm1, xmm1
INSTR xmm15, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm3, xmm1, xmm1
INSTR xmm4, xmm2, xmm2
INSTR xmm5, xmm0, xmm0
INSTR xmm6, xmm1, xmm1
INSTR xmm7, xmm2, xmm2
INSTR xmm8, xmm0, xmm0
INSTR xmm9, xmm1, xmm1
INSTR xmm10, xmm2, xmm2
INSTR xmm11, xmm0, xmm0
INSTR xmm12, xmm1, xmm1
INSTR xmm13, xmm2, xmm2
INSTR xmm14, xmm0, xmm0
# Repeat while the counter is below the requested count (signed compare).
cmp i, N
jl loop
done:
# Restore the stack pointer saved in the prologue and return; rax is never written here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vmovsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for the three-operand register form of vmovsd.
# The unrolled instances alternate between xmm0 and xmm1 as destination and
# each one reads the register written by its predecessor, so together they
# form one serial dependency chain.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter and xmm0-xmm2 as data registers.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0 without touching memory.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
    vmovaps xmm0, xmm0                  # self-move, value unchanged
    vmovaps xmm1, xmm0                  # xmm1 = 1.0
    vaddpd  xmm1, xmm1, xmm1            # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1            # xmm2 = 0.5
.Lkernel:
    inc     i
    # Unrolled body: 32 pairs, 64 instances in total.
.rept 32
    INSTR   xmm0, xmm1, xmm0
    INSTR   xmm1, xmm0, xmm0
.endr
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,101 +0,0 @@
#define INSTR vmovupd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmovupd loading 16 bytes from PI into an xmm register.
# Destinations rotate through xmm3..xmm15 and the only source is memory, so
# no instance reads a register written by another instance.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes xmm0 and xmm3-xmm15.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0; the kernel below never reads it.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
.Lkernel:
    inc     i
    # Four full passes over xmm3..xmm15 (52 instances) ...
.rept 4
.irp reg, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    INSTR   xmm\reg, [rip+PI]
.endr
.endr
    # ... plus xmm3..xmm14 once more (12 instances), 64 in total.
.irp reg, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    INSTR   xmm\reg, [rip+PI]
.endr
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,101 +0,0 @@
#define INSTR vmovupd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmovupd loading 16 bytes from PI into xmm0.
# Every unrolled instance has the same destination (xmm0) and the same
# memory source.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter and xmm0 as the only data register.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0; the first load overwrites it.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
.Lkernel:
    inc     i
    # Unrolled body: the same load repeated 64 times.
.rept 64
    INSTR   xmm0, [rip+PI]
.endr
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vmulpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulpd ymm, ymm, m256.
# Destinations rotate through ymm3..ymm15 while the register sources
# (ymm0, ymm1, ymm2) are only read, so no instance depends on another.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned;
# the first 32 bytes are the memory operand of every INSTR below.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm15.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm3, ymm0, [rip+PI]
    INSTR ymm4, ymm1, [rip+PI]
    INSTR ymm5, ymm2, [rip+PI]
    INSTR ymm6, ymm0, [rip+PI]
    INSTR ymm7, ymm1, [rip+PI]
    INSTR ymm8, ymm2, [rip+PI]
    INSTR ymm9, ymm0, [rip+PI]
    INSTR ymm10, ymm1, [rip+PI]
    INSTR ymm11, ymm2, [rip+PI]
    INSTR ymm12, ymm0, [rip+PI]
    INSTR ymm13, ymm1, [rip+PI]
    INSTR ymm14, ymm2, [rip+PI]
    INSTR ymm15, ymm0, [rip+PI]
    INSTR ymm3, ymm1, [rip+PI]
    INSTR ymm4, ymm2, [rip+PI]
    INSTR ymm5, ymm0, [rip+PI]
    INSTR ymm6, ymm1, [rip+PI]
    INSTR ymm7, ymm2, [rip+PI]
    INSTR ymm8, ymm0, [rip+PI]
    INSTR ymm9, ymm1, [rip+PI]
    INSTR ymm10, ymm2, [rip+PI]
    INSTR ymm11, ymm0, [rip+PI]
    INSTR ymm12, ymm1, [rip+PI]
    INSTR ymm13, ymm2, [rip+PI]
    INSTR ymm14, ymm0, [rip+PI]
    INSTR ymm15, ymm1, [rip+PI]
    INSTR ymm3, ymm2, [rip+PI]
    INSTR ymm4, ymm0, [rip+PI]
    INSTR ymm5, ymm1, [rip+PI]
    INSTR ymm6, ymm2, [rip+PI]
    INSTR ymm7, ymm0, [rip+PI]
    INSTR ymm8, ymm1, [rip+PI]
    INSTR ymm9, ymm2, [rip+PI]
    INSTR ymm10, ymm0, [rip+PI]
    INSTR ymm11, ymm1, [rip+PI]
    INSTR ymm12, ymm2, [rip+PI]
    INSTR ymm13, ymm0, [rip+PI]
    INSTR ymm14, ymm1, [rip+PI]
    INSTR ymm15, ymm2, [rip+PI]
    INSTR ymm3, ymm0, [rip+PI]
    INSTR ymm4, ymm1, [rip+PI]
    INSTR ymm5, ymm2, [rip+PI]
    INSTR ymm6, ymm0, [rip+PI]
    INSTR ymm7, ymm1, [rip+PI]
    INSTR ymm8, ymm2, [rip+PI]
    INSTR ymm9, ymm0, [rip+PI]
    INSTR ymm10, ymm1, [rip+PI]
    INSTR ymm11, ymm2, [rip+PI]
    INSTR ymm12, ymm0, [rip+PI]
    INSTR ymm13, ymm1, [rip+PI]
    INSTR ymm14, ymm2, [rip+PI]
    INSTR ymm15, ymm0, [rip+PI]
    INSTR ymm3, ymm1, [rip+PI]
    INSTR ymm4, ymm2, [rip+PI]
    INSTR ymm5, ymm0, [rip+PI]
    INSTR ymm6, ymm1, [rip+PI]
    INSTR ymm7, ymm2, [rip+PI]
    INSTR ymm8, ymm0, [rip+PI]
    INSTR ymm9, ymm1, [rip+PI]
    INSTR ymm10, ymm2, [rip+PI]
    INSTR ymm11, ymm0, [rip+PI]
    INSTR ymm12, ymm1, [rip+PI]
    INSTR ymm13, ymm2, [rip+PI]
    INSTR ymm14, ymm0, [rip+PI]
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vmulpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulpd ymm, ymm, m256 arranged as a serial chain:
# the instances alternate between ymm0 and ymm1 as destination and each one
# reads the register written by its predecessor.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned;
# the first 32 bytes are the memory operand of every INSTR below.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm2.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    INSTR ymm0, ymm1, [rip+PI]
    INSTR ymm1, ymm0, [rip+PI]
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vmulpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulpd ymm, ymm, ymm.
# Destinations rotate through ymm3..ymm15 while the sources (ymm0, ymm1,
# ymm2) are only read, so no instance depends on another.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm15.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm3, ymm0, ymm0
    INSTR ymm4, ymm1, ymm1
    INSTR ymm5, ymm2, ymm2
    INSTR ymm6, ymm0, ymm0
    INSTR ymm7, ymm1, ymm1
    INSTR ymm8, ymm2, ymm2
    INSTR ymm9, ymm0, ymm0
    INSTR ymm10, ymm1, ymm1
    INSTR ymm11, ymm2, ymm2
    INSTR ymm12, ymm0, ymm0
    INSTR ymm13, ymm1, ymm1
    INSTR ymm14, ymm2, ymm2
    INSTR ymm15, ymm0, ymm0
    INSTR ymm3, ymm1, ymm1
    INSTR ymm4, ymm2, ymm2
    INSTR ymm5, ymm0, ymm0
    INSTR ymm6, ymm1, ymm1
    INSTR ymm7, ymm2, ymm2
    INSTR ymm8, ymm0, ymm0
    INSTR ymm9, ymm1, ymm1
    INSTR ymm10, ymm2, ymm2
    INSTR ymm11, ymm0, ymm0
    INSTR ymm12, ymm1, ymm1
    INSTR ymm13, ymm2, ymm2
    INSTR ymm14, ymm0, ymm0
    INSTR ymm15, ymm1, ymm1
    INSTR ymm3, ymm2, ymm2
    INSTR ymm4, ymm0, ymm0
    INSTR ymm5, ymm1, ymm1
    INSTR ymm6, ymm2, ymm2
    INSTR ymm7, ymm0, ymm0
    INSTR ymm8, ymm1, ymm1
    INSTR ymm9, ymm2, ymm2
    INSTR ymm10, ymm0, ymm0
    INSTR ymm11, ymm1, ymm1
    INSTR ymm12, ymm2, ymm2
    INSTR ymm13, ymm0, ymm0
    INSTR ymm14, ymm1, ymm1
    INSTR ymm15, ymm2, ymm2
    INSTR ymm3, ymm0, ymm0
    INSTR ymm4, ymm1, ymm1
    INSTR ymm5, ymm2, ymm2
    INSTR ymm6, ymm0, ymm0
    INSTR ymm7, ymm1, ymm1
    INSTR ymm8, ymm2, ymm2
    INSTR ymm9, ymm0, ymm0
    INSTR ymm10, ymm1, ymm1
    INSTR ymm11, ymm2, ymm2
    INSTR ymm12, ymm0, ymm0
    INSTR ymm13, ymm1, ymm1
    INSTR ymm14, ymm2, ymm2
    INSTR ymm15, ymm0, ymm0
    INSTR ymm3, ymm1, ymm1
    INSTR ymm4, ymm2, ymm2
    INSTR ymm5, ymm0, ymm0
    INSTR ymm6, ymm1, ymm1
    INSTR ymm7, ymm2, ymm2
    INSTR ymm8, ymm0, ymm0
    INSTR ymm9, ymm1, ymm1
    INSTR ymm10, ymm2, ymm2
    INSTR ymm11, ymm0, ymm0
    INSTR ymm12, ymm1, ymm1
    INSTR ymm13, ymm2, ymm2
    INSTR ymm14, ymm0, ymm0
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vmulpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulpd ymm, ymm, ymm arranged as a serial chain:
# the instances alternate between ymm0 and ymm1 as destination and each one
# reads the register written by its predecessor.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm2.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vmulsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulsd xmm, xmm, m64.
# Destinations rotate through xmm3..xmm15 while the register sources
# (xmm0, xmm1, xmm2) are only read, so no instance depends on another.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned;
# the first one is the memory operand of every INSTR below.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
# TRIO d0, d1, d2: emits three instances writing xmm<d0>, xmm<d1>, xmm<d2>
# with xmm0, xmm1, xmm2 as the respective register source.
.macro TRIO d0, d1, d2
    INSTR   xmm\d0, xmm0, [rip+PI]
    INSTR   xmm\d1, xmm1, [rip+PI]
    INSTR   xmm\d2, xmm2, [rip+PI]
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes xmm0-xmm15.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0 without touching memory.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
    vmovaps xmm0, xmm0                  # self-move, value unchanged
    vmovaps xmm1, xmm0                  # xmm1 = 1.0
    vaddpd  xmm1, xmm1, xmm1            # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1            # xmm2 = 0.5
.Lkernel:
    inc     i
    # Unrolled body: 21 triples plus one single instance, 64 in total.
    TRIO    3, 4, 5
    TRIO    6, 7, 8
    TRIO    9, 10, 11
    TRIO    12, 13, 14
    TRIO    15, 3, 4
    TRIO    5, 6, 7
    TRIO    8, 9, 10
    TRIO    11, 12, 13
    TRIO    14, 15, 3
    TRIO    4, 5, 6
    TRIO    7, 8, 9
    TRIO    10, 11, 12
    TRIO    13, 14, 15
    TRIO    3, 4, 5
    TRIO    6, 7, 8
    TRIO    9, 10, 11
    TRIO    12, 13, 14
    TRIO    15, 3, 4
    TRIO    5, 6, 7
    TRIO    8, 9, 10
    TRIO    11, 12, 13
    INSTR   xmm14, xmm0, [rip+PI]
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vmulsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulsd xmm, xmm, m64 arranged as a serial chain:
# the instances alternate between xmm0 and xmm1 as destination and each one
# reads the register written by its predecessor.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned;
# the first one is the memory operand of every INSTR below.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter and xmm0-xmm2 as data registers.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0 without touching memory.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
    vmovaps xmm0, xmm0                  # self-move, value unchanged
    vmovaps xmm1, xmm0                  # xmm1 = 1.0
    vaddpd  xmm1, xmm1, xmm1            # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1            # xmm2 = 0.5
.Lkernel:
    inc     i
    # Unrolled body: 32 pairs, 64 instances in total.
.rept 32
    INSTR   xmm0, xmm1, [rip+PI]
    INSTR   xmm1, xmm0, [rip+PI]
.endr
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vmulsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulsd xmm, xmm, xmm.
# Destinations rotate through xmm3..xmm15 while the sources (xmm0, xmm1,
# xmm2) are only read, so no instance depends on another.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
# TRIO d0, d1, d2: emits three instances writing xmm<d0>, xmm<d1>, xmm<d2>
# that square xmm0, xmm1 and xmm2 respectively.
.macro TRIO d0, d1, d2
    INSTR   xmm\d0, xmm0, xmm0
    INSTR   xmm\d1, xmm1, xmm1
    INSTR   xmm\d2, xmm2, xmm2
.endm
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes xmm0-xmm15.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0 without touching memory.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
    vmovaps xmm0, xmm0                  # self-move, value unchanged
    vmovaps xmm1, xmm0                  # xmm1 = 1.0
    vaddpd  xmm1, xmm1, xmm1            # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1            # xmm2 = 0.5
.Lkernel:
    inc     i
    # Unrolled body: 21 triples plus one single instance, 64 in total.
    TRIO    3, 4, 5
    TRIO    6, 7, 8
    TRIO    9, 10, 11
    TRIO    12, 13, 14
    TRIO    15, 3, 4
    TRIO    5, 6, 7
    TRIO    8, 9, 10
    TRIO    11, 12, 13
    TRIO    14, 15, 3
    TRIO    4, 5, 6
    TRIO    7, 8, 9
    TRIO    10, 11, 12
    TRIO    13, 14, 15
    TRIO    3, 4, 5
    TRIO    6, 7, 8
    TRIO    9, 10, 11
    TRIO    12, 13, 14
    TRIO    15, 3, 4
    TRIO    5, 6, 7
    TRIO    8, 9, 10
    TRIO    11, 12, 13
    INSTR   xmm14, xmm0, xmm0
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vmulsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vmulsd xmm, xmm, xmm arranged as a serial chain:
# the instances alternate between xmm0 and xmm1 as destination and each one
# reads the register written by its predecessor.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
    .long NINST
    .align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.rept 8
    .long 0xf01b866e, 0x400921f9
.endr
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter and xmm0-xmm2 as data registers.
# rax is never written here, so no return value is produced.
latency:
    push    rbp
    mov     rbp, rsp
    xor     i, i                        # loop counter starts at zero
    test    N, N
    jle     .Lexit                      # nothing to do for a count <= 0
    # Build 1.0 in both 64-bit lanes of xmm0 without touching memory.
    vpcmpeqw xmm0, xmm0, xmm0           # every bit set
    vpsllq  xmm0, xmm0, 54              # keep only the top 10 bits (64 - 54 = 10)
    vpsrlq  xmm0, xmm0, 2               # clear sign and exponent MSB: 0x3ff0000000000000 = 1.0
    vmovaps xmm0, xmm0                  # self-move, value unchanged
    vmovaps xmm1, xmm0                  # xmm1 = 1.0
    vaddpd  xmm1, xmm1, xmm1            # xmm1 = 2.0
    vdivpd  xmm2, xmm0, xmm1            # xmm2 = 0.5
.Lkernel:
    inc     i
    # Unrolled body: 32 pairs, 64 instances in total.
.rept 32
    INSTR   xmm0, xmm1, xmm0
    INSTR   xmm1, xmm0, xmm0
.endr
    cmp     i, N
    jl      .Lkernel                    # signed compare, matches the jle guard above
.Lexit:
    mov     rsp, rbp
    pop     rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vsubpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vsubpd ymm, ymm, ymm.
# Destinations rotate through ymm3..ymm15 while the sources (ymm0, ymm1,
# ymm2) are only read, so no instance depends on another.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm15.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm3, ymm0, ymm0
    INSTR ymm4, ymm1, ymm1
    INSTR ymm5, ymm2, ymm2
    INSTR ymm6, ymm0, ymm0
    INSTR ymm7, ymm1, ymm1
    INSTR ymm8, ymm2, ymm2
    INSTR ymm9, ymm0, ymm0
    INSTR ymm10, ymm1, ymm1
    INSTR ymm11, ymm2, ymm2
    INSTR ymm12, ymm0, ymm0
    INSTR ymm13, ymm1, ymm1
    INSTR ymm14, ymm2, ymm2
    INSTR ymm15, ymm0, ymm0
    INSTR ymm3, ymm1, ymm1
    INSTR ymm4, ymm2, ymm2
    INSTR ymm5, ymm0, ymm0
    INSTR ymm6, ymm1, ymm1
    INSTR ymm7, ymm2, ymm2
    INSTR ymm8, ymm0, ymm0
    INSTR ymm9, ymm1, ymm1
    INSTR ymm10, ymm2, ymm2
    INSTR ymm11, ymm0, ymm0
    INSTR ymm12, ymm1, ymm1
    INSTR ymm13, ymm2, ymm2
    INSTR ymm14, ymm0, ymm0
    INSTR ymm15, ymm1, ymm1
    INSTR ymm3, ymm2, ymm2
    INSTR ymm4, ymm0, ymm0
    INSTR ymm5, ymm1, ymm1
    INSTR ymm6, ymm2, ymm2
    INSTR ymm7, ymm0, ymm0
    INSTR ymm8, ymm1, ymm1
    INSTR ymm9, ymm2, ymm2
    INSTR ymm10, ymm0, ymm0
    INSTR ymm11, ymm1, ymm1
    INSTR ymm12, ymm2, ymm2
    INSTR ymm13, ymm0, ymm0
    INSTR ymm14, ymm1, ymm1
    INSTR ymm15, ymm2, ymm2
    INSTR ymm3, ymm0, ymm0
    INSTR ymm4, ymm1, ymm1
    INSTR ymm5, ymm2, ymm2
    INSTR ymm6, ymm0, ymm0
    INSTR ymm7, ymm1, ymm1
    INSTR ymm8, ymm2, ymm2
    INSTR ymm9, ymm0, ymm0
    INSTR ymm10, ymm1, ymm1
    INSTR ymm11, ymm2, ymm2
    INSTR ymm12, ymm0, ymm0
    INSTR ymm13, ymm1, ymm1
    INSTR ymm14, ymm2, ymm2
    INSTR ymm15, ymm0, ymm0
    INSTR ymm3, ymm1, ymm1
    INSTR ymm4, ymm2, ymm2
    INSTR ymm5, ymm0, ymm0
    INSTR ymm6, ymm1, ymm1
    INSTR ymm7, ymm2, ymm2
    INSTR ymm8, ymm0, ymm0
    INSTR ymm9, ymm1, ymm1
    INSTR ymm10, ymm2, ymm2
    INSTR ymm11, ymm0, ymm0
    INSTR ymm12, ymm1, ymm1
    INSTR ymm13, ymm2, ymm2
    INSTR ymm14, ymm0, ymm0
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vsubpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix
# Benchmark kernel for vsubpd ymm, ymm, ymm arranged as a serial chain:
# the instances alternate between ymm0 and ymm1 as destination and each one
# reads the register written by its predecessor.
.globl ninst
.data
# Exported 32-bit constant: number of INSTR instances per loop iteration.
ninst:
.long NINST
.align 32
# Eight doubles with bit pattern 0x400921f9f01b866e (3.14159), 32-byte aligned.
# This kernel does not reference them.
PI:
.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
.text
.globl latency
.type latency, @function
.align 32
# latency: runs the kernel loop.
# Input: edi holds the requested iteration count (compared as signed 32-bit);
# the loop is skipped entirely when it is zero or negative.
# Uses r8d as loop counter; writes ymm0-ymm2.
# rax is never written here, so no return value is produced.
latency:
    push rbp
    mov rbp, rsp
    xor i, i
    test N, N
    jle done
    # create DP 1.0 in both 64-bit lanes of xmm0
    vpcmpeqw xmm0, xmm0, xmm0 # all ones
    vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (64 - 54 = 10)
    vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and exponent MSB become zero -> 0x3ff0000000000000
    # expand from SSE to AVX: copy the low 128 bits into the high half
    vinsertf128 ymm0, ymm0, xmm0, 0x1
    # copy DP 1.0 (the first move is a self-move and changes nothing)
    vmovaps ymm0, ymm0
    vmovaps ymm1, ymm0
    # Create DP 2.0
    vaddpd ymm1, ymm1, ymm1
    # Create DP 0.5
    vdivpd ymm2, ymm0, ymm1
loop:
    inc i
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    INSTR ymm0, ymm1, ymm0
    INSTR ymm1, ymm0, ymm0
    cmp i, N
    jl loop
    # The ymm registers written above have dirty upper halves.  Clear them
    # before returning so legacy-SSE code in the caller does not pay the
    # AVX/SSE transition penalty.  Placed ahead of the done label so the
    # early-exit path stays free of AVX instructions, as before.
    vzeroupper
done:
    mov rsp, rbp
    pop rbp
    ret
.size latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vsubsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel: NINST copies of INSTR per loop pass, none of which
# consumes the result of another copy.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1, xmm2   read-only inputs (1.0, 2.0, 0.5 in both DP lanes)
#   xmm3 .. xmm15      destinations only, never read by the loop
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm0, xmm0                  # self-copy kept from the template
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i

        INSTR   xmm3,  xmm0, xmm0
        INSTR   xmm4,  xmm1, xmm1
        INSTR   xmm5,  xmm2, xmm2
        INSTR   xmm6,  xmm0, xmm0
        INSTR   xmm7,  xmm1, xmm1
        INSTR   xmm8,  xmm2, xmm2
        INSTR   xmm9,  xmm0, xmm0
        INSTR   xmm10, xmm1, xmm1
        INSTR   xmm11, xmm2, xmm2
        INSTR   xmm12, xmm0, xmm0
        INSTR   xmm13, xmm1, xmm1
        INSTR   xmm14, xmm2, xmm2
        INSTR   xmm15, xmm0, xmm0

        INSTR   xmm3,  xmm1, xmm1
        INSTR   xmm4,  xmm2, xmm2
        INSTR   xmm5,  xmm0, xmm0
        INSTR   xmm6,  xmm1, xmm1
        INSTR   xmm7,  xmm2, xmm2
        INSTR   xmm8,  xmm0, xmm0
        INSTR   xmm9,  xmm1, xmm1
        INSTR   xmm10, xmm2, xmm2
        INSTR   xmm11, xmm0, xmm0
        INSTR   xmm12, xmm1, xmm1
        INSTR   xmm13, xmm2, xmm2
        INSTR   xmm14, xmm0, xmm0
        INSTR   xmm15, xmm1, xmm1

        INSTR   xmm3,  xmm2, xmm2
        INSTR   xmm4,  xmm0, xmm0
        INSTR   xmm5,  xmm1, xmm1
        INSTR   xmm6,  xmm2, xmm2
        INSTR   xmm7,  xmm0, xmm0
        INSTR   xmm8,  xmm1, xmm1
        INSTR   xmm9,  xmm2, xmm2
        INSTR   xmm10, xmm0, xmm0
        INSTR   xmm11, xmm1, xmm1
        INSTR   xmm12, xmm2, xmm2
        INSTR   xmm13, xmm0, xmm0
        INSTR   xmm14, xmm1, xmm1
        INSTR   xmm15, xmm2, xmm2

        INSTR   xmm3,  xmm0, xmm0
        INSTR   xmm4,  xmm1, xmm1
        INSTR   xmm5,  xmm2, xmm2
        INSTR   xmm6,  xmm0, xmm0
        INSTR   xmm7,  xmm1, xmm1
        INSTR   xmm8,  xmm2, xmm2
        INSTR   xmm9,  xmm0, xmm0
        INSTR   xmm10, xmm1, xmm1
        INSTR   xmm11, xmm2, xmm2
        INSTR   xmm12, xmm0, xmm0
        INSTR   xmm13, xmm1, xmm1
        INSTR   xmm14, xmm2, xmm2
        INSTR   xmm15, xmm0, xmm0

        INSTR   xmm3,  xmm1, xmm1
        INSTR   xmm4,  xmm2, xmm2
        INSTR   xmm5,  xmm0, xmm0
        INSTR   xmm6,  xmm1, xmm1
        INSTR   xmm7,  xmm2, xmm2
        INSTR   xmm8,  xmm0, xmm0
        INSTR   xmm9,  xmm1, xmm1
        INSTR   xmm10, xmm2, xmm2
        INSTR   xmm11, xmm0, xmm0
        INSTR   xmm12, xmm1, xmm1
        INSTR   xmm13, xmm2, xmm2
        INSTR   xmm14, xmm0, xmm0

        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vsubsd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel: NINST copies of INSTR per loop pass, arranged so that
# every copy reads the register written by the copy just before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1   ping-pong pair carrying the dependency chain
#   xmm2         initialised to 0.5 but not used by the loop
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm0, xmm0                  # self-copy kept from the template
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i
        # The unrolled body is the same two-instruction pair repeated, so
        # the assembler emits it NINST >> 1 times (NINST must be even).
        .rept   NINST >> 1
        INSTR   xmm0, xmm1, xmm0            # reads xmm1 from the copy before
        INSTR   xmm1, xmm0, xmm0            # reads xmm0 from the copy before
        .endr
        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vunpckhpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel: NINST copies of INSTR per loop pass, none of which
# consumes the result of another copy.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1, xmm2   read-only inputs (1.0, 2.0, 0.5 in both DP lanes)
#   xmm3 .. xmm15      destinations only, never read by the loop
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm0, xmm0                  # self-copy kept from the template
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i

        INSTR   xmm3,  xmm0, xmm0
        INSTR   xmm4,  xmm1, xmm1
        INSTR   xmm5,  xmm2, xmm2
        INSTR   xmm6,  xmm0, xmm0
        INSTR   xmm7,  xmm1, xmm1
        INSTR   xmm8,  xmm2, xmm2
        INSTR   xmm9,  xmm0, xmm0
        INSTR   xmm10, xmm1, xmm1
        INSTR   xmm11, xmm2, xmm2
        INSTR   xmm12, xmm0, xmm0
        INSTR   xmm13, xmm1, xmm1
        INSTR   xmm14, xmm2, xmm2
        INSTR   xmm15, xmm0, xmm0

        INSTR   xmm3,  xmm1, xmm1
        INSTR   xmm4,  xmm2, xmm2
        INSTR   xmm5,  xmm0, xmm0
        INSTR   xmm6,  xmm1, xmm1
        INSTR   xmm7,  xmm2, xmm2
        INSTR   xmm8,  xmm0, xmm0
        INSTR   xmm9,  xmm1, xmm1
        INSTR   xmm10, xmm2, xmm2
        INSTR   xmm11, xmm0, xmm0
        INSTR   xmm12, xmm1, xmm1
        INSTR   xmm13, xmm2, xmm2
        INSTR   xmm14, xmm0, xmm0
        INSTR   xmm15, xmm1, xmm1

        INSTR   xmm3,  xmm2, xmm2
        INSTR   xmm4,  xmm0, xmm0
        INSTR   xmm5,  xmm1, xmm1
        INSTR   xmm6,  xmm2, xmm2
        INSTR   xmm7,  xmm0, xmm0
        INSTR   xmm8,  xmm1, xmm1
        INSTR   xmm9,  xmm2, xmm2
        INSTR   xmm10, xmm0, xmm0
        INSTR   xmm11, xmm1, xmm1
        INSTR   xmm12, xmm2, xmm2
        INSTR   xmm13, xmm0, xmm0
        INSTR   xmm14, xmm1, xmm1
        INSTR   xmm15, xmm2, xmm2

        INSTR   xmm3,  xmm0, xmm0
        INSTR   xmm4,  xmm1, xmm1
        INSTR   xmm5,  xmm2, xmm2
        INSTR   xmm6,  xmm0, xmm0
        INSTR   xmm7,  xmm1, xmm1
        INSTR   xmm8,  xmm2, xmm2
        INSTR   xmm9,  xmm0, xmm0
        INSTR   xmm10, xmm1, xmm1
        INSTR   xmm11, xmm2, xmm2
        INSTR   xmm12, xmm0, xmm0
        INSTR   xmm13, xmm1, xmm1
        INSTR   xmm14, xmm2, xmm2
        INSTR   xmm15, xmm0, xmm0

        INSTR   xmm3,  xmm1, xmm1
        INSTR   xmm4,  xmm2, xmm2
        INSTR   xmm5,  xmm0, xmm0
        INSTR   xmm6,  xmm1, xmm1
        INSTR   xmm7,  xmm2, xmm2
        INSTR   xmm8,  xmm0, xmm0
        INSTR   xmm9,  xmm1, xmm1
        INSTR   xmm10, xmm2, xmm2
        INSTR   xmm11, xmm0, xmm0
        INSTR   xmm12, xmm1, xmm1
        INSTR   xmm13, xmm2, xmm2
        INSTR   xmm14, xmm0, xmm0

        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vunpckhpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel: NINST copies of INSTR per loop pass, arranged so that
# every copy reads the register written by the copy just before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1   ping-pong pair carrying the dependency chain
#   xmm2         initialised to 0.5 but not used by the loop
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm0, xmm0                  # self-copy kept from the template
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i
        # The unrolled body is the same two-instruction pair repeated, so
        # the assembler emits it NINST >> 1 times (NINST must be even).
        .rept   NINST >> 1
        INSTR   xmm0, xmm1, xmm0            # reads xmm1 from the copy before
        INSTR   xmm1, xmm0, xmm0            # reads xmm0 from the copy before
        .endr
        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,172 +0,0 @@
#define INSTR vxorpd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel: NINST copies of INSTR per loop pass, none of which
# consumes the result of another copy.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1, xmm2   read-only inputs (1.0, 2.0, 0.5 in both DP lanes)
#   xmm3 .. xmm15      destinations only, never read by the loop
#
# Review note: the two source operands of every copy are deliberately
# different registers. An xor of a register with itself is a zeroing
# idiom that many cores resolve at register rename without using an
# execution port, so a kernel built from it would not measure the
# execution throughput of INSTR.
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2
        INSTR   xmm14, xmm2, xmm0
        INSTR   xmm15, xmm0, xmm1

        INSTR   xmm3,  xmm1, xmm2
        INSTR   xmm4,  xmm2, xmm0
        INSTR   xmm5,  xmm0, xmm1
        INSTR   xmm6,  xmm1, xmm2
        INSTR   xmm7,  xmm2, xmm0
        INSTR   xmm8,  xmm0, xmm1
        INSTR   xmm9,  xmm1, xmm2
        INSTR   xmm10, xmm2, xmm0
        INSTR   xmm11, xmm0, xmm1
        INSTR   xmm12, xmm1, xmm2
        INSTR   xmm13, xmm2, xmm0
        INSTR   xmm14, xmm0, xmm1
        INSTR   xmm15, xmm1, xmm2

        INSTR   xmm3,  xmm2, xmm0
        INSTR   xmm4,  xmm0, xmm1
        INSTR   xmm5,  xmm1, xmm2
        INSTR   xmm6,  xmm2, xmm0
        INSTR   xmm7,  xmm0, xmm1
        INSTR   xmm8,  xmm1, xmm2
        INSTR   xmm9,  xmm2, xmm0
        INSTR   xmm10, xmm0, xmm1
        INSTR   xmm11, xmm1, xmm2
        INSTR   xmm12, xmm2, xmm0
        INSTR   xmm13, xmm0, xmm1
        INSTR   xmm14, xmm1, xmm2
        INSTR   xmm15, xmm2, xmm0

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2
        INSTR   xmm14, xmm2, xmm0
        INSTR   xmm15, xmm0, xmm1

        INSTR   xmm3,  xmm1, xmm2
        INSTR   xmm4,  xmm2, xmm0
        INSTR   xmm5,  xmm0, xmm1
        INSTR   xmm6,  xmm1, xmm2
        INSTR   xmm7,  xmm2, xmm0
        INSTR   xmm8,  xmm0, xmm1
        INSTR   xmm9,  xmm1, xmm2
        INSTR   xmm10, xmm2, xmm0
        INSTR   xmm11, xmm0, xmm1
        INSTR   xmm12, xmm1, xmm2
        INSTR   xmm13, xmm2, xmm0
        INSTR   xmm14, xmm0, xmm1
        INSTR   xmm15, xmm1, xmm2

        INSTR   xmm3,  xmm2, xmm0
        INSTR   xmm4,  xmm0, xmm1
        INSTR   xmm5,  xmm1, xmm2
        INSTR   xmm6,  xmm2, xmm0
        INSTR   xmm7,  xmm0, xmm1
        INSTR   xmm8,  xmm1, xmm2
        INSTR   xmm9,  xmm2, xmm0
        INSTR   xmm10, xmm0, xmm1
        INSTR   xmm11, xmm1, xmm2
        INSTR   xmm12, xmm2, xmm0
        INSTR   xmm13, xmm0, xmm1
        INSTR   xmm14, xmm1, xmm2
        INSTR   xmm15, xmm2, xmm0

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2
        INSTR   xmm14, xmm2, xmm0
        INSTR   xmm15, xmm0, xmm1

        INSTR   xmm3,  xmm1, xmm2
        INSTR   xmm4,  xmm2, xmm0
        INSTR   xmm5,  xmm0, xmm1
        INSTR   xmm6,  xmm1, xmm2
        INSTR   xmm7,  xmm2, xmm0
        INSTR   xmm8,  xmm0, xmm1
        INSTR   xmm9,  xmm1, xmm2
        INSTR   xmm10, xmm2, xmm0
        INSTR   xmm11, xmm0, xmm1
        INSTR   xmm12, xmm1, xmm2
        INSTR   xmm13, xmm2, xmm0
        INSTR   xmm14, xmm0, xmm1
        INSTR   xmm15, xmm1, xmm2

        INSTR   xmm3,  xmm2, xmm0
        INSTR   xmm4,  xmm0, xmm1
        INSTR   xmm5,  xmm1, xmm2
        INSTR   xmm6,  xmm2, xmm0
        INSTR   xmm7,  xmm0, xmm1
        INSTR   xmm8,  xmm1, xmm2
        INSTR   xmm9,  xmm2, xmm0
        INSTR   xmm10, xmm0, xmm1
        INSTR   xmm11, xmm1, xmm2
        INSTR   xmm12, xmm2, xmm0
        INSTR   xmm13, xmm0, xmm1
        INSTR   xmm14, xmm1, xmm2
        INSTR   xmm15, xmm2, xmm0

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2

        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,172 +0,0 @@
#define INSTR vxorpd
#define NINST 128
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel: NINST copies of INSTR per loop pass, arranged so that
# every copy reads the register written by the copy just before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1   ping-pong pair carrying the dependency chain
#   xmm2         initialised to 0.5 but not used by the loop
#
# Review note: neither copy in the pair may xor a register with itself.
# That form is a zeroing idiom which cores can treat as independent of
# its inputs; it would cut the chain at every second instruction and
# the measured latency would come out roughly halved.
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i
        # One pair per repetition, NINST >> 1 repetitions (NINST must be
        # even). Both sources differ in each copy, so the chain is real.
        .rept   NINST >> 1
        INSTR   xmm0, xmm1, xmm0            # needs xmm1 from the copy before
        INSTR   xmm1, xmm0, xmm1            # needs xmm0 from the copy before
        .endr
        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vxorpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel (256-bit): NINST copies of INSTR per loop pass, none
# of which consumes the result of another copy.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   ymm0, ymm1, ymm2   read-only inputs (1.0, 2.0, 0.5 in all four lanes)
#   ymm3 .. ymm15      destinations only, never read by the loop
#
# Review notes
#   - The two source operands of every copy are deliberately different
#     registers. An xor of a register with itself is a zeroing idiom
#     that many cores resolve at register rename without using an
#     execution port, so it would not measure execution throughput.
#   - vzeroupper runs before ret so the caller does not inherit dirty
#     upper ymm halves (avoids AVX/SSE transition penalties).
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vinsertf128 ymm0, ymm0, xmm0, 0x1   # copy the low half into the high half
        vmovaps ymm1, ymm0
        vaddpd  ymm1, ymm1, ymm1            # ymm1 <- 2.0
        vdivpd  ymm2, ymm0, ymm1            # ymm2 <- 0.5

.Lloop:
        inc     i

        INSTR   ymm3,  ymm0, ymm1
        INSTR   ymm4,  ymm1, ymm2
        INSTR   ymm5,  ymm2, ymm0
        INSTR   ymm6,  ymm0, ymm1
        INSTR   ymm7,  ymm1, ymm2
        INSTR   ymm8,  ymm2, ymm0
        INSTR   ymm9,  ymm0, ymm1
        INSTR   ymm10, ymm1, ymm2
        INSTR   ymm11, ymm2, ymm0
        INSTR   ymm12, ymm0, ymm1
        INSTR   ymm13, ymm1, ymm2
        INSTR   ymm14, ymm2, ymm0
        INSTR   ymm15, ymm0, ymm1

        INSTR   ymm3,  ymm1, ymm2
        INSTR   ymm4,  ymm2, ymm0
        INSTR   ymm5,  ymm0, ymm1
        INSTR   ymm6,  ymm1, ymm2
        INSTR   ymm7,  ymm2, ymm0
        INSTR   ymm8,  ymm0, ymm1
        INSTR   ymm9,  ymm1, ymm2
        INSTR   ymm10, ymm2, ymm0
        INSTR   ymm11, ymm0, ymm1
        INSTR   ymm12, ymm1, ymm2
        INSTR   ymm13, ymm2, ymm0
        INSTR   ymm14, ymm0, ymm1
        INSTR   ymm15, ymm1, ymm2

        INSTR   ymm3,  ymm2, ymm0
        INSTR   ymm4,  ymm0, ymm1
        INSTR   ymm5,  ymm1, ymm2
        INSTR   ymm6,  ymm2, ymm0
        INSTR   ymm7,  ymm0, ymm1
        INSTR   ymm8,  ymm1, ymm2
        INSTR   ymm9,  ymm2, ymm0
        INSTR   ymm10, ymm0, ymm1
        INSTR   ymm11, ymm1, ymm2
        INSTR   ymm12, ymm2, ymm0
        INSTR   ymm13, ymm0, ymm1
        INSTR   ymm14, ymm1, ymm2
        INSTR   ymm15, ymm2, ymm0

        INSTR   ymm3,  ymm0, ymm1
        INSTR   ymm4,  ymm1, ymm2
        INSTR   ymm5,  ymm2, ymm0
        INSTR   ymm6,  ymm0, ymm1
        INSTR   ymm7,  ymm1, ymm2
        INSTR   ymm8,  ymm2, ymm0
        INSTR   ymm9,  ymm0, ymm1
        INSTR   ymm10, ymm1, ymm2
        INSTR   ymm11, ymm2, ymm0
        INSTR   ymm12, ymm0, ymm1
        INSTR   ymm13, ymm1, ymm2
        INSTR   ymm14, ymm2, ymm0
        INSTR   ymm15, ymm0, ymm1

        INSTR   ymm3,  ymm1, ymm2
        INSTR   ymm4,  ymm2, ymm0
        INSTR   ymm5,  ymm0, ymm1
        INSTR   ymm6,  ymm1, ymm2
        INSTR   ymm7,  ymm2, ymm0
        INSTR   ymm8,  ymm0, ymm1
        INSTR   ymm9,  ymm1, ymm2
        INSTR   ymm10, ymm2, ymm0
        INSTR   ymm11, ymm0, ymm1
        INSTR   ymm12, ymm1, ymm2
        INSTR   ymm13, ymm2, ymm0
        INSTR   ymm14, ymm0, ymm1

        cmp     i, N
        jl      .Lloop
.Ldone:
        vzeroupper                          # leave no dirty upper ymm state behind
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,110 +0,0 @@
#define INSTR vxorpd
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel (256-bit): NINST copies of INSTR per loop pass, arranged
# so that every copy reads the register written by the copy before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   ymm0, ymm1   ping-pong pair carrying the dependency chain
#   ymm2         initialised to 0.5 but not used by the loop
#
# Review notes
#   - Neither copy in the pair may xor a register with itself. That form
#     is a zeroing idiom which cores can treat as independent of its
#     inputs; it would cut the chain at every second instruction and
#     the measured latency would come out roughly halved.
#   - vzeroupper runs before ret so the caller does not inherit dirty
#     upper ymm halves (avoids AVX/SSE transition penalties).
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vinsertf128 ymm0, ymm0, xmm0, 0x1   # copy the low half into the high half
        vmovaps ymm1, ymm0
        vaddpd  ymm1, ymm1, ymm1            # ymm1 <- 2.0
        vdivpd  ymm2, ymm0, ymm1            # ymm2 <- 0.5

.Lloop:
        inc     i
        # One pair per repetition, NINST >> 1 repetitions (NINST must be
        # even). Both sources differ in each copy, so the chain is real.
        .rept   NINST >> 1
        INSTR   ymm0, ymm1, ymm0            # needs ymm1 from the copy before
        INSTR   ymm1, ymm0, ymm1            # needs ymm0 from the copy before
        .endr
        cmp     i, N
        jl      .Lloop
.Ldone:
        vzeroupper                          # leave no dirty upper ymm state behind
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vxorps
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel: NINST copies of INSTR per loop pass, none of which
# consumes the result of another copy.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1, xmm2   read-only inputs; they hold the bit patterns of
#                      the doubles 1.0, 2.0 and 0.5 in each 64-bit lane
#   xmm3 .. xmm15      destinations only, never read by the loop
#
# Review note: the two source operands of every copy are deliberately
# different registers. An xor of a register with itself is a zeroing
# idiom that many cores resolve at register rename without using an
# execution port, so a kernel built from it would not measure the
# execution throughput of INSTR.
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2
        INSTR   xmm14, xmm2, xmm0
        INSTR   xmm15, xmm0, xmm1

        INSTR   xmm3,  xmm1, xmm2
        INSTR   xmm4,  xmm2, xmm0
        INSTR   xmm5,  xmm0, xmm1
        INSTR   xmm6,  xmm1, xmm2
        INSTR   xmm7,  xmm2, xmm0
        INSTR   xmm8,  xmm0, xmm1
        INSTR   xmm9,  xmm1, xmm2
        INSTR   xmm10, xmm2, xmm0
        INSTR   xmm11, xmm0, xmm1
        INSTR   xmm12, xmm1, xmm2
        INSTR   xmm13, xmm2, xmm0
        INSTR   xmm14, xmm0, xmm1
        INSTR   xmm15, xmm1, xmm2

        INSTR   xmm3,  xmm2, xmm0
        INSTR   xmm4,  xmm0, xmm1
        INSTR   xmm5,  xmm1, xmm2
        INSTR   xmm6,  xmm2, xmm0
        INSTR   xmm7,  xmm0, xmm1
        INSTR   xmm8,  xmm1, xmm2
        INSTR   xmm9,  xmm2, xmm0
        INSTR   xmm10, xmm0, xmm1
        INSTR   xmm11, xmm1, xmm2
        INSTR   xmm12, xmm2, xmm0
        INSTR   xmm13, xmm0, xmm1
        INSTR   xmm14, xmm1, xmm2
        INSTR   xmm15, xmm2, xmm0

        INSTR   xmm3,  xmm0, xmm1
        INSTR   xmm4,  xmm1, xmm2
        INSTR   xmm5,  xmm2, xmm0
        INSTR   xmm6,  xmm0, xmm1
        INSTR   xmm7,  xmm1, xmm2
        INSTR   xmm8,  xmm2, xmm0
        INSTR   xmm9,  xmm0, xmm1
        INSTR   xmm10, xmm1, xmm2
        INSTR   xmm11, xmm2, xmm0
        INSTR   xmm12, xmm0, xmm1
        INSTR   xmm13, xmm1, xmm2
        INSTR   xmm14, xmm2, xmm0
        INSTR   xmm15, xmm0, xmm1

        INSTR   xmm3,  xmm1, xmm2
        INSTR   xmm4,  xmm2, xmm0
        INSTR   xmm5,  xmm0, xmm1
        INSTR   xmm6,  xmm1, xmm2
        INSTR   xmm7,  xmm2, xmm0
        INSTR   xmm8,  xmm0, xmm1
        INSTR   xmm9,  xmm1, xmm2
        INSTR   xmm10, xmm2, xmm0
        INSTR   xmm11, xmm0, xmm1
        INSTR   xmm12, xmm1, xmm2
        INSTR   xmm13, xmm2, xmm0
        INSTR   xmm14, xmm0, xmm1

        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,108 +0,0 @@
#define INSTR vxorps
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel: NINST copies of INSTR per loop pass, arranged so that
# every copy reads the register written by the copy just before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside latency
#   xmm0, xmm1   ping-pong pair carrying the dependency chain
#   xmm2         initialised but not used by the loop
#
# Review note: neither copy in the pair may xor a register with itself.
# That form is a zeroing idiom which cores can treat as independent of
# its inputs; it would cut the chain at every second instruction and
# the measured latency would come out roughly halved.
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: run nothing

        # xmm0 <- 1.0 in both lanes. All-ones shifted left by 54 keeps the
        # ten highest bits; shifting right by 2 parks them in bits 61..52,
        # giving 0x3ff0000000000000.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2
        vmovaps xmm1, xmm0
        vaddpd  xmm1, xmm1, xmm1            # xmm1 <- 2.0
        vdivpd  xmm2, xmm0, xmm1            # xmm2 <- 0.5

.Lloop:
        inc     i
        # One pair per repetition, NINST >> 1 repetitions (NINST must be
        # even). Both sources differ in each copy, so the chain is real.
        .rept   NINST >> 1
        INSTR   xmm0, xmm1, xmm0            # needs xmm1 from the copy before
        INSTR   xmm1, xmm0, xmm1            # needs xmm0 from the copy before
        .endr
        cmp     i, N
        jl      .Lloop
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,143 +0,0 @@
#define INSTR xor
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Throughput kernel for the integer form of INSTR: NINST copies per loop
# pass, spread over eight destination registers.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside the loop
#   eax, ebx, ecx          read-only sources
#   edx, r9d .. r15d       destinations (each also read, since INSTR is
#                          a two-operand instruction)
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: skip saves and loop

        # xmm0 <- 0x3ff0000000000000 in both 64-bit lanes (the bit pattern
        # of the double 1.0), built from all-ones by two shifts.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        # Save every general-purpose register the kernel writes. The
        # matching pops sit just before .Ldone, in reverse order.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15

        # Clear them all (32-bit writes zero the full 64-bit register).
        xor     eax, eax
        xor     ebx, ebx
        xor     ecx, ecx
        xor     edx, edx
        xor     r9d, r9d
        xor     r10d, r10d
        xor     r11d, r11d
        xor     r12d, r12d
        xor     r13d, r13d
        xor     r14d, r14d
        xor     r15d, r15d

        # Seed the three source registers with integer arithmetic on the
        # pattern held in xmm0.
        vmovq   rax, xmm0
        vmovq   rbx, xmm0
        add     rbx, rax                    # rbx <- twice the pattern
        div     rax                         # rdx is 0 here, so rax <- 1 and rdx <- 0
        movq    rcx, rax                    # rcx <- 1
        vmovq   rax, xmm0                   # rax <- the pattern again

.Lloop:
        inc     i

        INSTR   edx,  eax
        INSTR   r9d,  ebx
        INSTR   r10d, ecx
        INSTR   r11d, eax
        INSTR   r12d, ebx
        INSTR   r13d, ecx
        INSTR   r14d, eax
        INSTR   r15d, ebx

        INSTR   edx,  ecx
        INSTR   r9d,  eax
        INSTR   r10d, ebx
        INSTR   r11d, ecx
        INSTR   r12d, eax
        INSTR   r13d, ebx
        INSTR   r14d, ecx
        INSTR   r15d, eax

        INSTR   edx,  ebx
        INSTR   r9d,  ecx
        INSTR   r10d, eax
        INSTR   r11d, ebx
        INSTR   r12d, ecx
        INSTR   r13d, eax
        INSTR   r14d, ebx
        INSTR   r15d, ecx

        INSTR   edx,  eax
        INSTR   r9d,  ebx
        INSTR   r10d, ecx
        INSTR   r11d, eax
        INSTR   r12d, ebx
        INSTR   r13d, ecx
        INSTR   r14d, eax
        INSTR   r15d, ebx

        INSTR   edx,  ecx
        INSTR   r9d,  eax
        INSTR   r10d, ebx
        INSTR   r11d, ecx
        INSTR   r12d, eax
        INSTR   r13d, ebx
        INSTR   r14d, ecx
        INSTR   r15d, eax

        INSTR   edx,  ebx
        INSTR   r9d,  ecx
        INSTR   r10d, eax
        INSTR   r11d, ebx
        INSTR   r12d, ecx
        INSTR   r13d, eax
        INSTR   r14d, ebx
        INSTR   r15d, ecx

        INSTR   edx,  eax
        INSTR   r9d,  ebx
        INSTR   r10d, ecx
        INSTR   r11d, eax
        INSTR   r12d, ebx
        INSTR   r13d, ecx
        INSTR   r14d, eax
        INSTR   r15d, ebx

        INSTR   edx,  ecx
        INSTR   r9d,  eax
        INSTR   r10d, ebx
        INSTR   r11d, ecx
        INSTR   r12d, eax
        INSTR   r13d, ebx
        INSTR   r14d, ecx
        INSTR   r15d, eax

        cmp     i, N
        jl      .Lloop

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency

View File

@@ -1,143 +0,0 @@
#define INSTR xor
#define NINST 64
#define N edi
#define i r8d
.intel_syntax noprefix

# -----------------------------------------------------------------------
# Latency kernel for the integer form of INSTR: NINST copies per loop
# pass, each one reading the register written by the copy before it.
#
# Exported symbols
#   ninst    32-bit word holding NINST
#   latency  entry point; the trip count is read from edi and the pass
#            counter is kept in r8d
#
# Register roles inside the loop
#   eax, ebx   ping-pong pair carrying the dependency chain
# -----------------------------------------------------------------------
        .data
        .globl  ninst
ninst:
        .long   NINST

        .align  32
# Eight copies of the 64-bit pattern 0x400921f9f01b866e (about 3.14159
# when read as a double). Nothing in this file references it.
PI:
        .rept   8
        .long   0xf01b866e, 0x400921f9
        .endr

        .text
        .globl  latency
        .type   latency, @function
        .align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                        # pass counter starts at zero
        test    N, N
        jle     .Ldone                      # trip count <= 0: skip saves and loop

        # xmm0 <- 0x3ff0000000000000 in both 64-bit lanes (the bit pattern
        # of the double 1.0), built from all-ones by two shifts.
        vpcmpeqw xmm0, xmm0, xmm0
        vpsllq  xmm0, xmm0, 54
        vpsrlq  xmm0, xmm0, 2

        # Save the general-purpose registers touched during setup. The
        # matching pops sit just before .Ldone, in reverse order.
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15

        # Clear them all (32-bit writes zero the full 64-bit register).
        xor     eax, eax
        xor     ebx, ebx
        xor     ecx, ecx
        xor     edx, edx
        xor     r9d, r9d
        xor     r10d, r10d
        xor     r11d, r11d
        xor     r12d, r12d
        xor     r13d, r13d
        xor     r14d, r14d
        xor     r15d, r15d

        # Seed rax, rbx and rcx with integer arithmetic on the pattern
        # held in xmm0.
        vmovq   rax, xmm0
        vmovq   rbx, xmm0
        add     rbx, rax                    # rbx <- twice the pattern
        div     rax                         # rdx is 0 here, so rax <- 1 and rdx <- 0
        movq    rcx, rax                    # rcx <- 1
        vmovq   rax, xmm0                   # rax <- the pattern again

.Lloop:
        inc     i
        # The unrolled body is the same two-instruction pair repeated, so
        # the assembler emits it NINST >> 1 times (NINST must be even).
        .rept   NINST >> 1
        INSTR   eax, ebx                    # reads ebx from the copy before
        INSTR   ebx, eax                    # reads eax from the copy before
        .endr
        cmp     i, N
        jl      .Lloop

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
.Ldone:
        mov     rsp, rbp
        pop     rbp
        ret
        .size   latency, .-latency