/*
 * Micro-benchmark kernel for the AVX instruction VINSERTF128.
 *
 * Syntax : GNU as, Intel syntax (noprefix), passed through the C preprocessor.
 * Exports:
 *   ninst   - 32-bit value holding NINST, the number of INSTR instances
 *             executed per loop iteration.
 *   latency - runs the unrolled INSTR block N times.
 *
 * latency:
 *   In    : edi = N, signed iteration count (edi is the first integer
 *           argument register in the System V AMD64 convention; the C view
 *           is presumably "void latency(int N)" - confirm against the caller).
 *           N <= 0 returns immediately.
 *   Out   : nothing.
 *   Clobb : r8d, ymm0-ymm15, flags.
 *   Stack : saves/restores rbp only; rsp is not otherwise modified.
 *
 * Every INSTR in the loop writes its own destination (ymm3..ymm15) and reads
 * only ymm0 / xmm1, which the loop never writes, so no instruction in the
 * loop consumes the result of another one.
 */

#define INSTR   vinsertf128
#define NINST   64              /* must equal the number of INSTR lines in the loop */
#define N       edi
#define i       r8d

.intel_syntax noprefix

.globl  ninst
.data
ninst:
        .long   NINST

/*
 * Eight copies of the 64-bit pattern 0x400921f9f01b866e (low dword first),
 * about 3.14159 when read as an IEEE-754 double. Not referenced by the code
 * in this file; kept so the data layout stays unchanged.
 */
        .align  32
PI:
        .long   0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
        .long   0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
        .long   0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9
        .long   0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9

.text
.globl  latency
.type   latency, @function
.align  32
latency:
        push    rbp
        mov     rbp, rsp
        xor     i, i                            # loop counter = 0
        test    N, N
        jle     .Ldone                          # nothing to do for N <= 0 (signed)

        /* Build DP 1.0 (0x3FF0000000000000) in each qword without a memory load. */
        vpcmpeqw xmm0, xmm0, xmm0               # all bits set
        vpsllq  xmm0, xmm0, 54                  # keep ten one bits on top (54 = 64 - 10): 0xFFC0...
        vpsrlq  xmm0, xmm0, 2                   # clear sign and top exponent bit: 0x3FF0...
        vinsertf128 ymm0, ymm0, xmm0, 0x1       # copy low lane into high lane: ymm0 = 4 x 1.0

        vmovaps ymm1, ymm0
        vaddpd  ymm1, ymm1, ymm1                # ymm1 = 4 x 2.0
        vdivpd  ymm2, ymm0, ymm1                # ymm2 = 4 x 0.5 (not read by the loop)

        /*
         * Unrolled body: NINST instances of INSTR.
         * Operand form: dest ymm, source ymm, source xmm, imm8 lane selector.
         */
.Lloop:
        inc     i

        INSTR   ymm3,  ymm0, xmm1, 0x1
        INSTR   ymm4,  ymm0, xmm1, 0x1
        INSTR   ymm5,  ymm0, xmm1, 0x1
        INSTR   ymm6,  ymm0, xmm1, 0x1
        INSTR   ymm7,  ymm0, xmm1, 0x1
        INSTR   ymm8,  ymm0, xmm1, 0x1
        INSTR   ymm9,  ymm0, xmm1, 0x1
        INSTR   ymm10, ymm0, xmm1, 0x1
        INSTR   ymm11, ymm0, xmm1, 0x1
        INSTR   ymm12, ymm0, xmm1, 0x1
        INSTR   ymm13, ymm0, xmm1, 0x1
        INSTR   ymm14, ymm0, xmm1, 0x1
        INSTR   ymm15, ymm0, xmm1, 0x1

        INSTR   ymm3,  ymm0, xmm1, 0x1
        INSTR   ymm4,  ymm0, xmm1, 0x1
        INSTR   ymm5,  ymm0, xmm1, 0x1
        INSTR   ymm6,  ymm0, xmm1, 0x1
        INSTR   ymm7,  ymm0, xmm1, 0x1
        INSTR   ymm8,  ymm0, xmm1, 0x1
        INSTR   ymm9,  ymm0, xmm1, 0x1
        INSTR   ymm10, ymm0, xmm1, 0x1
        INSTR   ymm11, ymm0, xmm1, 0x1
        INSTR   ymm12, ymm0, xmm1, 0x1
        INSTR   ymm13, ymm0, xmm1, 0x1
        INSTR   ymm14, ymm0, xmm1, 0x1
        INSTR   ymm15, ymm0, xmm1, 0x1

        INSTR   ymm3,  ymm0, xmm1, 0x1
        INSTR   ymm4,  ymm0, xmm1, 0x1
        INSTR   ymm5,  ymm0, xmm1, 0x1
        INSTR   ymm6,  ymm0, xmm1, 0x1
        INSTR   ymm7,  ymm0, xmm1, 0x1
        INSTR   ymm8,  ymm0, xmm1, 0x1
        INSTR   ymm9,  ymm0, xmm1, 0x1
        INSTR   ymm10, ymm0, xmm1, 0x1
        INSTR   ymm11, ymm0, xmm1, 0x1
        INSTR   ymm12, ymm0, xmm1, 0x1
        INSTR   ymm13, ymm0, xmm1, 0x1
        INSTR   ymm14, ymm0, xmm1, 0x1
        INSTR   ymm15, ymm0, xmm1, 0x1

        INSTR   ymm3,  ymm0, xmm1, 0x1
        INSTR   ymm4,  ymm0, xmm1, 0x1
        INSTR   ymm5,  ymm0, xmm1, 0x1
        INSTR   ymm6,  ymm0, xmm1, 0x1
        INSTR   ymm7,  ymm0, xmm1, 0x1
        INSTR   ymm8,  ymm0, xmm1, 0x1
        INSTR   ymm9,  ymm0, xmm1, 0x1
        INSTR   ymm10, ymm0, xmm1, 0x1
        INSTR   ymm11, ymm0, xmm1, 0x1
        INSTR   ymm12, ymm0, xmm1, 0x1
        INSTR   ymm13, ymm0, xmm1, 0x1
        INSTR   ymm14, ymm0, xmm1, 0x1
        INSTR   ymm15, ymm0, xmm1, 0x1

        INSTR   ymm3,  ymm0, xmm1, 0x1
        INSTR   ymm4,  ymm0, xmm1, 0x1
        INSTR   ymm5,  ymm0, xmm1, 0x1
        INSTR   ymm6,  ymm0, xmm1, 0x1
        INSTR   ymm7,  ymm0, xmm1, 0x1
        INSTR   ymm8,  ymm0, xmm1, 0x1
        INSTR   ymm9,  ymm0, xmm1, 0x1
        INSTR   ymm10, ymm0, xmm1, 0x1
        INSTR   ymm11, ymm0, xmm1, 0x1
        INSTR   ymm12, ymm0, xmm1, 0x1
        INSTR   ymm13, ymm0, xmm1, 0x1
        INSTR   ymm14, ymm0, xmm1, 0x1

        cmp     i, N
        jl      .Lloop                          # signed compare, matches the jle guard above

.Ldone:
        vzeroupper                              # clear upper YMM halves before returning to the caller
        pop     rbp                             # rsp is unchanged since the prologue
        ret
.size latency, .-latency