# Patch notes (commentary placed ahead of the "diff --git" header; not part of any hunk).
#
# Purpose: in the `latency:` section of src/AVX-512/vfmadd213sd-zzz.S, switch the
# measured operands from xmm0/xmm1 to zmm0/zmm1.
#   * After the existing shift sequence has built the constant in xmm0, vinsertf128
#     (imm 0x1) copies xmm0 into the upper 128 bits of ymm0, and vinsertf64x4
#     (imm 0x1) copies ymm0 into the upper 256 bits of zmm0, so every 64-bit lane
#     of zmm0 holds the same constant.
#   * The copy into the second register becomes a full-width vmovaps zmm1, zmm0.
#   * Two self-moves (vmovaps zmmN, zmmN) are added; the patch's own comment gives
#     their purpose as marking the registers as AVX-512.
#   * All six INSTR uses inside `loop:` change from xmm0, xmm1, xmm1 to
#     zmm0, zmm1, zmm1.
#
# NOTE(review): vpcmpeqw / vpsllq 54 / vpsrlq 2 leaves 0x3FF0000000000000 in each
# 64-bit lane, i.e. double-precision 1.0, so the context comment "copy SP 1.0"
# looks like a leftover from a single-precision variant. It is an unchanged
# context line and is kept verbatim here so the hunk still matches — TODO fix
# in a separate change.
# NOTE(review): the definition of INSTR is not visible in this hunk. If it expands
# to vfmadd213sd (as the file name suggests), note that the scalar form is
# documented with xmm register operands only — confirm this assembles with zmm.
# NOTE(review): the source of this patch had its line breaks and indentation
# collapsed; the indentation below is reconstructed (labels at column 0,
# instructions indented four spaces) — apply with whitespace-insensitive matching
# or confirm against the target file.
diff --git a/src/AVX-512/vfmadd213sd-zzz.S b/src/AVX-512/vfmadd213sd-zzz.S
index eecc98a..9fc742d 100644
--- a/src/AVX-512/vfmadd213sd-zzz.S
+++ b/src/AVX-512/vfmadd213sd-zzz.S
@@ -23,17 +23,25 @@ latency:
     vpcmpeqw xmm0, xmm0, xmm0 # all ones
     vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1))
     vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
+    # expand from SSE to AVX
+    vinsertf128 ymm0, ymm0, xmm0, 0x1
+    # expand from AVX to AVX-512
+    vinsertf64x4 zmm0, zmm0, ymm0, 0x1
     # copy SP 1.0
-    vmovaps xmm1, xmm0
+    vmovaps zmm1, zmm0
+
+    # Mark registers AVX-512
+    vmovaps zmm0, zmm0
+    vmovaps zmm1, zmm1
 loop:
     inc i
-    INSTR xmm0, xmm1, xmm1
-    INSTR xmm0, xmm1, xmm1
-    INSTR xmm0, xmm1, xmm1
+    INSTR zmm0, zmm1, zmm1
+    INSTR zmm0, zmm1, zmm1
+    INSTR zmm0, zmm1, zmm1
     cmp i, N
-    INSTR xmm0, xmm1, xmm1
-    INSTR xmm0, xmm1, xmm1
-    INSTR xmm0, xmm1, xmm1
+    INSTR zmm0, zmm1, zmm1
+    INSTR zmm0, zmm1, zmm1
+    INSTR zmm0, zmm1, zmm1
     jl loop
 done:
     mov rsp, rbp