diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..eca4ca2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,34 @@
+COMPILER=ICC
+
+TARGET = ibench
+SRC_DIR = src
+# One output directory per sub-directory of $(SRC_DIR). The trailing "/."
+# makes the glob match directories only, so a stray regular file in
+# $(SRC_DIR) can never end up in the mkdir / rm -rf lists below.
+KDIRS := $(patsubst $(SRC_DIR)/%/.,%,$(wildcard $(SRC_DIR)/*/.))
+Q = @
+
+include include_$(COMPILER).mk
+
+# ibench loads the kernels with dlopen() at run time, so the kernels and
+# their directories are order-only prerequisites: they are brought up to
+# date first but never force a recompile of the driver.
+$(TARGET): ibench.c | $(KDIRS) $(KERNELS)
+	$(Q)echo "===> COMPILING $@"
+	$(Q)$(CC) $(CFLAGS) $< -o $@ -ldl
+
+$(KDIRS):
+	$(Q)mkdir -p $@
+
+# Static pattern rule: AVX/foo.so is assembled from $(SRC_DIR)/AVX/foo.S
+# and is rebuilt whenever that source changes.
+$(KERNELS): %.so: $(SRC_DIR)/%.S | $(KDIRS)
+	$(Q)echo "===> ASSEMBLING $@"
+	$(Q)$(AS) $(LFLAGS) $< -o $@
+
+.PHONY: clean
+
+clean:
+	$(Q)echo "===> CLEAN"
+	$(Q)rm -rf $(KDIRS)
+	$(Q)rm -f $(TARGET)
diff --git a/ibench.c b/ibench.c
new file mode 100644
index 0000000..962fea1
--- /dev/null
+++ b/ibench.c
@@ -0,0 +1,140 @@
+#include <dirent.h>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+// entry point and instruction count of the kernel that is currently loaded
+double (*latency)(int);
+int *ninst;
+
+/*
+ * Time one kernel and print its cost in clock cycles per instruction.
+ *
+ * N      - loop count handed to the kernel's latency() entry point
+ * freq   - CPU frequency in GHz, used to convert wall time into cycles
+ * sofile - file name of the shared object; strtok() cuts it at the first
+ *          '.' in place so that only the instruction name is printed
+ *
+ * Uses the globals latency and ninst, which main() points into the kernel.
+ */
+void benchmark(const int N, float freq, char *sofile) {
+    struct timeval start, end;
+    double benchtime;
+    char *instr = strtok(sofile, ".");
+
+    double result;
+
+    // strtok() returns NULL when the name contains nothing but dots
+    if (instr == NULL)
+        instr = sofile;
+
+    // a kernel that reports no instructions would cause a division by zero
+    if (*ninst <= 0) {
+        fprintf(stderr, "%s: invalid instruction count %d, skipping\n", instr, *ninst);
+        return;
+    }
+
+    // run benchmark
+    gettimeofday(&start, NULL);
+    result = (*latency)(N);
+    gettimeofday(&end, NULL);
+
+    // elapsed wall time in usec, computed in double so that the seconds
+    // part cannot overflow an integer type
+    benchtime = (double)(end.tv_sec - start.tv_sec) * 1e6 + (double)(end.tv_usec - start.tv_usec);
+    // divide by 1e6 (usec -> s), ninst (number of instr per loop),
+    // N/1e9 (loop count vs. GHz); multiply by frequency
+    benchtime = benchtime / (1e6 * *ninst / freq * (N / 1e9));
+    printf("%s:%s\t%.3f (clock cycles)\t[DEBUG - result: %f]\n", instr, strlen(instr) + 1 < 8 ? "\t" : "", benchtime, result);
+}
+
+/*
+ * Usage: ibench <directory with kernel .so files> <CPU frequency in GHz>
+ *
+ * Loads every *.so in the directory, looks up its latency and ninst
+ * symbols and benchmarks it. Exits with EXIT_FAILURE on any error.
+ */
+int main(int argc, const char *argv[]) {
+    // one million runs
+    const int N = 1000000;
+    float freq = 0.0f;
+
+    // need a target directory containing benchmarks
+    if (argc < 2) {
+        printf("please specify a directory containing the shared objects with benchmarks to run\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // did the command line specify a frequency?
+    if (argc < 3) {
+        printf("Please specify the CPU frequency in GHz. For best results make "
+               "sure the frequency is fixed, otherwise SpeedStep/Turbo Boost "
+               "might distort the results.\n");
+        exit(EXIT_FAILURE);
+    }
+    freq = atof(argv[2]);
+    // atof() yields 0 for non-numeric input and a non-positive frequency
+    // would make every reported cycle count meaningless
+    if (!(freq > 0.0f)) {
+        fprintf(stderr, "invalid CPU frequency '%s': expected a positive value in GHz\n", argv[2]);
+        exit(EXIT_FAILURE);
+    }
+    printf("Using frequency %.2fGHz.\n", freq);
+
+    // perform benchmark for all shared objects in target directory
+    DIR *dirp;
+    struct dirent *dp;
+    if ((dirp = opendir(argv[1])) == NULL) {
+        perror("opendir");
+        exit(EXIT_FAILURE);
+    }
+    while ((dp = readdir(dirp)) != NULL) {
+        // only try .so files; names shorter than the suffix (e.g. "." and
+        // "..") are rejected before their tail is inspected
+        const char *suffix = ".so";
+        size_t lensuffix = strlen(suffix);
+        size_t len2 = strlen(dp->d_name);
+        if (len2 < lensuffix || strcmp(dp->d_name + len2 - lensuffix, suffix) != 0)
+            continue;
+
+        // load .so
+        void *handle;
+        size_t len1 = strlen(argv[1]);
+        // directory might be missing a trailing '/'
+        char *relpath;
+        if ((relpath = malloc(len1 + len2 + 2)) == NULL) {
+            perror("malloc");
+            exit(EXIT_FAILURE);
+        }
+        snprintf(relpath, len1 + len2 + 2, "%s/%s", argv[1], dp->d_name);
+        if ((handle = dlopen(relpath, RTLD_LAZY)) == NULL) {
+            fprintf(stderr, "dlopen: failed to open %s: %s\n", relpath,
+                    dlerror());
+            exit(EXIT_FAILURE);
+        }
+        if ((latency = (double (*)(int))dlsym(handle, "latency")) == NULL) {
+            fprintf(stderr, "dlsym: couldn't find function latency in %s: %s\n",
+                    relpath, dlerror());
+            return (EXIT_FAILURE);
+        }
+        if ((ninst = (int *)dlsym(handle, "ninst")) == NULL) {
+            fprintf(stderr, "dlsym: couldn't find symbol ninst in %s: %s\n",
+                    relpath, dlerror());
+            return (EXIT_FAILURE);
+        }
+        free(relpath);
+
+        // do actual benchmark
+        benchmark(N, freq, dp->d_name);
+
+        dlclose(handle);
+    }
+    closedir(dirp);
+
+    return 0;
+}
diff --git a/include_GCC.mk b/include_GCC.mk
new file mode 100644
index 0000000..06369ac
--- /dev/null
+++ b/include_GCC.mk
@@ -0,0 +1,10 @@
+CC = gcc
+AS = gcc
+# CFLAGS compiles ibench.c, so it must not force "-x assembler-with-cpp";
+CFLAGS = -O3
+LFLAGS = -shared
+
+KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/gp/*.S))
+KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/sse/*.S))
+KERNELS +=
$(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx/*.S)) +KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/avx2/*.S)) diff --git a/include_ICC.mk b/include_ICC.mk new file mode 100644 index 0000000..47b702e --- /dev/null +++ b/include_ICC.mk @@ -0,0 +1,9 @@ +CC = icc +AS = icc +CFLAGS = -O3 +LFLAGS = -shared + +KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/gp/*.S)) +KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/sse/*.S)) +KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX/*.S)) +KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX-512/*.S)) diff --git a/include_MIC.mk b/include_MIC.mk new file mode 100644 index 0000000..78ced83 --- /dev/null +++ b/include_MIC.mk @@ -0,0 +1,7 @@ +CC = icc +AS = icc +CFLAGS = -O3 -mmic +LFLAGS = -shared -mmic + +KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/gp/*.S)) +KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/imci/*.S)) diff --git a/include_POWER8.mk b/include_POWER8.mk new file mode 100644 index 0000000..f97084a --- /dev/null +++ b/include_POWER8.mk @@ -0,0 +1,6 @@ +CC = xlc +AS = xlc +CFLAGS = -O3 +LFLAGS = -shared + +KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/vsx/*.S)) diff --git a/src/AVX-512/vaddpd-avx512-TP.S b/src/AVX-512/vaddpd-avx512-TP.S new file mode 100644 index 0000000..8bd147d --- /dev/null +++ b/src/AVX-512/vaddpd-avx512-TP.S @@ -0,0 +1,46 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to 
AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy DP 1.0 (NOTE(review): zmm2 is read in the loop below but never written in this routine - confirm this is intended) + vmovapd zmm1, zmm0 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vaddpd-avx512.S b/src/AVX-512/vaddpd-avx512.S new file mode 100644 index 0000000..0b0d32d --- /dev/null +++ b/src/AVX-512/vaddpd-avx512.S @@ -0,0 +1,46 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy DP 1.0 + vmovapd zmm1, zmm0 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vaddps-avx512-TP.S b/src/AVX-512/vaddps-avx512-TP.S new file mode 100644 index 0000000..1a45e5d --- /dev/null +++ b/src/AVX-512/vaddps-avx512-TP.S @@ -0,0 +1,46 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, 
xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy SP 1.0 (NOTE(review): zmm2 is read in the loop below but never written in this routine - confirm this is intended) + vmovaps zmm1, zmm0 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vaddps-avx512.S b/src/AVX-512/vaddps-avx512.S new file mode 100644 index 0000000..8d00ce6 --- /dev/null +++ b/src/AVX-512/vaddps-avx512.S @@ -0,0 +1,46 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy SP 1.0 + vmovaps zmm1, zmm0 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vdivpd-avx512-TP.S b/src/AVX-512/vdivpd-avx512-TP.S new file mode 100644 index 0000000..884c97a --- /dev/null +++ b/src/AVX-512/vdivpd-avx512-TP.S @@ -0,0 +1,59 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst 
+.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddpd zmm1, zmm0, zmm0 # create 2.0 + vaddpd zmm2, zmm0, zmm1 # create 3.0 + vaddpd zmm4, zmm1, zmm1 # create 4.0 + vaddpd zmm4, zmm4, zmm4 # create 8.0 + vaddpd zmm4, zmm4, zmm4 # create 16.0 + vaddpd zmm4, zmm4, zmm4 # create 32.0 + vaddpd zmm4, zmm4, zmm4 # create 64.0 + vaddpd zmm4, zmm4, zmm4 # create 128.0 + vaddpd zmm4, zmm4, zmm4 # create 256.0 + vaddpd zmm4, zmm4, zmm4 # create 512.0 + vaddpd zmm4, zmm4, zmm4 # create 1024.0 + vdivpd zmm1, zmm4, zmm2 # create 341.3333 + vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 + vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vdivpd-avx512.S b/src/AVX-512/vdivpd-avx512.S new file mode 100644 index 0000000..687d22f --- /dev/null +++ b/src/AVX-512/vdivpd-avx512.S @@ -0,0 +1,59 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; 
leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddpd zmm1, zmm0, zmm0 # create 2.0 + vaddpd zmm2, zmm0, zmm1 # create 3.0 + vaddpd zmm4, zmm1, zmm1 # create 4.0 + vaddpd zmm4, zmm4, zmm4 # create 8.0 + vaddpd zmm4, zmm4, zmm4 # create 16.0 + vaddpd zmm4, zmm4, zmm4 # create 32.0 + vaddpd zmm4, zmm4, zmm4 # create 64.0 + vaddpd zmm4, zmm4, zmm4 # create 128.0 + vaddpd zmm4, zmm4, zmm4 # create 256.0 + vaddpd zmm4, zmm4, zmm4 # create 512.0 + vaddpd zmm4, zmm4, zmm4 # create 1024.0 + vdivpd zmm1, zmm4, zmm2 # create 341.3333 + vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 + vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vdivps-avx512-TP.S b/src/AVX-512/vdivps-avx512-TP.S new file mode 100644 index 0000000..a111740 --- /dev/null +++ b/src/AVX-512/vdivps-avx512-TP.S @@ -0,0 +1,59 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddps zmm1, zmm0, zmm0 # create 2.0 + vaddps zmm2, zmm0, zmm1 # create 3.0 + vaddps zmm4, zmm1, zmm1 # create 4.0 + vaddps zmm4, zmm4, zmm4 # create 8.0 + vaddps zmm4, zmm4, zmm4 # 
create 16.0 + vaddps zmm4, zmm4, zmm4 # create 32.0 + vaddps zmm4, zmm4, zmm4 # create 64.0 + vaddps zmm4, zmm4, zmm4 # create 128.0 + vaddps zmm4, zmm4, zmm4 # create 256.0 + vaddps zmm4, zmm4, zmm4 # create 512.0 + vaddps zmm4, zmm4, zmm4 # create 1024.0 + vdivps zmm1, zmm4, zmm2 # create 341.3333 + vdivps zmm2, zmm0, zmm1 # create 1/341.3333 + vaddps zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm2 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vdivps-avx512.S b/src/AVX-512/vdivps-avx512.S new file mode 100644 index 0000000..16d7fa4 --- /dev/null +++ b/src/AVX-512/vdivps-avx512.S @@ -0,0 +1,59 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddps zmm1, zmm0, zmm0 # create 2.0 + vaddps zmm2, zmm0, zmm1 # create 3.0 + vaddps zmm4, zmm1, zmm1 # create 4.0 + vaddps zmm4, zmm4, zmm4 # create 8.0 + vaddps zmm4, zmm4, zmm4 # create 16.0 + vaddps zmm4, zmm4, zmm4 # create 32.0 + vaddps zmm4, zmm4, zmm4 # create 64.0 + vaddps zmm4, zmm4, zmm4 # create 128.0 + vaddps zmm4, zmm4, zmm4 # create 256.0 + vaddps zmm4, zmm4, zmm4 # create 512.0 + vaddps zmm4, zmm4, zmm4 # create 1024.0 + vdivps zmm1, zmm4, zmm2 # create 341.3333 + vdivps zmm2, zmm0, zmm1 # create 1/341.3333 + vaddps 
zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vfmadd213pd-avx512-TP.S b/src/AVX-512/vfmadd213pd-avx512-TP.S new file mode 100644 index 0000000..7dfa7eb --- /dev/null +++ b/src/AVX-512/vfmadd213pd-avx512-TP.S @@ -0,0 +1,53 @@ +#define INSTR vfmadd213pd +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy DP 1.0 (NOTE(review): zmm2 is read in the loop below but never written in this routine - confirm this is intended) + vmovapd zmm1, zmm0 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + INSTR zmm9, zmm2, zmm1 + cmp i, N + INSTR zmm10, zmm2, zmm1 + INSTR zmm11, zmm2, zmm1 + INSTR zmm12, zmm2, zmm1 + INSTR zmm13, zmm2, zmm1 + INSTR zmm14, zmm2, zmm1 + INSTR zmm15, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vfmadd213pd-avx512.S b/src/AVX-512/vfmadd213pd-avx512.S new file mode 100644 index 0000000..657abed --- /dev/null +++ b/src/AVX-512/vfmadd213pd-avx512.S @@ -0,0 +1,46 @@ +#define INSTR vfmadd213pd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy DP 1.0 + vmovapd zmm1, zmm0 +loop: + inc i + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + cmp i, N + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vfmadd213ps-avx512-TP.S b/src/AVX-512/vfmadd213ps-avx512-TP.S new file mode 100644 index 0000000..865c03a --- /dev/null +++ b/src/AVX-512/vfmadd213ps-avx512-TP.S @@ -0,0 +1,53 @@ +#define INSTR vfmadd213ps +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy SP 1.0 (NOTE(review): zmm2 is read in the loop below but never written in this routine - confirm this is intended) + vmovaps zmm1, zmm0 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + INSTR zmm9, zmm2, zmm1 + cmp i, N + INSTR zmm10, zmm2, zmm1 + INSTR zmm11, zmm2, zmm1 + INSTR zmm12, zmm2, zmm1 + INSTR zmm13, zmm2, zmm1 + INSTR zmm14, zmm2, zmm1 + INSTR zmm15, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + 
ret +.size latency, .-latency diff --git a/src/AVX-512/vfmadd213ps-avx512.S b/src/AVX-512/vfmadd213ps-avx512.S new file mode 100644 index 0000000..cc9430b --- /dev/null +++ b/src/AVX-512/vfmadd213ps-avx512.S @@ -0,0 +1,46 @@ +#define INSTR vfmadd213ps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # copy SP 1.0 + vmovaps zmm1, zmm0 +loop: + inc i + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + cmp i, N + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + INSTR zmm0, zmm1, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vmulpd-avx512-TP.S b/src/AVX-512/vmulpd-avx512-TP.S new file mode 100644 index 0000000..d8c6cf6 --- /dev/null +++ b/src/AVX-512/vmulpd-avx512-TP.S @@ -0,0 +1,48 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # create AVX-512 DP 2.0 + 
vaddpd zmm1, zmm0, zmm0 + # create AVX-512 DP 0.5 + vdivpd zmm2, zmm0, zmm1 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vmulpd-avx512.S b/src/AVX-512/vmulpd-avx512.S new file mode 100644 index 0000000..fb27bc9 --- /dev/null +++ b/src/AVX-512/vmulpd-avx512.S @@ -0,0 +1,48 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # create AVX-512 DP 2.0 + vaddpd zmm1, zmm0, zmm0 + # create AVX-512 DP 0.5 + vdivpd zmm2, zmm0, zmm1 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vmulps-avx512-TP.S b/src/AVX-512/vmulps-avx512-TP.S new file mode 100644 index 0000000..5bb8008 --- /dev/null +++ b/src/AVX-512/vmulps-avx512-TP.S @@ -0,0 +1,48 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # 
all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # create AVX-512 SP 2.0 + vaddps zmm1, zmm0, zmm0 + # create AVX-512 SP 0.5 + vdivps zmm2, zmm0, zmm1 +loop: + inc i + INSTR zmm3, zmm0, zmm1 + INSTR zmm4, zmm1, zmm0 + INSTR zmm5, zmm0, zmm2 + cmp i, N + INSTR zmm6, zmm2, zmm0 + INSTR zmm7, zmm1, zmm2 + INSTR zmm8, zmm2, zmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vmulps-avx512.S b/src/AVX-512/vmulps-avx512.S new file mode 100644 index 0000000..a0d5432 --- /dev/null +++ b/src/AVX-512/vmulps-avx512.S @@ -0,0 +1,48 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + # create AVX-512 SP 2.0 + vaddps zmm1, zmm0, zmm0 + # create AVX-512 SP 0.5 + vdivps zmm2, zmm0, zmm1 +loop: + inc i + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + cmp i, N + INSTR zmm0, zmm0, zmm2 + INSTR zmm0, zmm0, zmm1 + INSTR zmm0, zmm0, zmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vrcp14pd-avx512-TP.S b/src/AVX-512/vrcp14pd-avx512-TP.S new file mode 100644 index 0000000..8e54dcc --- /dev/null +++ 
b/src/AVX-512/vrcp14pd-avx512-TP.S @@ -0,0 +1,63 @@ +#define INSTR vrcp14pd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddpd zmm1, zmm0, zmm0 # create 2.0 + vaddpd zmm2, zmm0, zmm1 # create 3.0 + vaddpd zmm4, zmm1, zmm1 # create 4.0 + vaddpd zmm4, zmm4, zmm4 # create 8.0 + vaddpd zmm4, zmm4, zmm4 # create 16.0 + vaddpd zmm4, zmm4, zmm4 # create 32.0 + vaddpd zmm4, zmm4, zmm4 # create 64.0 + vaddpd zmm4, zmm4, zmm4 # create 128.0 + vaddpd zmm4, zmm4, zmm4 # create 256.0 + vaddpd zmm4, zmm4, zmm4 # create 512.0 + vaddpd zmm4, zmm4, zmm4 # create 1024.0 + vdivpd zmm1, zmm4, zmm2 # create 341.3333 + vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 + vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 + vmovapd zmm1, zmm0 + vmovapd zmm2, zmm0 + vmovapd zmm3, zmm0 + vmovapd zmm4, zmm0 + vmovapd zmm5, zmm0 +loop: + inc i + INSTR zmm10, zmm0 + INSTR zmm11, zmm1 + INSTR zmm12, zmm2 + cmp i, N + INSTR zmm13, zmm3 + INSTR zmm14, zmm4 + INSTR zmm15, zmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vrcp14pd-avx512.S b/src/AVX-512/vrcp14pd-avx512.S new file mode 100644 index 0000000..2245c90 --- /dev/null +++ b/src/AVX-512/vrcp14pd-avx512.S @@ -0,0 +1,58 @@ +#define INSTR vrcp14pd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov 
rbp, rsp + xor i, i + test N, N + jle done + # create SSE DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddpd zmm1, zmm0, zmm0 # create 2.0 + vaddpd zmm2, zmm0, zmm1 # create 3.0 + vaddpd zmm4, zmm1, zmm1 # create 4.0 + vaddpd zmm4, zmm4, zmm4 # create 8.0 + vaddpd zmm4, zmm4, zmm4 # create 16.0 + vaddpd zmm4, zmm4, zmm4 # create 32.0 + vaddpd zmm4, zmm4, zmm4 # create 64.0 + vaddpd zmm4, zmm4, zmm4 # create 128.0 + vaddpd zmm4, zmm4, zmm4 # create 256.0 + vaddpd zmm4, zmm4, zmm4 # create 512.0 + vaddpd zmm4, zmm4, zmm4 # create 1024.0 + vdivpd zmm1, zmm4, zmm2 # create 341.3333 + vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 + vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm1, zmm0 + INSTR zmm2, zmm1 + INSTR zmm3, zmm2 + cmp i, N + INSTR zmm4, zmm3 + INSTR zmm5, zmm4 + INSTR zmm0, zmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vrcp14ps-avx512-TP.S b/src/AVX-512/vrcp14ps-avx512-TP.S new file mode 100644 index 0000000..505f41e --- /dev/null +++ b/src/AVX-512/vrcp14ps-avx512-TP.S @@ -0,0 +1,63 @@ +#define INSTR vrcp14ps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + 
vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddps zmm1, zmm0, zmm0 # create 2.0 + vaddps zmm2, zmm0, zmm1 # create 3.0 + vaddps zmm4, zmm1, zmm1 # create 4.0 + vaddps zmm4, zmm4, zmm4 # create 8.0 + vaddps zmm4, zmm4, zmm4 # create 16.0 + vaddps zmm4, zmm4, zmm4 # create 32.0 + vaddps zmm4, zmm4, zmm4 # create 64.0 + vaddps zmm4, zmm4, zmm4 # create 128.0 + vaddps zmm4, zmm4, zmm4 # create 256.0 + vaddps zmm4, zmm4, zmm4 # create 512.0 + vaddps zmm4, zmm4, zmm4 # create 1024.0 + vdivps zmm1, zmm4, zmm2 # create 341.3333 + vdivps zmm2, zmm0, zmm1 # create 1/341.3333 + vaddps zmm0, zmm1, zmm1 # create 2*341.3333 + vmovaps zmm1, zmm0 + vmovaps zmm2, zmm0 + vmovaps zmm3, zmm0 + vmovaps zmm4, zmm0 + vmovaps zmm5, zmm0 +loop: + inc i + INSTR zmm10, zmm0 + INSTR zmm11, zmm1 + INSTR zmm12, zmm2 + cmp i, N + INSTR zmm13, zmm3 + INSTR zmm14, zmm4 + INSTR zmm15, zmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX-512/vrcp14ps-avx512.S b/src/AVX-512/vrcp14ps-avx512.S new file mode 100644 index 0000000..37cda6d --- /dev/null +++ b/src/AVX-512/vrcp14ps-avx512.S @@ -0,0 +1,58 @@ +#define INSTR vrcp14ps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SSE SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # expand from AVX to AVX-512 + vinsertf64x4 zmm0, zmm0, ymm0, 0x1 + + vaddps zmm1, zmm0, zmm0 # create 2.0 + vaddps zmm2, zmm0, zmm1 # create 3.0 + vaddps zmm4, zmm1, zmm1 # create 4.0 + vaddps zmm4, zmm4, zmm4 # create 8.0 + vaddps zmm4, zmm4, zmm4 # create 16.0 + vaddps zmm4, zmm4, zmm4 # create 32.0 + 
vaddps zmm4, zmm4, zmm4 # create 64.0 + vaddps zmm4, zmm4, zmm4 # create 128.0 + vaddps zmm4, zmm4, zmm4 # create 256.0 + vaddps zmm4, zmm4, zmm4 # create 512.0 + vaddps zmm4, zmm4, zmm4 # create 1024.0 + vdivps zmm1, zmm4, zmm2 # create 341.3333 + vdivps zmm2, zmm0, zmm1 # create 1/341.3333 + vaddps zmm0, zmm1, zmm1 # create 2*341.3333 +loop: + inc i + INSTR zmm1, zmm0 + INSTR zmm2, zmm1 + INSTR zmm3, zmm2 + cmp i, N + INSTR zmm4, zmm3 + INSTR zmm5, zmm4 + INSTR zmm0, zmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/rcpss-TP.S b/src/AVX/rcpss-TP.S new file mode 100644 index 0000000..f6fdc5b --- /dev/null +++ b/src/AVX/rcpss-TP.S @@ -0,0 +1,59 @@ +#define INSTR rcpss +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero + + vaddss xmm1, xmm0, xmm0 # create 2.0 + vaddss xmm2, xmm0, xmm1 # create 3.0 + vaddss xmm4, xmm1, xmm1 # create 4.0 + vaddss xmm4, xmm4, xmm4 # create 8.0 + vaddss xmm4, xmm4, xmm4 # create 16.0 + vaddss xmm4, xmm4, xmm4 # create 32.0 + vaddss xmm4, xmm4, xmm4 # create 64.0 + vaddss xmm4, xmm4, xmm4 # create 128.0 + vaddss xmm4, xmm4, xmm4 # create 256.0 + vaddss xmm4, xmm4, xmm4 # create 512.0 + vaddss xmm4, xmm4, xmm4 # create 1024.0 + vdivss xmm1, xmm4, xmm2 # create 341.3333 + vdivss xmm2, xmm0, xmm1 # create 1/341.3333 + vaddss xmm0, xmm1, xmm1 # create 2*341.3333 + movss xmm1, xmm0 + movss xmm2, xmm0 + movss xmm3, xmm0 + movss xmm4, xmm0 + movss xmm5, xmm0 +loop: + inc i + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + cmp i, N + INSTR xmm13, xmm3 + INSTR xmm14, xmm4 + 
INSTR xmm15, xmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/rcpss.S b/src/AVX/rcpss.S new file mode 100644 index 0000000..1462e7c --- /dev/null +++ b/src/AVX/rcpss.S @@ -0,0 +1,54 @@ +#define INSTR vrcpps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddps xmm1, xmm0, xmm0 # create 2.0 + vaddps xmm2, xmm0, xmm1 # create 3.0 + vaddps xmm4, xmm1, xmm1 # create 4.0 + vaddps xmm4, xmm4, xmm4 # create 8.0 + vaddps xmm4, xmm4, xmm4 # create 16.0 + vaddps xmm4, xmm4, xmm4 # create 32.0 + vaddps xmm4, xmm4, xmm4 # create 64.0 + vaddps xmm4, xmm4, xmm4 # create 128.0 + vaddps xmm4, xmm4, xmm4 # create 256.0 + vaddps xmm4, xmm4, xmm4 # create 512.0 + vaddps xmm4, xmm4, xmm4 # create 1024.0 + vdivps xmm1, xmm4, xmm2 # create 341.3333 + vdivps xmm2, xmm0, xmm1 # create 1/341.3333 + vaddps xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm1, xmm0 + INSTR xmm2, xmm1 + INSTR xmm3, xmm2 + cmp i, N + INSTR xmm4, xmm3 + INSTR xmm5, xmm4 + INSTR xmm0, xmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddpd-avx-TP.S b/src/AVX/vaddpd-avx-TP.S new file mode 100644 index 0000000..5a756dd --- /dev/null +++ b/src/AVX/vaddpd-avx-TP.S @@ -0,0 +1,44 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, 
xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0; NOTE(review): ymm2 is read in the loop below but is never written in this function + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm3, ymm0, ymm1 + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddpd-avx.S b/src/AVX/vaddpd-avx.S new file mode 100644 index 0000000..f0b7280 --- /dev/null +++ b/src/AVX/vaddpd-avx.S @@ -0,0 +1,44 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + cmp i, N + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddpd-sse-TP.S b/src/AVX/vaddpd-sse-TP.S new file mode 100644 index 0000000..b4e4ac2 --- /dev/null +++ b/src/AVX/vaddpd-sse-TP.S @@ -0,0 +1,42 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + 
vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0; NOTE(review): xmm2 is read in the loop below but is never written in this function + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddpd-sse.S b/src/AVX/vaddpd-sse.S new file mode 100644 index 0000000..27d1c68 --- /dev/null +++ b/src/AVX/vaddpd-sse.S @@ -0,0 +1,42 @@ +#define INSTR vaddpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddps-avx-TP.S b/src/AVX/vaddps-avx-TP.S new file mode 100644 index 0000000..292bdbf --- /dev/null +++ b/src/AVX/vaddps-avx-TP.S @@ -0,0 +1,44 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - 
(8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy SP 1.0; NOTE(review): ymm2 is read in the loop below but is never written in this function + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm3, ymm0, ymm1 + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddps-avx.S b/src/AVX/vaddps-avx.S new file mode 100644 index 0000000..7cb7312 --- /dev/null +++ b/src/AVX/vaddps-avx.S @@ -0,0 +1,44 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy SP 1.0 + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + cmp i, N + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddps-sse-TP.S b/src/AVX/vaddps-sse-TP.S new file mode 100644 index 0000000..15dcf89 --- /dev/null +++ b/src/AVX/vaddps-sse-TP.S @@ -0,0 +1,42 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 
11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0; NOTE(review): xmm2 is read in the loop below but is never written in this function + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddps-sse.S b/src/AVX/vaddps-sse.S new file mode 100644 index 0000000..1a0246e --- /dev/null +++ b/src/AVX/vaddps-sse.S @@ -0,0 +1,42 @@ +#define INSTR vaddps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddsd-TP.S b/src/AVX/vaddsd-TP.S new file mode 100644 index 0000000..493dbe1 --- /dev/null +++ b/src/AVX/vaddsd-TP.S @@ -0,0 +1,42 @@ +#define INSTR vaddsd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is 
zero + # copy DP 1.0; NOTE(review): xmm2 is read in the loop below but is never written in this function + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddsd.S b/src/AVX/vaddsd.S new file mode 100644 index 0000000..b4d9972 --- /dev/null +++ b/src/AVX/vaddsd.S @@ -0,0 +1,42 @@ +#define INSTR vaddsd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddss-TP.S b/src/AVX/vaddss-TP.S new file mode 100644 index 0000000..29ba1be --- /dev/null +++ b/src/AVX/vaddss-TP.S @@ -0,0 +1,42 @@ +#define INSTR vaddss +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0; NOTE(review): xmm2 is read in the loop below but is never written in this function + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 
+ cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vaddss.S b/src/AVX/vaddss.S new file mode 100644 index 0000000..faf9769 --- /dev/null +++ b/src/AVX/vaddss.S @@ -0,0 +1,42 @@ +#define INSTR vaddss +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivpd-avx-TP.S b/src/AVX/vdivpd-avx-TP.S new file mode 100644 index 0000000..2a84edd --- /dev/null +++ b/src/AVX/vdivpd-avx-TP.S @@ -0,0 +1,55 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 + + vaddpd ymm1, ymm0, ymm0 # create 2.0 + vaddpd ymm2, ymm0, ymm1 # create 3.0 + vaddpd ymm4, ymm1, ymm1 # create 4.0 + vaddpd ymm4, ymm4, ymm4 # create 8.0 + vaddpd ymm4, ymm4, ymm4 # create 16.0 + vaddpd ymm4, 
ymm4, ymm4 # create 32.0 + vaddpd ymm4, ymm4, ymm4 # create 64.0 + vaddpd ymm4, ymm4, ymm4 # create 128.0 + vaddpd ymm4, ymm4, ymm4 # create 256.0 + vaddpd ymm4, ymm4, ymm4 # create 512.0 + vaddpd ymm4, ymm4, ymm4 # create 1024.0 + vdivpd ymm1, ymm4, ymm2 # create 341.3333 + vdivpd ymm2, ymm0, ymm1 # create 1/341.3333 + vaddpd ymm0, ymm1, ymm1 # create 2*341.3333 +loop: + inc i + INSTR ymm3, ymm0, ymm1 + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivpd-avx.S b/src/AVX/vdivpd-avx.S new file mode 100644 index 0000000..7ffeff8 --- /dev/null +++ b/src/AVX/vdivpd-avx.S @@ -0,0 +1,55 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 + + vaddpd ymm1, ymm0, ymm0 # create 2.0 + vaddpd ymm2, ymm0, ymm1 # create 3.0 + vaddpd ymm4, ymm1, ymm1 # create 4.0 + vaddpd ymm4, ymm4, ymm4 # create 8.0 + vaddpd ymm4, ymm4, ymm4 # create 16.0 + vaddpd ymm4, ymm4, ymm4 # create 32.0 + vaddpd ymm4, ymm4, ymm4 # create 64.0 + vaddpd ymm4, ymm4, ymm4 # create 128.0 + vaddpd ymm4, ymm4, ymm4 # create 256.0 + vaddpd ymm4, ymm4, ymm4 # create 512.0 + vaddpd ymm4, ymm4, ymm4 # create 1024.0 + vdivpd ymm1, ymm4, ymm2 # create 341.3333 + vdivpd ymm2, ymm0, ymm1 # create 1/341.3333 + vaddpd ymm0, ymm1, ymm1 # create 2*341.3333 +loop: + inc i + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + cmp i, N + INSTR ymm0, 
ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivpd-sse-TP.S b/src/AVX/vdivpd-sse-TP.S new file mode 100644 index 0000000..ef53baa --- /dev/null +++ b/src/AVX/vdivpd-sse-TP.S @@ -0,0 +1,54 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddpd xmm1, xmm0, xmm0 # create 2.0 + vaddpd xmm2, xmm0, xmm1 # create 3.0 + vaddpd xmm4, xmm1, xmm1 # create 4.0 + vaddpd xmm4, xmm4, xmm4 # create 8.0 + vaddpd xmm4, xmm4, xmm4 # create 16.0 + vaddpd xmm4, xmm4, xmm4 # create 32.0 + vaddpd xmm4, xmm4, xmm4 # create 64.0 + vaddpd xmm4, xmm4, xmm4 # create 128.0 + vaddpd xmm4, xmm4, xmm4 # create 256.0 + vaddpd xmm4, xmm4, xmm4 # create 512.0 + vaddpd xmm4, xmm4, xmm4 # create 1024.0 + vdivpd xmm1, xmm4, xmm2 # create 341.3333 + vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 + vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivpd-sse.S b/src/AVX/vdivpd-sse.S new file mode 100644 index 0000000..985cd98 --- /dev/null +++ b/src/AVX/vdivpd-sse.S @@ -0,0 +1,54 @@ +#define INSTR vdivpd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push 
rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddpd xmm1, xmm0, xmm0 # create 2.0 + vaddpd xmm2, xmm0, xmm1 # create 3.0 + vaddpd xmm4, xmm1, xmm1 # create 4.0 + vaddpd xmm4, xmm4, xmm4 # create 8.0 + vaddpd xmm4, xmm4, xmm4 # create 16.0 + vaddpd xmm4, xmm4, xmm4 # create 32.0 + vaddpd xmm4, xmm4, xmm4 # create 64.0 + vaddpd xmm4, xmm4, xmm4 # create 128.0 + vaddpd xmm4, xmm4, xmm4 # create 256.0 + vaddpd xmm4, xmm4, xmm4 # create 512.0 + vaddpd xmm4, xmm4, xmm4 # create 1024.0 + vdivpd xmm1, xmm4, xmm2 # create 341.3333 + vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 + vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivps-avx-TP.S b/src/AVX/vdivps-avx-TP.S new file mode 100644 index 0000000..a1d4be5 --- /dev/null +++ b/src/AVX/vdivps-avx-TP.S @@ -0,0 +1,55 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 + + vaddps ymm1, ymm0, ymm0 # create 2.0 + vaddps ymm2, ymm0, ymm1 # create 3.0 + vaddps ymm4, ymm1, ymm1 # create 4.0 + vaddps ymm4, ymm4, ymm4 # create 8.0 + vaddps ymm4, ymm4, ymm4 # create 
16.0 + vaddps ymm4, ymm4, ymm4 # create 32.0 + vaddps ymm4, ymm4, ymm4 # create 64.0 + vaddps ymm4, ymm4, ymm4 # create 128.0 + vaddps ymm4, ymm4, ymm4 # create 256.0 + vaddps ymm4, ymm4, ymm4 # create 512.0 + vaddps ymm4, ymm4, ymm4 # create 1024.0 + vdivps ymm1, ymm4, ymm2 # create 341.3333 + vdivps ymm2, ymm0, ymm1 # create 1/341.3333 + vaddps ymm0, ymm1, ymm1 # create 2*341.3333 +loop: + inc i + INSTR ymm3, ymm0, ymm1 + INSTR ymm4, ymm1, ymm2 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivps-avx.S b/src/AVX/vdivps-avx.S new file mode 100644 index 0000000..e941cd7 --- /dev/null +++ b/src/AVX/vdivps-avx.S @@ -0,0 +1,55 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 + + vaddps ymm1, ymm0, ymm0 # create 2.0 + vaddps ymm2, ymm0, ymm1 # create 3.0 + vaddps ymm4, ymm1, ymm1 # create 4.0 + vaddps ymm4, ymm4, ymm4 # create 8.0 + vaddps ymm4, ymm4, ymm4 # create 16.0 + vaddps ymm4, ymm4, ymm4 # create 32.0 + vaddps ymm4, ymm4, ymm4 # create 64.0 + vaddps ymm4, ymm4, ymm4 # create 128.0 + vaddps ymm4, ymm4, ymm4 # create 256.0 + vaddps ymm4, ymm4, ymm4 # create 512.0 + vaddps ymm4, ymm4, ymm4 # create 1024.0 + vdivps ymm1, ymm4, ymm2 # create 341.3333 + vdivps ymm2, ymm0, ymm1 # create 1/341.3333 + vaddps ymm0, ymm1, ymm1 # create 2*341.3333 +loop: + inc i + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + cmp 
i, N + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivps-sse-TP.S b/src/AVX/vdivps-sse-TP.S new file mode 100644 index 0000000..9ac0de2 --- /dev/null +++ b/src/AVX/vdivps-sse-TP.S @@ -0,0 +1,54 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddps xmm1, xmm0, xmm0 # create 2.0 + vaddps xmm2, xmm0, xmm1 # create 3.0 + vaddps xmm4, xmm1, xmm1 # create 4.0 + vaddps xmm4, xmm4, xmm4 # create 8.0 + vaddps xmm4, xmm4, xmm4 # create 16.0 + vaddps xmm4, xmm4, xmm4 # create 32.0 + vaddps xmm4, xmm4, xmm4 # create 64.0 + vaddps xmm4, xmm4, xmm4 # create 128.0 + vaddps xmm4, xmm4, xmm4 # create 256.0 + vaddps xmm4, xmm4, xmm4 # create 512.0 + vaddps xmm4, xmm4, xmm4 # create 1024.0 + vdivps xmm1, xmm4, xmm2 # create 341.3333 + vdivps xmm2, xmm0, xmm1 # create 1/341.3333 + vaddps xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivps-sse.S b/src/AVX/vdivps-sse.S new file mode 100644 index 0000000..13b3ff1 --- /dev/null +++ b/src/AVX/vdivps-sse.S @@ -0,0 +1,54 @@ +#define INSTR vdivps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddps xmm1, xmm0, xmm0 # create 2.0 + vaddps xmm2, xmm0, xmm1 # create 3.0 + vaddps xmm4, xmm1, xmm1 # create 4.0 + vaddps xmm4, xmm4, xmm4 # create 8.0 + vaddps xmm4, xmm4, xmm4 # create 16.0 + vaddps xmm4, xmm4, xmm4 # create 32.0 + vaddps xmm4, xmm4, xmm4 # create 64.0 + vaddps xmm4, xmm4, xmm4 # create 128.0 + vaddps xmm4, xmm4, xmm4 # create 256.0 + vaddps xmm4, xmm4, xmm4 # create 512.0 + vaddps xmm4, xmm4, xmm4 # create 1024.0 + vdivps xmm1, xmm4, xmm2 # create 341.3333 + vdivps xmm2, xmm0, xmm1 # create 1/341.3333 + vaddps xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivsd-TP.S b/src/AVX/vdivsd-TP.S new file mode 100644 index 0000000..bea51af --- /dev/null +++ b/src/AVX/vdivsd-TP.S @@ -0,0 +1,54 @@ +#define INSTR vdivsd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddsd xmm1, xmm0, xmm0 # create 2.0 + vaddsd xmm2, xmm0, xmm1 # create 3.0 + vaddsd xmm4, xmm1, xmm1 # create 4.0 + vaddsd xmm4, xmm4, xmm4 # create 8.0 + vaddsd xmm4, xmm4, xmm4 # create 16.0 + vaddsd xmm4, xmm4, xmm4 
# create 32.0 + vaddsd xmm4, xmm4, xmm4 # create 64.0 + vaddsd xmm4, xmm4, xmm4 # create 128.0 + vaddsd xmm4, xmm4, xmm4 # create 256.0 + vaddsd xmm4, xmm4, xmm4 # create 512.0 + vaddsd xmm4, xmm4, xmm4 # create 1024.0 + vdivsd xmm1, xmm4, xmm2 # create 341.3333 + vdivsd xmm2, xmm0, xmm1 # create 1/341.3333 + vaddsd xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivsd.S b/src/AVX/vdivsd.S new file mode 100644 index 0000000..bb485bd --- /dev/null +++ b/src/AVX/vdivsd.S @@ -0,0 +1,54 @@ +#define INSTR vdivsd +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddsd xmm1, xmm0, xmm0 # create 2.0 + vaddsd xmm2, xmm0, xmm1 # create 3.0 + vaddsd xmm4, xmm1, xmm1 # create 4.0 + vaddsd xmm4, xmm4, xmm4 # create 8.0 + vaddsd xmm4, xmm4, xmm4 # create 16.0 + vaddsd xmm4, xmm4, xmm4 # create 32.0 + vaddsd xmm4, xmm4, xmm4 # create 64.0 + vaddsd xmm4, xmm4, xmm4 # create 128.0 + vaddsd xmm4, xmm4, xmm4 # create 256.0 + vaddsd xmm4, xmm4, xmm4 # create 512.0 + vaddsd xmm4, xmm4, xmm4 # create 1024.0 + vdivsd xmm1, xmm4, xmm2 # create 341.3333 + vdivsd xmm2, xmm0, xmm1 # create 1/341.3333 + vaddsd xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, 
xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivss-TP.S b/src/AVX/vdivss-TP.S new file mode 100644 index 0000000..44ea7b5 --- /dev/null +++ b/src/AVX/vdivss-TP.S @@ -0,0 +1,54 @@ +#define INSTR vdivss +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddss xmm1, xmm0, xmm0 # create 2.0 + vaddss xmm2, xmm0, xmm1 # create 3.0 + vaddss xmm4, xmm1, xmm1 # create 4.0 + vaddss xmm4, xmm4, xmm4 # create 8.0 + vaddss xmm4, xmm4, xmm4 # create 16.0 + vaddss xmm4, xmm4, xmm4 # create 32.0 + vaddss xmm4, xmm4, xmm4 # create 64.0 + vaddss xmm4, xmm4, xmm4 # create 128.0 + vaddss xmm4, xmm4, xmm4 # create 256.0 + vaddss xmm4, xmm4, xmm4 # create 512.0 + vaddss xmm4, xmm4, xmm4 # create 1024.0 + vdivss xmm1, xmm4, xmm2 # create 341.3333 + vdivss xmm2, xmm0, xmm1 # create 1/341.3333 + vaddss xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vdivss.S b/src/AVX/vdivss.S new file mode 100644 index 0000000..0f01db5 --- /dev/null +++ b/src/AVX/vdivss.S @@ -0,0 +1,54 @@ +#define INSTR vdivss +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw 
xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddss xmm1, xmm0, xmm0 # create 2.0 + vaddss xmm2, xmm0, xmm1 # create 3.0 + vaddss xmm4, xmm1, xmm1 # create 4.0 + vaddss xmm4, xmm4, xmm4 # create 8.0 + vaddss xmm4, xmm4, xmm4 # create 16.0 + vaddss xmm4, xmm4, xmm4 # create 32.0 + vaddss xmm4, xmm4, xmm4 # create 64.0 + vaddss xmm4, xmm4, xmm4 # create 128.0 + vaddss xmm4, xmm4, xmm4 # create 256.0 + vaddss xmm4, xmm4, xmm4 # create 512.0 + vaddss xmm4, xmm4, xmm4 # create 1024.0 + vdivss xmm1, xmm4, xmm2 # create 341.3333 + vdivss xmm2, xmm0, xmm1 # create 1/341.3333 + vaddss xmm0, xmm1, xmm1 # create 2*341.3333 +loop: + inc i + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213pd-avx-TP.S b/src/AVX/vfmadd213pd-avx-TP.S new file mode 100644 index 0000000..07e6c91 --- /dev/null +++ b/src/AVX/vfmadd213pd-avx-TP.S @@ -0,0 +1,51 @@ +#define INSTR vfmadd213pd +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0; NOTE(review): ymm2 is read in the loop below but is never written in this function + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm3, ymm0, ymm1 + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + INSTR ymm9, ymm2, 
ymm1 + cmp i, N + INSTR ymm10, ymm2, ymm1 + INSTR ymm11, ymm2, ymm1 + INSTR ymm12, ymm2, ymm1 + INSTR ymm13, ymm2, ymm1 + INSTR ymm14, ymm2, ymm1 + INSTR ymm15, ymm2, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213pd-avx.S b/src/AVX/vfmadd213pd-avx.S new file mode 100644 index 0000000..ff466b8 --- /dev/null +++ b/src/AVX/vfmadd213pd-avx.S @@ -0,0 +1,44 @@ +#define INSTR vfmadd213pd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + cmp i, N + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213pd-sse-TP.S b/src/AVX/vfmadd213pd-sse-TP.S new file mode 100644 index 0000000..d4933c3 --- /dev/null +++ b/src/AVX/vfmadd213pd-sse-TP.S @@ -0,0 +1,49 @@ +#define INSTR vfmadd213pd +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0; NOTE(review): xmm2 is read in the loop below but is never written in this function + vmovaps xmm1, xmm0 +loop: 
+ inc i + INSTR xmm3, xmm0, xmm1 + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + INSTR xmm9, xmm2, xmm1 + cmp i, N + INSTR xmm10, xmm2, xmm1 + INSTR xmm11, xmm2, xmm1 + INSTR xmm12, xmm2, xmm1 + INSTR xmm13, xmm2, xmm1 + INSTR xmm14, xmm2, xmm1 + INSTR xmm15, xmm2, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213pd-sse.S b/src/AVX/vfmadd213pd-sse.S new file mode 100644 index 0000000..972604e --- /dev/null +++ b/src/AVX/vfmadd213pd-sse.S @@ -0,0 +1,42 @@ +#define INSTR vfmadd213pd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm1, xmm1 # xmm0 is read and written by every INSTR: one serial dependency chain + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + cmp i, N + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ps-avx-TP.S b/src/AVX/vfmadd213ps-avx-TP.S new file mode 100644 index 0000000..bb07e52 --- /dev/null +++ b/src/AVX/vfmadd213ps-avx-TP.S @@ -0,0 +1,51 @@ +#define INSTR vfmadd213ps +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0,
2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + vmovaps ymm2, ymm0 # copy SP 1.0 into ymm2 as well: the loop reads ymm2 and nothing else initializes it + vmovaps ymm1, ymm0 # copy SP 1.0 +loop: + inc i + INSTR ymm3, ymm0, ymm1 # ymm3..ymm15 are thirteen separate accumulators + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + INSTR ymm9, ymm2, ymm1 + cmp i, N + INSTR ymm10, ymm2, ymm1 + INSTR ymm11, ymm2, ymm1 + INSTR ymm12, ymm2, ymm1 + INSTR ymm13, ymm2, ymm1 + INSTR ymm14, ymm2, ymm1 + INSTR ymm15, ymm2, ymm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ps-avx.S b/src/AVX/vfmadd213ps-avx.S new file mode 100644 index 0000000..5f7ddeb --- /dev/null +++ b/src/AVX/vfmadd213ps-avx.S @@ -0,0 +1,44 @@ +#define INSTR vfmadd213ps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy SP 1.0 + vmovaps ymm1, ymm0 +loop: + inc i + INSTR ymm0, ymm1, ymm1 # ymm0 is read and written by every INSTR: one serial dependency chain + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + cmp i, N + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + INSTR ymm0, ymm1, ymm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ps-sse-TP.S b/src/AVX/vfmadd213ps-sse-TP.S new file mode 100644 index 0000000..02badfe --- /dev/null +++ b/src/AVX/vfmadd213ps-sse-TP.S @@ -0,0 +1,49 @@ +#define INSTR vfmadd213ps +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency,
@function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vmovaps xmm2, xmm0 # copy SP 1.0 into xmm2 as well: the loop reads xmm2 and nothing else initializes it + vmovaps xmm1, xmm0 # copy SP 1.0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # xmm3..xmm15 are thirteen separate accumulators + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + INSTR xmm9, xmm2, xmm1 + cmp i, N + INSTR xmm10, xmm2, xmm1 + INSTR xmm11, xmm2, xmm1 + INSTR xmm12, xmm2, xmm1 + INSTR xmm13, xmm2, xmm1 + INSTR xmm14, xmm2, xmm1 + INSTR xmm15, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ps-sse.S b/src/AVX/vfmadd213ps-sse.S new file mode 100644 index 0000000..5716c50 --- /dev/null +++ b/src/AVX/vfmadd213ps-sse.S @@ -0,0 +1,42 @@ +#define INSTR vfmadd213ps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm1, xmm1 # xmm0 is read and written by every INSTR: one serial dependency chain + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + cmp i, N + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213sd-TP.S b/src/AVX/vfmadd213sd-TP.S new file mode 100644 index 0000000..2b71026 --- /dev/null +++ b/src/AVX/vfmadd213sd-TP.S @@ -0,0 +1,49 @@ +#define INSTR vfmadd213sd +#define NINST 13 +#define N edi +#define i r8d + +
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vmovaps xmm2, xmm0 # copy DP 1.0 into xmm2 as well: the loop reads xmm2 and nothing else initializes it + vmovaps xmm1, xmm0 # copy DP 1.0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # xmm3..xmm15 are thirteen separate accumulators + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + INSTR xmm9, xmm2, xmm1 + cmp i, N + INSTR xmm10, xmm2, xmm1 + INSTR xmm11, xmm2, xmm1 + INSTR xmm12, xmm2, xmm1 + INSTR xmm13, xmm2, xmm1 + INSTR xmm14, xmm2, xmm1 + INSTR xmm15, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213sd.S b/src/AVX/vfmadd213sd.S new file mode 100644 index 0000000..26e416d --- /dev/null +++ b/src/AVX/vfmadd213sd.S @@ -0,0 +1,42 @@ +#define INSTR vfmadd213sd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm1, xmm1 # xmm0 is read and written by every INSTR: one serial dependency chain + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + cmp i, N + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ss-TP.S b/src/AVX/vfmadd213ss-TP.S new file mode 100644 index 0000000..d7b9567 --- /dev/null +++ b/src/AVX/vfmadd213ss-TP.S @@
-0,0 +1,49 @@ +#define INSTR vfmadd213ss +#define NINST 13 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vmovaps xmm2, xmm0 # copy SP 1.0 into xmm2 as well: the loop reads xmm2 and nothing else initializes it + vmovaps xmm1, xmm0 # copy SP 1.0 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # xmm3..xmm15 are thirteen separate accumulators + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + INSTR xmm9, xmm2, xmm1 + cmp i, N + INSTR xmm10, xmm2, xmm1 + INSTR xmm11, xmm2, xmm1 + INSTR xmm12, xmm2, xmm1 + INSTR xmm13, xmm2, xmm1 + INSTR xmm14, xmm2, xmm1 + INSTR xmm15, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vfmadd213ss.S b/src/AVX/vfmadd213ss.S new file mode 100644 index 0000000..e82f2bd --- /dev/null +++ b/src/AVX/vfmadd213ss.S @@ -0,0 +1,42 @@ +#define INSTR vfmadd213ss +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy SP 1.0 + vmovaps xmm1, xmm0 +loop: + inc i + INSTR xmm0, xmm1, xmm1 # xmm0 is read and written by every INSTR: one serial dependency chain + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + cmp i, N + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + INSTR xmm0, xmm1, xmm1 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulpd-avx-TP.S b/src/AVX/vmulpd-avx-TP.S new
file mode 100644 index 0000000..2c15165 --- /dev/null +++ b/src/AVX/vmulpd-avx-TP.S @@ -0,0 +1,46 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # create DP 2.0 + vaddpd ymm1, ymm0, ymm0 + # create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm1 # results go to ymm3..ymm8, which are never read: the six INSTRs are independent + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulpd-avx.S b/src/AVX/vmulpd-avx.S new file mode 100644 index 0000000..44c37bc --- /dev/null +++ b/src/AVX/vmulpd-avx.S @@ -0,0 +1,46 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # create DP 2.0 + vaddpd ymm1, ymm0, ymm0 + # create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm0, ymm1 # ymm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + cmp i, N + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + jl loop +done: +
mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulpd-sse-TP.S b/src/AVX/vmulpd-sse-TP.S new file mode 100644 index 0000000..865def3 --- /dev/null +++ b/src/AVX/vmulpd-sse-TP.S @@ -0,0 +1,44 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create DP 2.0 + vaddpd xmm1, xmm0, xmm0 + # create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # results go to xmm3..xmm8, which are never read: the six INSTRs are independent + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulpd-sse.S b/src/AVX/vmulpd-sse.S new file mode 100644 index 0000000..44ed2c5 --- /dev/null +++ b/src/AVX/vmulpd-sse.S @@ -0,0 +1,44 @@ +#define INSTR vmulpd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create DP 2.0 + vaddpd xmm1, xmm0, xmm0 + # create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm0, xmm1 # xmm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov
rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulps-avx-TP.S b/src/AVX/vmulps-avx-TP.S new file mode 100644 index 0000000..01507f4 --- /dev/null +++ b/src/AVX/vmulps-avx-TP.S @@ -0,0 +1,46 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # create SP 2.0 + vaddps ymm1, ymm0, ymm0 + # create SP 0.5 + vdivps ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm1 # results go to ymm3..ymm8, which are never read: the six INSTRs are independent + INSTR ymm4, ymm1, ymm0 + INSTR ymm5, ymm0, ymm2 + cmp i, N + INSTR ymm6, ymm2, ymm0 + INSTR ymm7, ymm1, ymm2 + INSTR ymm8, ymm2, ymm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulps-avx.S b/src/AVX/vmulps-avx.S new file mode 100644 index 0000000..f96bee3 --- /dev/null +++ b/src/AVX/vmulps-avx.S @@ -0,0 +1,46 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # create SP 2.0 + vaddps ymm1, ymm0, ymm0 + # create SP 0.5 + vdivps ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm0, ymm1 # ymm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR ymm0, ymm0, ymm2 + INSTR ymm0,
ymm0, ymm1 + cmp i, N + INSTR ymm0, ymm0, ymm2 + INSTR ymm0, ymm0, ymm1 + INSTR ymm0, ymm0, ymm2 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulps-sse-TP.S b/src/AVX/vmulps-sse-TP.S new file mode 100644 index 0000000..3f94ff4 --- /dev/null +++ b/src/AVX/vmulps-sse-TP.S @@ -0,0 +1,44 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create SP 2.0 + vaddps xmm1, xmm0, xmm0 + # create SP 0.5 + vdivps xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # results go to xmm3..xmm8, which are never read: the six INSTRs are independent + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulps-sse.S b/src/AVX/vmulps-sse.S new file mode 100644 index 0000000..cdf1979 --- /dev/null +++ b/src/AVX/vmulps-sse.S @@ -0,0 +1,44 @@ +#define INSTR vmulps +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create SP 2.0 + vaddps xmm1, xmm0, xmm0 + # create SP 0.5 + vdivps xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm0, xmm1 # xmm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0,
xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulsd-TP.S b/src/AVX/vmulsd-TP.S new file mode 100644 index 0000000..c4a9f46 --- /dev/null +++ b/src/AVX/vmulsd-TP.S @@ -0,0 +1,44 @@ +#define INSTR vmulsd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create DP 2.0 + vaddpd xmm1, xmm0, xmm0 + # create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # results go to xmm3..xmm8, which are never read: the six INSTRs are independent + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulsd.S b/src/AVX/vmulsd.S new file mode 100644 index 0000000..67a5d77 --- /dev/null +++ b/src/AVX/vmulsd.S @@ -0,0 +1,44 @@ +#define INSTR vmulsd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create DP 2.0 + vaddpd xmm1, xmm0, xmm0 + # create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm0, xmm1 # xmm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0,
xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulss-TP.S b/src/AVX/vmulss-TP.S new file mode 100644 index 0000000..713f03b --- /dev/null +++ b/src/AVX/vmulss-TP.S @@ -0,0 +1,44 @@ +#define INSTR vmulss +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create SP 2.0 + vaddps xmm1, xmm0, xmm0 + # create SP 0.5 + vdivps xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm1 # results go to xmm3..xmm8, which are never read: the six INSTRs are independent + INSTR xmm4, xmm1, xmm0 + INSTR xmm5, xmm0, xmm2 + cmp i, N + INSTR xmm6, xmm2, xmm0 + INSTR xmm7, xmm1, xmm2 + INSTR xmm8, xmm2, xmm1 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vmulss.S b/src/AVX/vmulss.S new file mode 100644 index 0000000..4a99d79 --- /dev/null +++ b/src/AVX/vmulss.S @@ -0,0 +1,44 @@ +#define INSTR vmulss +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # create SP 2.0 + vaddps xmm1, xmm0, xmm0 + # create SP 0.5 + vdivps xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm0, xmm1 # xmm0 feeds the next INSTR; multiplying by 2.0 and 0.5 in turn keeps it bounded + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0, xmm1 + cmp i, N + INSTR xmm0, xmm0, xmm2 + INSTR xmm0, xmm0,
xmm1 + INSTR xmm0, xmm0, xmm2 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vrcpps-avx-TP.S b/src/AVX/vrcpps-avx-TP.S new file mode 100644 index 0000000..5917124 --- /dev/null +++ b/src/AVX/vrcpps-avx-TP.S @@ -0,0 +1,60 @@ +#define INSTR vrcpps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 # expand from SSE to AVX + + vaddps ymm1, ymm0, ymm0 # create 2.0 + vaddps ymm2, ymm0, ymm1 # create 3.0 + vaddps ymm4, ymm1, ymm1 # create 4.0 + vaddps ymm4, ymm4, ymm4 # create 8.0 + vaddps ymm4, ymm4, ymm4 # create 16.0 + vaddps ymm4, ymm4, ymm4 # create 32.0 + vaddps ymm4, ymm4, ymm4 # create 64.0 + vaddps ymm4, ymm4, ymm4 # create 128.0 + vaddps ymm4, ymm4, ymm4 # create 256.0 + vaddps ymm4, ymm4, ymm4 # create 512.0 + vaddps ymm4, ymm4, ymm4 # create 1024.0 + vdivps ymm1, ymm4, ymm2 # create 341.3333 + vdivps ymm2, ymm0, ymm1 # create 1/341.3333 + vaddps ymm0, ymm1, ymm1 # create 2*341.3333 + vmovaps ymm1, ymm0 # ymm1..ymm5 all receive the value in ymm0, replacing the intermediates above + vmovaps ymm2, ymm0 + vmovaps ymm3, ymm0 + vmovaps ymm4, ymm0 + vmovaps ymm5, ymm0 +loop: + inc i + INSTR ymm10, ymm0 # sources ymm0..ymm5 are never written in the loop: the six INSTRs are independent + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + cmp i, N + INSTR ymm13, ymm3 + INSTR ymm14, ymm4 + INSTR ymm15, ymm5 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vrcpps-avx.S b/src/AVX/vrcpps-avx.S new file mode 100644 index 0000000..b7cfed1 --- /dev/null +++ b/src/AVX/vrcpps-avx.S @@ -0,0 +1,55 @@ +#define INSTR vrcpps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long
NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + vinsertf128 ymm0, ymm0, xmm0, 0x1 # expand from SSE to AVX + + vaddps ymm1, ymm0, ymm0 # create 2.0 + vaddps ymm2, ymm0, ymm1 # create 3.0 + vaddps ymm4, ymm1, ymm1 # create 4.0 + vaddps ymm4, ymm4, ymm4 # create 8.0 + vaddps ymm4, ymm4, ymm4 # create 16.0 + vaddps ymm4, ymm4, ymm4 # create 32.0 + vaddps ymm4, ymm4, ymm4 # create 64.0 + vaddps ymm4, ymm4, ymm4 # create 128.0 + vaddps ymm4, ymm4, ymm4 # create 256.0 + vaddps ymm4, ymm4, ymm4 # create 512.0 + vaddps ymm4, ymm4, ymm4 # create 1024.0 + vdivps ymm1, ymm4, ymm2 # create 341.3333 + vdivps ymm2, ymm0, ymm1 # create 1/341.3333 + vaddps ymm0, ymm1, ymm1 # create 2*341.3333 +loop: + inc i + INSTR ymm1, ymm0 # chain ymm0 -> ymm1 -> ymm2 -> ymm3 -> ymm4 -> ymm5 -> ymm0: each INSTR consumes the previous result + INSTR ymm2, ymm1 + INSTR ymm3, ymm2 + cmp i, N + INSTR ymm4, ymm3 + INSTR ymm5, ymm4 + INSTR ymm0, ymm5 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vrcpps-sse-TP.S b/src/AVX/vrcpps-sse-TP.S new file mode 100644 index 0000000..b994620 --- /dev/null +++ b/src/AVX/vrcpps-sse-TP.S @@ -0,0 +1,59 @@ +#define INSTR vrcpps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddps xmm1, xmm0, xmm0 # create 2.0 + vaddps xmm2, xmm0, xmm1 # create 3.0 + vaddps xmm4, xmm1, xmm1 # create 4.0 + vaddps xmm4, xmm4, xmm4 #
create 8.0 + vaddps xmm4, xmm4, xmm4 # create 16.0 + vaddps xmm4, xmm4, xmm4 # create 32.0 + vaddps xmm4, xmm4, xmm4 # create 64.0 + vaddps xmm4, xmm4, xmm4 # create 128.0 + vaddps xmm4, xmm4, xmm4 # create 256.0 + vaddps xmm4, xmm4, xmm4 # create 512.0 + vaddps xmm4, xmm4, xmm4 # create 1024.0 + vdivps xmm1, xmm4, xmm2 # create 341.3333 + vdivps xmm2, xmm0, xmm1 # create 1/341.3333 + vaddps xmm0, xmm1, xmm1 # create 2*341.3333 + vmovaps xmm1, xmm0 # xmm1..xmm5 all receive the value in xmm0, replacing the intermediates above + vmovaps xmm2, xmm0 + vmovaps xmm3, xmm0 + vmovaps xmm4, xmm0 + vmovaps xmm5, xmm0 +loop: + inc i + INSTR xmm10, xmm0 # sources xmm0..xmm5 are never written in the loop: the six INSTRs are independent + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + cmp i, N + INSTR xmm13, xmm3 + INSTR xmm14, xmm4 + INSTR xmm15, xmm5 + jl loop # branches on the flags of the cmp above +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/AVX/vrcpps-sse.S b/src/AVX/vrcpps-sse.S new file mode 100644 index 0000000..1462e7c --- /dev/null +++ b/src/AVX/vrcpps-sse.S @@ -0,0 +1,54 @@ +#define INSTR vrcpps +#define NINST 6 +#define N edi +#define i r8d + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done # skip the loop for a non-positive iteration count + # create SP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) + vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + + vaddps xmm1, xmm0, xmm0 # create 2.0 + vaddps xmm2, xmm0, xmm1 # create 3.0 + vaddps xmm4, xmm1, xmm1 # create 4.0 + vaddps xmm4, xmm4, xmm4 # create 8.0 + vaddps xmm4, xmm4, xmm4 # create 16.0 + vaddps xmm4, xmm4, xmm4 # create 32.0 + vaddps xmm4, xmm4, xmm4 # create 64.0 + vaddps xmm4, xmm4, xmm4 # create 128.0 + vaddps xmm4, xmm4, xmm4 # create 256.0 + vaddps xmm4, xmm4, xmm4 # create 512.0 + vaddps xmm4, xmm4, xmm4 # create 1024.0 + vdivps xmm1, xmm4, xmm2 # create 341.3333 + vdivps xmm2, xmm0, xmm1 # create 1/341.3333 + vaddps xmm0, xmm1, xmm1 # create 2*341.3333 +loop: +
inc i + INSTR xmm1, xmm0 + INSTR xmm2, xmm1 + INSTR xmm3, xmm2 + cmp i, N + INSTR xmm4, xmm3 + INSTR xmm5, xmm4 + INSTR xmm0, xmm5 + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency diff --git a/src/vsx/xvadddp.S b/src/vsx/xvadddp.S new file mode 100644 index 0000000..7d90557 --- /dev/null +++ b/src/vsx/xvadddp.S @@ -0,0 +1,47 @@ +#define INSTR xvadddp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +zero: +.double 0.0, 0.0 +one: +.double 1.0, 1.0 +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + # load DP FP zero + li 10, 0 + + addis 9,2,zero@toc@ha + addi 9,9,zero@toc@l + lxvd2x 0, 0, 9 + + addis 9,2,one@toc@ha + addi 9,9,one@toc@l + lxvd2x 1, 0, 9 +loop: + INSTR 0, 0, 1 # VSR0 is source and target of every INSTR: one serial dependency chain + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvaddsp.S b/src/vsx/xvaddsp.S new file mode 100644 index 0000000..81d3f4d --- /dev/null +++ b/src/vsx/xvaddsp.S @@ -0,0 +1,47 @@ +#define INSTR xvaddsp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +zero: +.single 0.0, 0.0, 0.0, 0.0 # four SP lanes: lxvd2x below loads a full 16-byte vector +one: +.single 1.0, 1.0, 1.0, 1.0 # four SP lanes, so the load does not run past the table +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + # load SP FP zero + li 10, 0 + + addis 9,2,zero@toc@ha + addi 9,9,zero@toc@l + lxvd2x 0, 0, 9 + + addis 9,2,one@toc@ha + addi 9,9,one@toc@l + lxvd2x 1, 0, 9 +loop: + INSTR 0, 0, 1 # VSR0 is source and target of every INSTR: one serial dependency chain + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvdivdp.S b/src/vsx/xvdivdp.S
new file mode 100644 index 0000000..d66220a --- /dev/null +++ b/src/vsx/xvdivdp.S @@ -0,0 +1,49 @@ +#define INSTR xvdivdp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +half: +.double 0.5, 0.5 +one: +.double 1.0, 1.0 +two: +.double 2.0, 2.0 +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + li 10, 0 # offset zero + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 0, 0, 9 + addis 9,2,half@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,half@toc@l # add the low 16 bits of the TOC offset + lxvd2x 1, 0, 9 + addis 9,2,two@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,two@toc@l # add the low 16 bits of the TOC offset + lxvd2x 2, 0, 9 +loop: + INSTR 0, 0, 1 # VSR0 feeds the next INSTR; dividing by 0.5 and 2.0 in turn keeps it bounded + INSTR 0, 0, 2 + INSTR 0, 0, 1 + INSTR 0, 0, 2 + INSTR 0, 0, 1 + INSTR 0, 0, 2 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvdivsp.S b/src/vsx/xvdivsp.S new file mode 100644 index 0000000..dcce7c0 --- /dev/null +++ b/src/vsx/xvdivsp.S @@ -0,0 +1,49 @@ +#define INSTR xvdivsp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +half: +.single 0.5, 0.5, 0.5, 0.5 # four SP lanes: lxvd2x below loads a full 16-byte vector +one: +.single 1.0, 1.0, 1.0, 1.0 # four SP lanes, so no lane is taken from the next table +two: +.single 2.0, 2.0, 2.0, 2.0 # four SP lanes, so the load does not run past the table +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + li 10, 0 # offset zero + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 0, 0, 9 + addis 9,2,half@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,half@toc@l # add the low 16 bits of the TOC offset + lxvd2x 1, 0, 9 + addis 9,2,two@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,two@toc@l # add the low 16 bits of the TOC offset + lxvd2x 2, 0, 9 +loop: +
INSTR 0, 0, 1 + INSTR 0, 0, 2 + INSTR 0, 0, 1 + INSTR 0, 0, 2 + INSTR 0, 0, 1 + INSTR 0, 0, 2 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvmaddadp.S b/src/vsx/xvmaddadp.S new file mode 100644 index 0000000..f558280 --- /dev/null +++ b/src/vsx/xvmaddadp.S @@ -0,0 +1,53 @@ +#define INSTR xvmaddadp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +zero: +.double 0.0, 0.0 +two: +.double 2.0, 2.0 +three: +.double 3.0, 3.0 +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + # load DP FP zero + li 10, 0 + + addis 9,2,zero@toc@ha + addi 9,9,zero@toc@l + lxvd2x 0, 0, 9 + + addis 9,2,two@toc@ha + addi 9,9,two@toc@l + lxvd2x 1, 0, 9 + + addis 9,2,three@toc@ha + addi 9,9,three@toc@l + lxvd2x 2, 0, 9 +loop: + INSTR 0, 1, 2 # VSR0 is the accumulator of every INSTR: one serial dependency chain + INSTR 0, 1, 2 + INSTR 0, 1, 2 + INSTR 0, 1, 2 + INSTR 0, 1, 2 + INSTR 0, 1, 2 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvmuldp.S b/src/vsx/xvmuldp.S new file mode 100644 index 0000000..49c2b23 --- /dev/null +++ b/src/vsx/xvmuldp.S @@ -0,0 +1,44 @@ +#define INSTR xvmuldp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +zero: +.double 0.0, 0.0 +one: +.double 1.0, 1.0 +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + li 10, 0 # offset zero + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 0, 0, 9 + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 1, 0, 9 +loop: + INSTR 0, 0, 1 # VSR0 is source and target of every INSTR: one serial dependency chain + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 +
INSTR 0, 0, 1 + INSTR 0, 0, 1 + bdnz loop + xvmovdp 1, 0 + blr +.size latency, .-latency diff --git a/src/vsx/xvmulsp.S b/src/vsx/xvmulsp.S new file mode 100644 index 0000000..59c5d61 --- /dev/null +++ b/src/vsx/xvmulsp.S @@ -0,0 +1,44 @@ +#define INSTR xvmulsp +#define NINST 6 +#define N 3 + +.globl ninst +.data +ninst: +.long NINST +.align 16 +zero: +.single 0.0, 0.0, 0.0, 0.0 # four SP lanes, keeps the next table 16 bytes further on +one: +.single 1.0, 1.0, 1.0, 1.0 # four SP lanes: lxvd2x below loads a full 16-byte vector +.text +.abiversion 2 +.section ".toc","aw" +.section ".text" +.align 2 +.globl latency +.type latency, @function +latency : +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l +.localentry latency, .-latency + + mtctr N # move to count register + li 10, 0 # offset zero + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 0, 0, 9 + addis 9,2,one@toc@ha # r9 = TOC pointer (r2) + high-adjusted 16 bits of the TOC offset + addi 9,9,one@toc@l # add the low 16 bits of the TOC offset + lxvd2x 1, 0, 9 +loop: + INSTR 0, 0, 1 # VSR0 is source and target of every INSTR: one serial dependency chain + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + INSTR 0, 0, 1 + bdnz loop # decrement the count register and loop while it is non-zero + xvmovdp 1, 0 + blr +.size latency, .-latency