From cadedeba7b09c7f1b3bf0fd3debf6ea4255fad8a Mon Sep 17 00:00:00 2001 From: JanLJL Date: Mon, 3 Feb 2020 13:19:18 +0100 Subject: [PATCH] added example kernels --- examples/README.md | 43 + examples/add/add.s.csx.gcc.s | 36 + examples/add/add.s.csx.icc.s | 19 + examples/add/add.s.tx2.clang.s | 91 ++ examples/add/add.s.tx2.gcc.s | 45 + examples/add/add.s.zen.gcc.s | 30 + examples/copy/copy.s.csx.gcc.s | 28 + examples/copy/copy.s.csx.icc.s | 15 + examples/copy/copy.s.tx2.clang.s | 42 + examples/copy/copy.s.tx2.gcc.s | 29 + examples/copy/copy.s.zen.gcc.s | 22 + examples/daxpy/daxpy.s.csx.gcc.s | 36 + examples/daxpy/daxpy.s.csx.icc.s | 16 + examples/daxpy/daxpy.s.tx2.clang.s | 90 ++ examples/daxpy/daxpy.s.tx2.gcc.s | 41 + examples/daxpy/daxpy.s.zen.gcc.s | 30 + examples/gs/gs.s.csx.gcc.s | 1144 ++++++++++++++++ examples/gs/gs.s.csx.icc.s | 1123 ++++++++++++++++ examples/gs/gs.s.tx2.clang.s | 1194 +++++++++++++++++ examples/gs/gs.s.tx2.gcc.s | 737 ++++++++++ examples/gs/gs.s.zen.gcc.s | 1073 +++++++++++++++ examples/j2d/j2d.s.csx.gcc.AVX.s | 40 + examples/j2d/j2d.s.csx.gcc.SSE.s | 46 + examples/j2d/j2d.s.csx.icc.AVX.s | 37 + examples/j2d/j2d.s.csx.icc.AVX512.s | 69 + examples/j2d/j2d.s.csx.icc.SSE.s | 40 + examples/j2d/j2d.s.tx2.clang.s | 131 ++ examples/j2d/j2d.s.tx2.gcc.s | 43 + examples/j2d/j2d.s.zen.gcc.s | 36 + examples/striad/striad.s.csx.gcc.s | 44 + examples/striad/striad.s.csx.icc.s | 21 + examples/striad/striad.s.tx2.clang.s | 112 ++ examples/striad/striad.s.tx2.gcc.s | 53 + examples/striad/striad.s.zen.gcc.s | 38 + .../sum_reduction.s.csx.gcc.O3.s | 46 + .../sum_reduction/sum_reduction.s.csx.gcc.s | 20 + .../sum_reduction/sum_reduction.s.csx.icc.s | 17 + .../sum_reduction/sum_reduction.s.tx2.clang.s | 57 + .../sum_reduction.s.tx2.gcc.O3.s | 47 + .../sum_reduction/sum_reduction.s.tx2.gcc.s | 23 + .../sum_reduction.s.zen.gcc.O3.s | 38 + .../sum_reduction/sum_reduction.s.zen.gcc.s | 14 + examples/triad/triad.s.csx.gcc.s | 36 + examples/triad/triad.s.csx.icc.s | 16 + examples/triad/triad.s.tx2.clang.s | 118 ++ examples/triad/triad.s.tx2.gcc.s | 45 + examples/triad/triad.s.zen.gcc.s | 30 + examples/update/update.s.csx.gcc.s | 28 + examples/update/update.s.csx.icc.s | 17 + examples/update/update.s.tx2.clang.s | 15 + examples/update/update.s.tx2.gcc.s | 31 + examples/update/update.s.zen.gcc.s | 22 + 52 files changed, 7214 insertions(+) create mode 100644 examples/README.md create mode 100644 examples/add/add.s.csx.gcc.s create mode 100644 examples/add/add.s.csx.icc.s create mode 100644 examples/add/add.s.tx2.clang.s create mode 100644 examples/add/add.s.tx2.gcc.s create mode 100644 examples/add/add.s.zen.gcc.s create mode 100644 examples/copy/copy.s.csx.gcc.s create mode 100644 examples/copy/copy.s.csx.icc.s create mode 100644 examples/copy/copy.s.tx2.clang.s create mode 100644 examples/copy/copy.s.tx2.gcc.s create mode 100644 examples/copy/copy.s.zen.gcc.s create mode 100644 examples/daxpy/daxpy.s.csx.gcc.s create mode 100644 examples/daxpy/daxpy.s.csx.icc.s create mode 100644 examples/daxpy/daxpy.s.tx2.clang.s create mode 100644 examples/daxpy/daxpy.s.tx2.gcc.s create mode 100644 examples/daxpy/daxpy.s.zen.gcc.s create mode 100644 examples/gs/gs.s.csx.gcc.s create mode 100644 examples/gs/gs.s.csx.icc.s create mode 100644 examples/gs/gs.s.tx2.clang.s create mode 100644 examples/gs/gs.s.tx2.gcc.s create mode 100644 examples/gs/gs.s.zen.gcc.s create mode 100644 examples/j2d/j2d.s.csx.gcc.AVX.s create mode 100644 examples/j2d/j2d.s.csx.gcc.SSE.s create mode 100644 
examples/j2d/j2d.s.csx.icc.AVX.s
create mode 100644 examples/j2d/j2d.s.csx.icc.AVX512.s
create mode 100644 examples/j2d/j2d.s.csx.icc.SSE.s
create mode 100644 examples/j2d/j2d.s.tx2.clang.s
create mode 100644 examples/j2d/j2d.s.tx2.gcc.s
create mode 100644 examples/j2d/j2d.s.zen.gcc.s
create mode 100644 examples/striad/striad.s.csx.gcc.s
create mode 100644 examples/striad/striad.s.csx.icc.s
create mode 100644 examples/striad/striad.s.tx2.clang.s
create mode 100644 examples/striad/striad.s.tx2.gcc.s
create mode 100644 examples/striad/striad.s.zen.gcc.s
create mode 100644 examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
create mode 100644 examples/sum_reduction/sum_reduction.s.csx.gcc.s
create mode 100644 examples/sum_reduction/sum_reduction.s.csx.icc.s
create mode 100644 examples/sum_reduction/sum_reduction.s.tx2.clang.s
create mode 100644 examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
create mode 100644 examples/sum_reduction/sum_reduction.s.tx2.gcc.s
create mode 100644 examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
create mode 100644 examples/sum_reduction/sum_reduction.s.zen.gcc.s
create mode 100644 examples/triad/triad.s.csx.gcc.s
create mode 100644 examples/triad/triad.s.csx.icc.s
create mode 100644 examples/triad/triad.s.tx2.clang.s
create mode 100644 examples/triad/triad.s.tx2.gcc.s
create mode 100644 examples/triad/triad.s.zen.gcc.s
create mode 100644 examples/update/update.s.csx.gcc.s
create mode 100644 examples/update/update.s.csx.icc.s
create mode 100644 examples/update/update.s.tx2.clang.s
create mode 100644 examples/update/update.s.tx2.gcc.s
create mode 100644 examples/update/update.s.zen.gcc.s
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..8fac95f
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,43 @@
+# Examples
+We collected sample kernels for users to try out OSACA.
+The assembly files contain only the extracted and already marked kernels for code compiled on Intel Cascade Lake (CSX), AMD Zen, and Marvell ThunderX2 (TX2), but they can be run on any system that supports the respective ISA and is supported by OSACA.
+The compilers used were Intel Parallel Studio 19.0up05 and GNU 9.1.0 for the x86 systems, and ARM HPC Compiler for Linux version 19.2 and GNU 8.2.0 for the ARM-based TX2.
+
+To analyze the kernels with OSACA, run
+```
+osaca --arch ARCH filepath
+```
+While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (*.csx.*.s) use the byte markers so that they can also be analyzed with IACA.
+For this, run
+```
+iaca -arch SKX filepath
+```
+
+------------
+The kernels are briefly described in the following.
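+
+### Copy
+```c
+double * restrict a, * restrict b;
+
+for(long i=0; i < size; ++i){
+ a[i] = b[i];
+}
+```
+
+Only the copy kernel's source is included in this patch. The loop bodies shown for the remaining kernels are therefore minimal C-style sketches of each kernel's canonical form; the array and scalar names (`a`, `b`, `c`, `d`, `s`) are assumed rather than taken from the benchmark sources, and the Gauss-Seidel kernel is actually implemented in Fortran (`gs.f90`).
+
+### Vector add
+Element-wise sum of two arrays into a third:
+```c
+for(long i=0; i < size; ++i){
+ a[i] = b[i] + c[i];
+}
+```
+
+### Vector update
+Scales an array in place by a scalar:
+```c
+for(long i=0; i < size; ++i){
+ a[i] = s * a[i];
+}
+```
+
+### Sum reduction
+Accumulates all elements of an array into a scalar:
+```c
+double sum = 0.0;
+for(long i=0; i < size; ++i){
+ sum += a[i];
+}
+```
+
+### DAXPY
+Double-precision `a = a + s * b` (BLAS level 1):
+```c
+for(long i=0; i < size; ++i){
+ a[i] = a[i] + s * b[i];
+}
+```
+
+### STREAM triad
+Triad kernel as in the STREAM benchmark:
+```c
+for(long i=0; i < size; ++i){
+ a[i] = b[i] + s * c[i];
+}
+```
+
+### Schönauer triad
+Triad variant with a fourth array stream in place of the scalar:
+```c
+for(long i=0; i < size; ++i){
+ a[i] = b[i] + c[i] * d[i];
+}
+```
+
+### Gauss-Seidel method
+In-place four-point stencil sweep over a 2D grid (C-style sketch of the Fortran kernel):
+```c
+for(long j=1; j < jmax-1; ++j){
+ for(long i=1; i < imax-1; ++i){
+  phi[j][i] = 0.25 * (phi[j-1][i] + phi[j+1][i] + phi[j][i-1] + phi[j][i+1]);
+ }
+}
+```
+
+### Jacobi 2D
+Four-point stencil sweep over a 2D grid with separate source and destination arrays:
+```c
+for(long j=1; j < jmax-1; ++j){
+ for(long i=1; i < imax-1; ++i){
+  b[j][i] = 0.25 * (a[j][i-1] + a[j][i+1] + a[j-1][i] + a[j+1][i]);
+ }
+}
+```
+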
diff --git a/examples/add/add.s.csx.gcc.s b/examples/add/add.s.csx.gcc.s
new file mode 100644
index 0000000..038ed57
--- /dev/null
+++ b/examples/add/add.s.csx.gcc.s
@@ -0,0 +1,36 @@
+ movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+.L19:
+ vmovupd (%r14,%rax), %ymm3
+ vmovupd 32(%r14,%rax), %ymm4
+ vmovupd 64(%r14,%rax), %ymm6
+ vmovupd 96(%r14,%rax), %ymm9
+ vmovupd 128(%r14,%rax), %ymm11
+ vmovupd 160(%r14,%rax), %ymm13
+ vmovupd 192(%r14,%rax), %ymm15
+ vmovupd 224(%r14,%rax), %ymm0
+ vaddpd 0(%r13,%rax), %ymm3, %ymm7
+ vaddpd 32(%r13,%rax), %ymm4, %ymm5
+ vaddpd 64(%r13,%rax), %ymm6, %ymm8
+ vaddpd 96(%r13,%rax), %ymm9, %ymm10
+ vaddpd 128(%r13,%rax), %ymm11, %ymm12
+ vaddpd 160(%r13,%rax), %ymm13, %ymm14
+ vaddpd 192(%r13,%rax), %ymm15, %ymm1
+ vaddpd 224(%r13,%rax), %ymm0, %ymm2
+ vmovupd %ymm7, (%r12,%rax)
+ vmovupd %ymm5, 32(%r12,%rax)
+ vmovupd %ymm8, 64(%r12,%rax)
+ vmovupd %ymm10, 96(%r12,%rax)
+ vmovupd %ymm12, 128(%r12,%rax)
+ vmovupd %ymm14, 160(%r12,%rax)
+ vmovupd %ymm1, 192(%r12,%rax)
+ vmovupd %ymm2, 224(%r12,%rax)
+ addq $256, %rax
+ cmpq %rax, %rcx
+ jne .L19
+ movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
diff --git a/examples/add/add.s.csx.icc.s b/examples/add/add.s.csx.icc.s
new file mode 100644
index 0000000..d541c09
--- /dev/null
+++ b/examples/add/add.s.csx.icc.s
@@ -0,0 +1,19 @@
+ movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+..B1.40: # Preds ..B1.40 ..B1.39
+ # Execution count [2.22e+03]
+ vmovups (%rcx,%rax,8), %zmm1 #78.5
+ vmovups 64(%rcx,%rax,8), %zmm3 #78.5
+ vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
+ vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
+ vmovupd %zmm2, (%r14,%rax,8) #78.5
+ vmovupd %zmm4, 64(%r14,%rax,8) #78.5
+ addq $16, %rax #78.5
+ cmpq %r12, %rax #78.5
+ jb ..B1.40 # Prob 82% #78.5
+ movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+ .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
diff --git a/examples/add/add.s.tx2.clang.s b/examples/add/add.s.tx2.clang.s
new file mode 100644
index 0000000..1bd898b
--- /dev/null
+++ b/examples/add/add.s.tx2.clang.s
@@ -0,0 +1,91 @@
+ // OSACA-BEGIN
+.LBB1_29: // Parent Loop BB1_20 Depth=1
+ // Parent Loop BB1_22 Depth=2
+ // => This Inner Loop Header: Depth=3
+ ldp q0, q1, [x9, #-256]
+ ldp q4, q5, [x9, #-224]
+ ldp q2, q3, [x10, #-256]
+ ldp q6, q7, [x10, #-224]
+ fadd v2.2d, v2.2d, v0.2d
+ fadd v3.2d, v3.2d, v1.2d
+ stp q2, q3, [x11, #-256]
+ fadd v0.2d, v6.2d, v4.2d
+ fadd v1.2d, v7.2d, v5.2d
+ stp q0, q1, [x11, #-224]
+ ldp q4, q5, [x9, #-192]
+ ldp q16, q17, [x9, #-160]
+ ldp q6, q7, [x10, #-192]
+ ldp q18, q19, [x10, #-160]
+ fadd v6.2d, v6.2d, v4.2d
+ fadd v7.2d, v7.2d,
v5.2d + stp q6, q7, [x11, #-192] + fadd v4.2d, v18.2d, v16.2d + fadd v5.2d, v19.2d, v17.2d + stp q4, q5, [x11, #-160] + ldp q16, q17, [x9, #-128] + ldp q19, q20, [x9, #-96] + ldp q18, q21, [x10, #-128] + ldp q22, q23, [x10, #-96] + fadd v16.2d, v18.2d, v16.2d + fadd v18.2d, v21.2d, v17.2d + stp q16, q18, [x11, #-128] + fadd v17.2d, v22.2d, v19.2d + fadd v19.2d, v23.2d, v20.2d + stp q17, q19, [x11, #-96] + ldp q20, q21, [x9, #-64] + ldp q24, q25, [x10, #-64] + ldp q22, q23, [x9, #-32] + ldp q26, q27, [x10, #-32] + fadd v20.2d, v24.2d, v20.2d + fadd v21.2d, v25.2d, v21.2d + stp q20, q21, [x11, #-64] + ldp q24, q25, [x9] + ldp q28, q29, [x10] + fadd v22.2d, v26.2d, v22.2d + fadd v23.2d, v27.2d, v23.2d + stp q22, q23, [x11, #-32] + ldp q26, q27, [x9, #32] + ldp q30, q31, [x10, #32] + fadd v24.2d, v28.2d, v24.2d + fadd v25.2d, v29.2d, v25.2d + stp q24, q25, [x11] + ldp q28, q29, [x9, #64] + ldp q8, q10, [x10, #64] + fadd v26.2d, v30.2d, v26.2d + fadd v27.2d, v31.2d, v27.2d + stp q26, q27, [x11, #32] + ldp q30, q31, [x9, #96] + ldp q11, q12, [x10, #96] + fadd v28.2d, v8.2d, v28.2d + fadd v29.2d, v10.2d, v29.2d + stp q28, q29, [x11, #64] + ldp q8, q10, [x9, #128] + ldp q13, q14, [x10, #128] + ldp q3, q0, [x9, #192] + ldp q1, q6, [x10, #192] + fadd v30.2d, v11.2d, v30.2d + fadd v31.2d, v12.2d, v31.2d + stp q30, q31, [x11, #96] + ldp q11, q12, [x9, #160] + fadd v8.2d, v13.2d, v8.2d + fadd v10.2d, v14.2d, v10.2d + stp q8, q10, [x11, #128] + ldp q13, q14, [x10, #160] + fadd v1.2d, v1.2d, v3.2d + ldp q3, q4, [x9, #224] + fadd v0.2d, v6.2d, v0.2d + stp q1, q0, [x11, #192] + ldp q5, q6, [x10, #224] + fadd v11.2d, v13.2d, v11.2d + fadd v2.2d, v14.2d, v12.2d + stp q11, q2, [x11, #160] + fadd v3.2d, v5.2d, v3.2d + fadd v4.2d, v6.2d, v4.2d + stp q3, q4, [x11, #224] + add x8, x8, #64 // =64 + add x11, x11, #512 // =512 + add x10, x10, #512 // =512 + add x9, x9, #512 // =512 + adds x12, x12, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/add/add.s.tx2.gcc.s b/examples/add/add.s.tx2.gcc.s new file mode 100644 index 0000000..6c40802 --- /dev/null +++ b/examples/add/add.s.tx2.gcc.s @@ -0,0 +1,45 @@ + // OSACA-BEGIN +.L17: + add x0, x10, 16 + ldr q29, [x21, x10] + ldr q30, [x20, x10] + add x7, x10, 32 + ldr q31, [x21, x0] + ldr q2, [x20, x0] + add x6, x10, 48 + add x5, x10, 64 + ldr q5, [x21, x7] + ldr q1, [x20, x7] + add x4, x10, 80 + add x11, x10, 96 + ldr q4, [x21, x6] + ldr q0, [x20, x6] + add x2, x10, 112 + fadd v7.2d, v29.2d, v30.2d + ldr q3, [x21, x5] + ldr q9, [x20, x5] + fadd v6.2d, v31.2d, v2.2d + ldr q19, [x21, x4] + ldr q18, [x20, x4] + fadd v20.2d, v5.2d, v1.2d + ldr q21, [x21, x11] + ldr q17, [x20, x11] + fadd v22.2d, v4.2d, v0.2d + ldr q23, [x21, x2] + ldr q16, [x20, x2] + fadd v24.2d, v3.2d, v9.2d + fadd v25.2d, v19.2d, v18.2d + fadd v26.2d, v21.2d, v17.2d + str q7, [x19, x10] + add x10, x10, 128 + fadd v27.2d, v23.2d, v16.2d + str q6, [x19, x0] + str q20, [x19, x7] + str q22, [x19, x6] + str q24, [x19, x5] + str q25, [x19, x4] + str q26, [x19, x11] + str q27, [x19, x2] + cmp x24, x10 + bne .L17 + // OSACA-END diff --git a/examples/add/add.s.zen.gcc.s b/examples/add/add.s.zen.gcc.s new file mode 100644 index 0000000..f98e2d3 --- /dev/null +++ b/examples/add/add.s.zen.gcc.s @@ -0,0 +1,30 @@ + # OSACA-BEGIN +.L19: + vmovups 0(%r13,%rax), %xmm0 + vmovups 16(%r13,%rax), %xmm3 + vmovups 32(%r13,%rax), %xmm4 + vmovups 48(%r13,%rax), %xmm6 + vmovups 64(%r13,%rax), %xmm9 + vmovups 80(%r13,%rax), %xmm11 + vmovups 96(%r13,%rax), %xmm13 + vmovups 112(%r13,%rax), %xmm15 + vaddpd (%r12,%rax), 
%xmm0, %xmm7 + vaddpd 16(%r12,%rax), %xmm3, %xmm2 + vaddpd 32(%r12,%rax), %xmm4, %xmm5 + vaddpd 48(%r12,%rax), %xmm6, %xmm8 + vaddpd 64(%r12,%rax), %xmm9, %xmm10 + vaddpd 80(%r12,%rax), %xmm11, %xmm12 + vaddpd 96(%r12,%rax), %xmm13, %xmm14 + vaddpd 112(%r12,%rax), %xmm15, %xmm1 + vmovups %xmm7, 0(%rbp,%rax) + vmovups %xmm2, 16(%rbp,%rax) + vmovups %xmm5, 32(%rbp,%rax) + vmovups %xmm8, 48(%rbp,%rax) + vmovups %xmm10, 64(%rbp,%rax) + vmovups %xmm12, 80(%rbp,%rax) + vmovups %xmm14, 96(%rbp,%rax) + vmovups %xmm1, 112(%rbp,%rax) + subq $-128, %rax + cmpq %rbx, %rax + jne .L19 + # OSACA-END diff --git a/examples/copy/copy.s.csx.gcc.s b/examples/copy/copy.s.csx.gcc.s new file mode 100644 index 0000000..5367f59 --- /dev/null +++ b/examples/copy/copy.s.csx.gcc.s @@ -0,0 +1,28 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vmovupd (%r12,%rcx), %ymm10 + vmovupd 32(%r12,%rcx), %ymm11 + vmovupd 64(%r12,%rcx), %ymm12 + vmovupd 96(%r12,%rcx), %ymm13 + vmovupd 128(%r12,%rcx), %ymm14 + vmovupd 160(%r12,%rcx), %ymm15 + vmovupd 192(%r12,%rcx), %ymm0 + vmovupd 224(%r12,%rcx), %ymm1 + vmovupd %ymm10, 0(%r13,%rcx) + vmovupd %ymm11, 32(%r13,%rcx) + vmovupd %ymm12, 64(%r13,%rcx) + vmovupd %ymm13, 96(%r13,%rcx) + vmovupd %ymm14, 128(%r13,%rcx) + vmovupd %ymm15, 160(%r13,%rcx) + vmovupd %ymm0, 192(%r13,%rcx) + vmovupd %ymm1, 224(%r13,%rcx) + addq $256, %rcx + cmpq %rcx, %r10 + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/copy/copy.s.csx.icc.s b/examples/copy/copy.s.csx.icc.s new file mode 100644 index 0000000..c2836a2 --- /dev/null +++ b/examples/copy/copy.s.csx.icc.s @@ -0,0 +1,15 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.39: # Preds ..B1.39 ..B1.38 + # Execution count [2.22e+03] + vmovups (%r14,%rax,8), %zmm1 #79.5 + vmovupd %zmm1, (%r13,%rax,8) #79.5 + addq $8, %rax #79.5 + cmpq %r12, %rax #79.5 + jb ..B1.39 # Prob 82% #79.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/copy/copy.s.tx2.clang.s b/examples/copy/copy.s.tx2.clang.s new file mode 100644 index 0000000..934e8c4 --- /dev/null +++ b/examples/copy/copy.s.tx2.clang.s @@ -0,0 +1,42 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q0, q1, [x9, #-256] + ldp q2, q3, [x9, #-224] + stp q0, q1, [x10, #-256] + stp q2, q3, [x10, #-224] + add x8, x8, #64 // =64 + ldp q0, q1, [x9] + ldp q2, q3, [x9, #32] + stp q0, q1, [x10] + stp q2, q3, [x10, #32] + ldp q0, q1, [x9, #-192] + ldp q2, q3, [x9, #-160] + stp q0, q1, [x10, #-192] + stp q2, q3, [x10, #-160] + ldp q0, q1, [x9, #64] + ldp q2, q3, [x9, #96] + stp q0, q1, [x10, #64] + stp q2, q3, [x10, #96] + ldp q0, q1, [x9, #-128] + ldp q2, q3, [x9, #-96] + stp q0, q1, [x10, #-128] + stp q2, q3, [x10, #-96] + ldp q0, q1, [x9, #128] + ldp 
q2, q3, [x9, #160] + stp q0, q1, [x10, #128] + stp q2, q3, [x10, #160] + ldp q0, q1, [x9, #-64] + ldp q2, q3, [x9, #-32] + stp q0, q1, [x10, #-64] + stp q2, q3, [x10, #-32] + ldp q0, q1, [x9, #192] + ldp q2, q3, [x9, #224] + add x9, x9, #512 // =512 + stp q0, q1, [x10, #192] + stp q2, q3, [x10, #224] + add x10, x10, #512 // =512 + adds x11, x11, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/copy/copy.s.tx2.gcc.s b/examples/copy/copy.s.tx2.gcc.s new file mode 100644 index 0000000..fec5470 --- /dev/null +++ b/examples/copy/copy.s.tx2.gcc.s @@ -0,0 +1,29 @@ + // OSACA-BEGIN +.L17: + add x16, x15, 16 + ldr q9, [x19, x15] + add x30, x15, 32 + add x17, x15, 48 + ldr q16, [x19, x16] + ldr q18, [x19, x30] + add x18, x15, 64 + add x1, x15, 80 + ldr q17, [x19, x17] + ldr q19, [x19, x18] + add x3, x15, 96 + add x2, x15, 112 + ldr q20, [x19, x1] + ldr q21, [x19, x3] + str q9, [x20, x15] + ldr q22, [x19, x2] + add x15, x15, 128 + str q16, [x20, x16] + str q18, [x20, x30] + str q17, [x20, x17] + str q19, [x20, x18] + str q20, [x20, x1] + str q21, [x20, x3] + str q22, [x20, x2] + cmp x23, x15 + bne .L17 + // OSACA-END diff --git a/examples/copy/copy.s.zen.gcc.s b/examples/copy/copy.s.zen.gcc.s new file mode 100644 index 0000000..e8e68b0 --- /dev/null +++ b/examples/copy/copy.s.zen.gcc.s @@ -0,0 +1,22 @@ + # OSACA-BEGIN +.L19: + vmovups 0(%rbp,%r10), %xmm9 + vmovups 16(%rbp,%r10), %xmm10 + vmovups 32(%rbp,%r10), %xmm11 + vmovups 48(%rbp,%r10), %xmm12 + vmovups 64(%rbp,%r10), %xmm13 + vmovups 80(%rbp,%r10), %xmm14 + vmovups 96(%rbp,%r10), %xmm15 + vmovups 112(%rbp,%r10), %xmm0 + vmovups %xmm9, (%r12,%r10) + vmovups %xmm10, 16(%r12,%r10) + vmovups %xmm11, 32(%r12,%r10) + vmovups %xmm12, 48(%r12,%r10) + vmovups %xmm13, 64(%r12,%r10) + vmovups %xmm14, 80(%r12,%r10) + vmovups %xmm15, 96(%r12,%r10) + vmovups %xmm0, 112(%r12,%r10) + subq $-128, %r10 + cmpq %r10, %r15 + jne .L19 + # OSACA-END diff --git a/examples/daxpy/daxpy.s.csx.gcc.s b/examples/daxpy/daxpy.s.csx.gcc.s new file mode 100644 index 0000000..5d29cf2 --- /dev/null +++ b/examples/daxpy/daxpy.s.csx.gcc.s @@ -0,0 +1,36 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vmovupd 0(%r13,%rsi), %ymm14 + vmovupd 32(%r13,%rsi), %ymm15 + vmovupd 64(%r13,%rsi), %ymm1 + vmovupd 96(%r13,%rsi), %ymm0 + vmovupd 128(%r13,%rsi), %ymm3 + vmovupd 160(%r13,%rsi), %ymm4 + vmovupd 192(%r13,%rsi), %ymm5 + vmovupd 224(%r13,%rsi), %ymm7 + vfmadd213pd (%r12,%rsi), %ymm6, %ymm14 + vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15 + vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1 + vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0 + vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3 + vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4 + vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5 + vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7 + vmovupd %ymm14, (%r12,%rsi) + vmovupd %ymm15, 32(%r12,%rsi) + vmovupd %ymm1, 64(%r12,%rsi) + vmovupd %ymm0, 96(%r12,%rsi) + vmovupd %ymm3, 128(%r12,%rsi) + vmovupd %ymm4, 160(%r12,%rsi) + vmovupd %ymm5, 192(%r12,%rsi) + vmovupd %ymm7, 224(%r12,%rsi) + addq $256, %rsi + cmpq %rsi, %r10 + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/daxpy/daxpy.s.csx.icc.s b/examples/daxpy/daxpy.s.csx.icc.s 
new file mode 100644 index 0000000..d26fc3e --- /dev/null +++ b/examples/daxpy/daxpy.s.csx.icc.s @@ -0,0 +1,16 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.39: # Preds ..B1.39 ..B1.38 + # Execution count [2.22e+03] + vmovups (%r13,%rax,8), %zmm1 #77.5 + vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5 + vmovupd %zmm1, (%r14,%rax,8) #77.5 + addq $8, %rax #77.5 + cmpq %rbx, %rax #77.5 + jb ..B1.39 # Prob 82% #77.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/daxpy/daxpy.s.tx2.clang.s b/examples/daxpy/daxpy.s.tx2.clang.s new file mode 100644 index 0000000..c615986 --- /dev/null +++ b/examples/daxpy/daxpy.s.tx2.clang.s @@ -0,0 +1,90 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q1, q2, [x9, #-256] + ldp q3, q0, [x9, #-224] + ldp q4, q5, [x10, #-256] + ldp q6, q7, [x10, #-224] + fmla v1.2d, v4.2d, v31.2d + fmla v2.2d, v5.2d, v31.2d + stp q1, q2, [x9, #-256] + fmla v3.2d, v6.2d, v31.2d + fmla v0.2d, v7.2d, v31.2d + stp q3, q0, [x9, #-224] + ldp q5, q6, [x9, #-192] + ldp q7, q4, [x9, #-160] + ldp q16, q17, [x10, #-192] + ldp q18, q19, [x10, #-160] + fmla v5.2d, v16.2d, v31.2d + fmla v6.2d, v17.2d, v31.2d + stp q5, q6, [x9, #-192] + fmla v7.2d, v18.2d, v31.2d + fmla v4.2d, v19.2d, v31.2d + stp q7, q4, [x9, #-160] + ldp q19, q18, [x9, #-128] + ldp q16, q17, [x9, #-96] + ldp q20, q21, [x10, #-128] + ldp q22, q23, [x10, #-96] + fmla v18.2d, v21.2d, v31.2d + fmla v16.2d, v22.2d, v31.2d + ldp q21, q22, [x9, #-64] + ldp q24, q25, [x10, #-64] + fmla v19.2d, v20.2d, v31.2d + stp q19, q18, [x9, #-128] + fmla v17.2d, v23.2d, v31.2d + stp q16, q17, [x9, #-96] + ldp q23, q20, [x9, #-32] + ldp q26, q27, [x10, #-32] + fmla v21.2d, v24.2d, v31.2d + fmla v22.2d, v25.2d, v31.2d + stp q21, q22, [x9, #-64] + ldp q24, q25, [x9] + ldp q28, q29, [x10] + fmla v23.2d, v26.2d, v31.2d + fmla v20.2d, v27.2d, v31.2d + stp q23, q20, [x9, #-32] + ldp q26, q27, [x9, #32] + fmla v24.2d, v28.2d, v31.2d + fmla v25.2d, v29.2d, v31.2d + stp q24, q25, [x9] + ldp q28, q29, [x10, #32] + fmla v26.2d, v28.2d, v31.2d + fmla v27.2d, v29.2d, v31.2d + stp q26, q27, [x9, #32] + ldp q24, q25, [x9, #64] + ldp q28, q29, [x10, #64] + ldp q26, q27, [x9, #96] + fmla v24.2d, v28.2d, v31.2d + fmla v25.2d, v29.2d, v31.2d + stp q24, q25, [x9, #64] + ldp q28, q29, [x10, #96] + fmla v26.2d, v28.2d, v31.2d + fmla v27.2d, v29.2d, v31.2d + stp q26, q27, [x9, #96] + ldp q24, q25, [x9, #128] + ldp q26, q27, [x10, #128] + fmla v24.2d, v26.2d, v31.2d + fmla v25.2d, v27.2d, v31.2d + stp q24, q25, [x9, #128] + ldp q26, q27, [x9, #160] + ldp q1, q2, [x10, #160] + fmla v26.2d, v1.2d, v31.2d + fmla v27.2d, v2.2d, v31.2d + stp q26, q27, [x9, #160] + ldp q0, q1, [x9, #192] + ldp q2, q3, [x10, #192] + fmla v0.2d, v2.2d, v31.2d + fmla v1.2d, v3.2d, v31.2d + stp q0, q1, [x9, #192] + ldp q2, q3, [x9, #224] + ldp q4, q5, [x10, #224] + fmla v2.2d, v4.2d, v31.2d + fmla v3.2d, v5.2d, v31.2d + stp q2, q3, [x9, #224] + add x8, x8, #64 // =64 + add x10, x10, #512 // =512 + add x9, x9, #512 // =512 + adds x11, x11, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git 
a/examples/daxpy/daxpy.s.tx2.gcc.s b/examples/daxpy/daxpy.s.tx2.gcc.s new file mode 100644 index 0000000..643651a --- /dev/null +++ b/examples/daxpy/daxpy.s.tx2.gcc.s @@ -0,0 +1,41 @@ + // OSACA-BEGIN +.L17: + mov x5, x3 + ldr q23, [x10] + ldr q24, [x5], 16 + mov x6, x10 + ldr q25, [x3, 16] + ldr q26, [x3, 48] + add x10, x10, 128 + add x3, x3, 128 + ldr q27, [x3, -64] + ldr q28, [x3, -48] + ldr q29, [x3, -32] + ldr q30, [x3, -16] + fmla v23.2d, v3.2d, v24.2d + ldr q31, [x5, 16] + str q23, [x6], 16 + ldr q0, [x10, -112] + fmla v0.2d, v3.2d, v25.2d + str q0, [x10, -112] + ldr q2, [x6, 16] + fmla v2.2d, v3.2d, v31.2d + str q2, [x6, 16] + ldr q5, [x10, -80] + ldr q4, [x10, -64] + ldr q6, [x10, -48] + ldr q1, [x10, -32] + ldr q7, [x10, -16] + fmla v5.2d, v3.2d, v26.2d + fmla v4.2d, v3.2d, v27.2d + fmla v6.2d, v3.2d, v28.2d + fmla v1.2d, v3.2d, v29.2d + fmla v7.2d, v3.2d, v30.2d + str q5, [x10, -80] + str q4, [x10, -64] + str q6, [x10, -48] + str q1, [x10, -32] + str q7, [x10, -16] + cmp x23, x10 + bne .L17 + // OSACA-END diff --git a/examples/daxpy/daxpy.s.zen.gcc.s b/examples/daxpy/daxpy.s.zen.gcc.s new file mode 100644 index 0000000..91d398a --- /dev/null +++ b/examples/daxpy/daxpy.s.zen.gcc.s @@ -0,0 +1,30 @@ + # OSACA-BEGIN +.L19: + vmovups (%r12,%rax), %xmm12 + vmovups 16(%r12,%rax), %xmm13 + vmovups 32(%r12,%rax), %xmm14 + vmovups 48(%r12,%rax), %xmm15 + vmovups 64(%r12,%rax), %xmm1 + vmovups 80(%r12,%rax), %xmm0 + vmovups 96(%r12,%rax), %xmm4 + vmovups 112(%r12,%rax), %xmm5 + vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12 + vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13 + vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14 + vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15 + vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1 + vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0 + vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4 + vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5 + vmovups %xmm12, 0(%rbp,%rax) + vmovups %xmm13, 16(%rbp,%rax) + vmovups %xmm14, 32(%rbp,%rax) + vmovups %xmm15, 48(%rbp,%rax) + vmovups %xmm1, 64(%rbp,%rax) + vmovups %xmm0, 80(%rbp,%rax) + vmovups %xmm4, 96(%rbp,%rax) + vmovups %xmm5, 112(%rbp,%rax) + subq $-128, %rax + cmpq %r15, %rax + jne .L19 + # OSACA-END diff --git a/examples/gs/gs.s.csx.gcc.s b/examples/gs/gs.s.csx.gcc.s new file mode 100644 index 0000000..e8af358 --- /dev/null +++ b/examples/gs/gs.s.csx.gcc.s @@ -0,0 +1,1144 @@ + .file "gs.f90" + .text + .section .rodata.str1.1,"aMS",@progbits,1 +.LC0: + .string "gs.f90" + .section .rodata.str1.8,"aMS",@progbits,1 + .align 8 +.LC1: + .string "Integer overflow when calculating the amount of memory to allocate" + .align 8 +.LC2: + .string "Allocation would exceed memory limit" + .section .rodata.str1.1 +.LC8: + .string "# Iterations: " +.LC9: + .string " Performance: " +.LC11: + .string " MLUPs" + .text + .p2align 4 + .type MAIN__, @function +MAIN__: +.LFB0: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movabsq $21474836608, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + movq $-1, %r14 + pushq %r13 + pushq %r12 + pushq %rbx + .cfi_offset 13, -40 + .cfi_offset 12, -48 + .cfi_offset 3, -56 + movq %r14, %rbx + subq $696, %rsp + leaq 160(%rsp), %rdi + movq %rax, 160(%rsp) + movq $.LC0, 168(%rsp) + movl $12, 176(%rsp) + call _gfortran_st_read + movl $4, %edx + leaq 112(%rsp), %rsi + leaq 160(%rsp), %rdi + call _gfortran_transfer_integer + movl $4, %edx + leaq 116(%rsp), %rsi + leaq 160(%rsp), %rdi + call _gfortran_transfer_integer + leaq 160(%rsp), %rdi + call 
_gfortran_st_read_done + movslq 112(%rsp), %r15 + movslq 116(%rsp), %rdi + testq %r15, %r15 + cmovns %r15, %rbx + movabsq $4611686018427387904, %rcx + incq %rbx + testq %rdi, %rdi + cmovns %rdi, %r14 + movabsq $2305843009213693951, %rsi + incq %r14 + imulq %rbx, %r14 + xorl %edx, %edx + movl %r15d, 88(%rsp) + cmpq %rcx, %r14 + leaq (%r14,%r14), %r13 + sete %dl + cmpq %rsi, %r13 + setg %r8b + movzbl %r8b, %r9d + movq %rdi, 56(%rsp) + movq %rdi, %r12 + addl %r9d, %edx + testq %r15, %r15 + js .L36 + testq %rdi, %rdi + js .L36 + movq %r14, %r10 + salq $4, %r10 +.L2: + testl %edx, %edx + jne .L286 + testq %r10, %r10 + movl $1, %edi + cmovne %r10, %rdi + call malloc + movq %rax, %rdx + testq %rax, %rax + je .L287 + movl 88(%rsp), %r11d + cmpl $1, %r12d + jle .L5 + cmpl $1, %r11d + jle .L6 + movl %r11d, %r9d + subl $2, %r11d + movq %r11, %rcx + addq %rbx, %r11 + leaq 16(%rax,%r11,8), %r10 + leaq 0(,%rbx,8), %rdi + leal -1(%r9), %r11d + leaq 8(%rax,%rdi), %rsi + movq %rdi, 8(%rsp) + movl %r11d, %edi + leaq 0(,%r14,8), %rax + movl %r11d, 52(%rsp) + shrl $2, %edi + andl $-4, %r11d + movq %r10, 80(%rsp) + movq %rax, (%rsp) + leal 2(%r11), %r10d + leal 3(%r11), %eax + salq $5, %rdi + movq %r13, %r8 + movq %rdi, 64(%rsp) + movl %r10d, 48(%rsp) + movq %r10, 24(%rsp) + movl %eax, 20(%rsp) + movq %rax, 40(%rsp) + movl $1, 72(%rsp) + leal 1(%r11), %r9d + subq %r14, %r8 + movq %r9, 32(%rsp) + addq %rbx, %r8 + movq %rbx, %r9 + vxorpd %xmm0, %xmm0, %xmm0 +.L14: + leaq 3(%r8), %rdi + cmpq %rdi, %r9 + leaq 3(%r9), %rax + setg %r10b + cmpq %rax, %r8 + setg %dil + orb %dil, %r10b + je .L39 + movq (%rsp), %rax + cmpl $2, %ecx + seta %r10b + leaq (%rsi,%rax), %rdi + xorl %eax, %eax + testb %r10b, %r10b + je .L39 + movq 64(%rsp), %r10 + subq $32, %r10 + shrq $5, %r10 + incq %r10 + andl $7, %r10d + je .L13 + cmpq $1, %r10 + je .L177 + cmpq $2, %r10 + je .L178 + cmpq $3, %r10 + je .L179 + cmpq $4, %r10 + je .L180 + cmpq $5, %r10 + je .L181 + cmpq $6, %r10 + je .L182 + vmovupd %ymm0, (%rsi) + movl $32, %eax + vmovupd %ymm0, (%rdi) +.L182: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax +.L181: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax +.L180: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax +.L179: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax +.L178: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax +.L177: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + addq $32, %rax + cmpq 64(%rsp), %rax + je .L156 +.L13: + vmovupd %ymm0, (%rsi,%rax) + vmovupd %ymm0, (%rdi,%rax) + vmovupd %ymm0, 32(%rax,%rsi) + vmovupd %ymm0, 32(%rdi,%rax) + vmovupd %ymm0, 64(%rax,%rsi) + vmovupd %ymm0, 64(%rdi,%rax) + vmovupd %ymm0, 96(%rax,%rsi) + vmovupd %ymm0, 96(%rdi,%rax) + vmovupd %ymm0, 128(%rax,%rsi) + vmovupd %ymm0, 128(%rdi,%rax) + vmovupd %ymm0, 160(%rax,%rsi) + vmovupd %ymm0, 160(%rdi,%rax) + vmovupd %ymm0, 192(%rax,%rsi) + vmovupd %ymm0, 192(%rdi,%rax) + vmovupd %ymm0, 224(%rax,%rsi) + vmovupd %ymm0, 224(%rdi,%rax) + addq $256, %rax + cmpq 64(%rsp), %rax + jne .L13 +.L156: + cmpl %r11d, 52(%rsp) + je .L16 + movq 32(%rsp), %rdi + leaq (%r9,%rdi), %r10 + movq $0x000000000, (%rdx,%r10,8) + leaq (%r8,%rdi), %rax + movl 48(%rsp), %r10d + movl 88(%rsp), %edi + movq $0x000000000, (%rdx,%rax,8) + cmpl %r10d, %edi + jle .L16 + movq 24(%rsp), %r10 + leaq (%r9,%r10), %rax + movq $0x000000000, (%rdx,%rax,8) + movl 20(%rsp), %eax + leaq (%r8,%r10), %r10 + movq $0x000000000, (%rdx,%r10,8) + cmpl 
%eax, %edi + jle .L16 + movq 40(%rsp), %rdi + leaq (%r9,%rdi), %r10 + leaq (%r8,%rdi), %rax + movq $0x000000000, (%rdx,%r10,8) + movq $0x000000000, (%rdx,%rax,8) +.L16: + incl 72(%rsp) + movq 8(%rsp), %rdi + addq %rbx, %r9 + addq %rdi, 80(%rsp) + movl 72(%rsp), %r10d + addq %rbx, %r8 + addq %rdi, %rsi + cmpl %r10d, %r12d + jne .L14 +.L11: + movq 56(%rsp), %r10 + movl 88(%rsp), %r8d + imulq %rbx, %r10 + movl $0, %r11d + testl %r8d, %r8d + movq %r10, %rax + cmovns %r8d, %r11d + leaq 3(%r10), %rsi + subq %r14, %rax + movq %r13, %r9 + addq %r13, %rax + subq %r14, %r9 + cmpq $6, %rsi + seta %dil + cmpl $2, %r11d + leaq 3(%rax), %r8 + movq %rsi, 80(%rsp) + seta %sil + andl %esi, %edi + cmpq $6, %r8 + movq %r9, 72(%rsp) + seta %sil + leaq 3(%r9), %r9 + andl %edi, %esi + cmpq $6, %r9 + seta %dil + andl %esi, %edi + cmpq %rax, 80(%rsp) + setl %sil + cmpq %r8, %r10 + setg 64(%rsp) + orb 64(%rsp), %sil + andl %esi, %edi + cmpq %rax, %r9 + setl %sil + movb %dil, 64(%rsp) + cmpq 72(%rsp), %r8 + setl %r8b + orl %r8d, %esi + testb %sil, 64(%rsp) + je .L19 + movq 72(%rsp), %rdi + cmpq %r9, %r10 + setg %r9b + cmpq %rdi, 80(%rsp) + setl %sil + orb %sil, %r9b + je .L19 + incl %r11d + movl %r11d, %r9d + shrl $2, %r9d + salq $5, %r9 + movq %r9, 80(%rsp) + subq $32, %r9 + shrq $5, %r9 + incq %r9 + leaq (%rdx,%rax,8), %rdi + vmovapd .LC4(%rip), %ymm1 + leaq (%rdx,%r10,8), %r8 + leaq (%rdx,%r14,8), %rsi + xorl %eax, %eax + vxorpd %xmm2, %xmm2, %xmm2 + andl $7, %r9d + je .L21 + cmpq $1, %r9 + je .L189 + cmpq $2, %r9 + je .L190 + cmpq $3, %r9 + je .L191 + cmpq $4, %r9 + je .L192 + cmpq $5, %r9 + je .L193 + cmpq $6, %r9 + je .L194 + vmovupd %ymm1, (%r8) + movl $32, %eax + vmovupd %ymm1, (%rdi) + vmovupd %ymm2, (%rdx) + vmovupd %ymm2, (%rsi) +.L194: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax +.L193: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax +.L192: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax +.L191: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax +.L190: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax +.L189: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + addq $32, %rax + cmpq 80(%rsp), %rax + je .L114 +.L21: + vmovupd %ymm1, (%r8,%rax) + vmovupd %ymm1, (%rdi,%rax) + vmovupd %ymm2, (%rdx,%rax) + vmovupd %ymm2, (%rsi,%rax) + vmovupd %ymm1, 32(%r8,%rax) + vmovupd %ymm1, 32(%rdi,%rax) + vmovupd %ymm2, 32(%rdx,%rax) + vmovupd %ymm2, 32(%rsi,%rax) + vmovupd %ymm1, 64(%r8,%rax) + vmovupd %ymm1, 64(%rdi,%rax) + vmovupd %ymm2, 64(%rdx,%rax) + vmovupd %ymm2, 64(%rsi,%rax) + vmovupd %ymm1, 96(%r8,%rax) + vmovupd %ymm1, 96(%rdi,%rax) + vmovupd %ymm2, 96(%rdx,%rax) + vmovupd %ymm2, 96(%rsi,%rax) + vmovupd %ymm1, 128(%r8,%rax) + vmovupd %ymm1, 128(%rdi,%rax) + vmovupd %ymm2, 128(%rdx,%rax) + vmovupd %ymm2, 128(%rsi,%rax) + vmovupd %ymm1, 160(%r8,%rax) + vmovupd %ymm1, 160(%rdi,%rax) + vmovupd %ymm2, 160(%rdx,%rax) + vmovupd %ymm2, 160(%rsi,%rax) + vmovupd %ymm1, 192(%r8,%rax) + vmovupd %ymm1, 192(%rdi,%rax) + vmovupd %ymm2, 192(%rdx,%rax) + vmovupd %ymm2, 192(%rsi,%rax) + vmovupd %ymm1, 224(%r8,%rax) + vmovupd %ymm1, 224(%rdi,%rax) + 
vmovupd %ymm2, 224(%rdx,%rax) + vmovupd %ymm2, 224(%rsi,%rax) + addq $256, %rax + cmpq 80(%rsp), %rax + jne .L21 +.L114: + movl %r11d, %eax + andl $-4, %eax + testb $3, %r11b + je .L282 + movslq %eax, %r8 + vmovsd .LC5(%rip), %xmm3 + leaq (%r10,%r8), %rsi + movq %r8, %rdi + vmovsd %xmm3, (%rdx,%rsi,8) + subq %r14, %rdi + addq %r14, %rsi + movl 88(%rsp), %r9d + vmovsd %xmm3, (%rdx,%rsi,8) + addq %r13, %rdi + movq $0x000000000, (%rdx,%r8,8) + leal 1(%rax), %r8d + movq $0x000000000, (%rdx,%rdi,8) + cmpl %r8d, %r9d + jl .L282 + movslq %r8d, %rsi + movq %rsi, %r8 + leaq (%r10,%rsi), %rdi + subq %r14, %r8 + vmovsd %xmm3, (%rdx,%rdi,8) + addq %r13, %r8 + addq %r14, %rdi + addl $2, %eax + vmovsd %xmm3, (%rdx,%rdi,8) + movq $0x000000000, (%rdx,%rsi,8) + movq $0x000000000, (%rdx,%r8,8) + cmpl %eax, %r9d + jl .L282 + cltq + movq %rax, %r9 + leaq (%r10,%rax), %r10 + subq %r14, %r9 + vmovsd %xmm3, (%rdx,%r10,8) + addq %r13, %r9 + addq %r14, %r10 + vmovsd %xmm3, (%rdx,%r10,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rdx,%r9,8) + vzeroupper +.L10: + testl %r12d, %r12d + js .L17 +.L18: + vxorpd %xmm5, %xmm5, %xmm5 + vcvtsi2sdl %r11d, %xmm5, %xmm6 + vcvtsi2sdl 88(%rsp), %xmm5, %xmm7 + movq %r15, %r11 + subq %r14, %r11 + leaq 0(%r13,%r11), %rdi + vdivsd %xmm7, %xmm6, %xmm8 + subq %r14, %r13 + leaq 0(,%rbx,8), %rsi + movl %r12d, %r14d + andl $7, %r14d + movl $1, %r9d + leaq (%rdx,%rsi), %rax + vmovsd %xmm8, (%rdx) + vmovsd %xmm8, (%rdx,%r13,8) + vmovsd %xmm8, (%rdx,%r15,8) + vmovsd %xmm8, (%rdx,%rdi,8) + cmpl $1, %r12d + jl .L17 + testl %r14d, %r14d + je .L26 + cmpl $1, %r14d + je .L201 + cmpl $2, %r14d + je .L202 + cmpl $3, %r14d + je .L203 + cmpl $4, %r14d + je .L204 + cmpl $5, %r14d + je .L205 + cmpl $6, %r14d + je .L206 + vmovsd %xmm8, (%rax) + movl $2, %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L206: + vmovsd %xmm8, (%rax) + incl %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L205: + vmovsd %xmm8, (%rax) + incl %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L204: + vmovsd %xmm8, (%rax) + incl %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L203: + vmovsd %xmm8, (%rax) + incl %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L202: + vmovsd %xmm8, (%rax) + incl %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax +.L201: + incl %r9d + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + cmpl %r9d, %r12d + jl .L17 +.L26: + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + 
vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + addl $8, %r9d + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rdi,8) + addq %rsi, %rax + cmpl %r9d, %r12d + jge .L26 +.L17: + movl %ecx, %ecx + leaq 0(,%rbx,8), %r13 + addq %rbx, %rcx + leaq 8(%rdx,%r13), %r15 + leaq (%rbx,%rbx), %r10 + leaq 16(%rdx,%rcx,8), %r8 + movq %r15, 64(%rsp) + movq %r10, 72(%rsp) + movq %r8, 56(%rsp) + movl $10, 80(%rsp) +.L25: + leaq 128(%rsp), %rsi + leaq 144(%rsp), %rdi + sall 80(%rsp) + call timing_ + movq .LC6(%rip), %r11 + xorl %r15d, %r15d + vmovq %r11, %xmm9 + .p2align 4,,10 + .p2align 3 +.L29: + cmpl $1, %r12d + jle .L32 + cmpl $1, 88(%rsp) + jle .L32 + movq 56(%rsp), %r8 + movq 72(%rsp), %r14 + movq 64(%rsp), %rdi + movq %rbx, %r9 + xorl %r11d, %r11d + movl $1, %r10d + .p2align 4,,10 + .p2align 3 +.L33: + movq %r8, %rdx + subq %rdi, %rdx + subq $8, %rdx + shrq $3, %rdx + movq %r11, %rsi + movq %r14, %rcx + incq %rdx + vmovsd -8(%rdi), %xmm8 + incl %r10d + movq %rdi, %rax + subq %r9, %rsi + subq %r9, %rcx + andl $7, %edx + je .L31 + cmpq $1, %rdx + je .L195 + cmpq $2, %rdx + je .L196 + cmpq $3, %rdx + je .L197 + cmpq $4, %rdx + je .L198 + cmpq $5, %rdx + je .L199 + cmpq $6, %rdx + je .L200 + vmovsd (%rdi,%rsi,8), %xmm10 + vaddsd (%rdi,%rcx,8), %xmm8, %xmm12 + vaddsd 8(%rdi), %xmm10, %xmm11 + leaq 8(%rdi), %rax + vaddsd %xmm12, %xmm11, %xmm13 + vmulsd %xmm9, %xmm13, %xmm8 + vmovsd %xmm8, (%rdi) +.L200: + vmovsd (%rax,%rsi,8), %xmm14 + vaddsd (%rax,%rcx,8), %xmm8, %xmm0 + vaddsd 8(%rax), %xmm14, %xmm15 + addq $8, %rax + vaddsd %xmm0, %xmm15, %xmm1 + vmulsd %xmm9, %xmm1, %xmm8 + vmovsd %xmm8, -8(%rax) +.L199: + vmovsd (%rax,%rsi,8), %xmm2 + vaddsd (%rax,%rcx,8), %xmm8, %xmm4 + vaddsd 8(%rax), %xmm2, %xmm3 + addq $8, %rax + vaddsd %xmm4, %xmm3, %xmm5 + vmulsd %xmm9, %xmm5, %xmm8 + vmovsd %xmm8, -8(%rax) +.L198: + vmovsd (%rax,%rsi,8), %xmm6 + vaddsd (%rax,%rcx,8), %xmm8, %xmm8 + vaddsd 8(%rax), %xmm6, %xmm7 + addq $8, %rax + vaddsd %xmm8, %xmm7, %xmm10 + vmulsd %xmm9, %xmm10, %xmm8 + vmovsd %xmm8, -8(%rax) +.L197: + vmovsd (%rax,%rsi,8), %xmm11 + vaddsd (%rax,%rcx,8), %xmm8, %xmm13 + vaddsd 8(%rax), %xmm11, %xmm12 + addq $8, %rax + vaddsd %xmm13, %xmm12, %xmm14 + vmulsd %xmm9, %xmm14, %xmm8 + vmovsd %xmm8, -8(%rax) +.L196: + vmovsd (%rax,%rsi,8), %xmm15 + vaddsd (%rax,%rcx,8), %xmm8, %xmm0 + vaddsd 8(%rax), %xmm15, %xmm1 + addq $8, %rax + vaddsd %xmm0, %xmm1, %xmm2 + vmulsd %xmm9, %xmm2, %xmm8 + vmovsd %xmm8, -8(%rax) +.L195: + vmovsd (%rax,%rsi,8), %xmm3 + vaddsd (%rax,%rcx,8), %xmm8, %xmm5 + vaddsd 8(%rax), %xmm3, %xmm4 + addq $8, %rax + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, -8(%rax) + cmpq %r8, %rax + je .L267 + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L31: + vmovsd (%rax,%rsi,8), %xmm7 + vaddsd (%rax,%rcx,8), %xmm8, %xmm11 + vaddsd 8(%rax), %xmm7, %xmm10 + leaq 8(%rax), %rdx + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, (%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + vaddsd 16(%rax), %xmm14, %xmm15 + leaq 16(%rax), %rdx + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm3 + vmovsd %xmm3, 8(%rax) + vmovsd 
(%rdx,%rsi,8), %xmm2 + vaddsd (%rdx,%rcx,8), %xmm3, %xmm5 + vaddsd 24(%rax), %xmm2, %xmm4 + leaq 24(%rax), %rdx + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, 16(%rax) + vmovsd (%rdx,%rsi,8), %xmm7 + vaddsd (%rdx,%rcx,8), %xmm8, %xmm11 + vaddsd 32(%rax), %xmm7, %xmm10 + leaq 32(%rax), %rdx + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, 24(%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + vaddsd 40(%rax), %xmm14, %xmm15 + leaq 40(%rax), %rdx + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm3 + vmovsd %xmm3, 32(%rax) + vmovsd (%rdx,%rsi,8), %xmm2 + vaddsd (%rdx,%rcx,8), %xmm3, %xmm5 + vaddsd 48(%rax), %xmm2, %xmm4 + leaq 48(%rax), %rdx + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, 40(%rax) + vmovsd (%rdx,%rsi,8), %xmm7 + vaddsd (%rdx,%rcx,8), %xmm8, %xmm11 + vaddsd 56(%rax), %xmm7, %xmm10 + leaq 56(%rax), %rdx + addq $64, %rax + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, -16(%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + vaddsd (%rax), %xmm14, %xmm15 + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm8 + vmovsd %xmm8, -8(%rax) + cmpq %r8, %rax + jne .L31 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L267: + addq %r13, %rdi + addq %rbx, %r11 + addq %rbx, %r9 + addq %rbx, %r14 + addq %r13, %r8 + cmpl %r10d, %r12d + jne .L33 +.L32: + leal 1(%r15), %r8d + cmpl 80(%rsp), %r8d + je .L28 + movl %r8d, %r15d + jmp .L29 +.L39: + movq 80(%rsp), %r10 + movq %r8, %rdi + subq %rsi, %r10 + subq $8, %r10 + shrq $3, %r10 + incq %r10 + movq %rsi, %rax + subq %r9, %rdi + andl $7, %r10d + je .L9 + cmpq $1, %r10 + je .L183 + cmpq $2, %r10 + je .L184 + cmpq $3, %r10 + je .L185 + cmpq $4, %r10 + je .L186 + cmpq $5, %r10 + je .L187 + cmpq $6, %r10 + je .L188 + movq $0x000000000, (%rsi) + movq $0x000000000, (%rsi,%rdi,8) + leaq 8(%rsi), %rax +.L188: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax +.L187: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax +.L186: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax +.L185: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax +.L184: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax +.L183: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + addq $8, %rax + cmpq 80(%rsp), %rax + je .L16 +.L9: + movq $0x000000000, (%rax) + movq $0x000000000, (%rax,%rdi,8) + movq $0x000000000, 8(%rax) + movq $0x000000000, 8(%rax,%rdi,8) + movq $0x000000000, 16(%rax) + movq $0x000000000, 16(%rax,%rdi,8) + movq $0x000000000, 24(%rax) + movq $0x000000000, 24(%rax,%rdi,8) + movq $0x000000000, 32(%rax) + movq $0x000000000, 32(%rax,%rdi,8) + movq $0x000000000, 40(%rax) + movq $0x000000000, 40(%rax,%rdi,8) + movq $0x000000000, 48(%rax) + movq $0x000000000, 48(%rax,%rdi,8) + movq $0x000000000, 56(%rax) + movq $0x000000000, 56(%rax,%rdi,8) + addq $64, %rax + cmpq 80(%rsp), %rax + jne .L9 + jmp .L16 +.L36: + xorl %r10d, %r10d + jmp .L2 + .p2align 4,,10 + .p2align 3 +.L28: + addl $2, %r15d + leaq 120(%rsp), %rsi + leaq 136(%rsp), %rdi + movl %r15d, 108(%rsp) + call timing_ + vmovsd 136(%rsp), %xmm9 + vsubsd 144(%rsp), %xmm9, %xmm3 + vcomisd .LC7(%rip), %xmm3 + jnb .L40 + 
cmpl $999999999, 80(%rsp) + jle .L25 +.L40: + movl 80(%rsp), %ebx + cmpl %ebx, %r15d + jle .L35 + movl %ebx, 108(%rsp) +.L35: + leaq 160(%rsp), %rdi + movabsq $25769803904, %r13 + vmovsd %xmm3, 88(%rsp) + movq $.LC0, 168(%rsp) + movl $72, 176(%rsp) + movq %r13, 160(%rsp) + call _gfortran_st_write + movl $14, %edx + movl $.LC8, %esi + leaq 160(%rsp), %rdi + call _gfortran_transfer_character_write + movl $4, %edx + leaq 108(%rsp), %rsi + leaq 160(%rsp), %rdi + call _gfortran_transfer_integer_write + movl $14, %edx + movl $.LC9, %esi + leaq 160(%rsp), %rdi + call _gfortran_transfer_character_write + decl %r12d + vxorpd %xmm2, %xmm2, %xmm2 + vcvtsi2sdl 52(%rsp), %xmm2, %xmm4 + vcvtsi2sdl %r12d, %xmm2, %xmm5 + vcvtsi2sdl 108(%rsp), %xmm2, %xmm8 + vmovsd 88(%rsp), %xmm11 + movl $8, %edx + vmulsd %xmm5, %xmm4, %xmm6 + vmulsd .LC10(%rip), %xmm8, %xmm7 + leaq 152(%rsp), %rsi + leaq 160(%rsp), %rdi + vmulsd %xmm7, %xmm6, %xmm10 + vdivsd %xmm11, %xmm10, %xmm12 + vmovsd %xmm12, 152(%rsp) + call _gfortran_transfer_real_write + movl $6, %edx + movl $.LC11, %esi + leaq 160(%rsp), %rdi + call _gfortran_transfer_character_write + leaq 160(%rsp), %rdi + call _gfortran_st_write_done + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + call _gfortran_stop_string +.L282: + vzeroupper + jmp .L10 +.L5: + testl %r11d, %r11d + js .L37 +.L284: + leal -2(%r11), %ecx + decl %r11d + movl %r11d, 52(%rsp) + jmp .L11 +.L6: + cmpl $0, 88(%rsp) + jns .L288 + movl 88(%rsp), %eax + xorl %r11d, %r11d + leal -2(%rax), %ecx + decl %eax + movl %eax, 52(%rsp) + jmp .L18 +.L19: + imulq $-8, %r14, %rax + leaq (%rdx,%r10,8), %r8 + addq %r13, %r10 + leaq (%rax,%r10,8), %rdi + movl 88(%rsp), %r10d + vmovsd .LC5(%rip), %xmm4 + leaq (%rax,%r13,8), %rsi + movl %r10d, %r9d + addq %rdx, %rdi + addq %rdx, %rsi + andl $7, %r9d + decl %r10d + vmovsd %xmm4, (%r8) + movl $1, %eax + vmovsd %xmm4, (%rdi) + movq $0x000000000, (%rdx) + movq $0x000000000, (%rsi) + jl .L45 + testl %r9d, %r9d + je .L24 + cmpl $1, %r9d + je .L207 + cmpl $2, %r9d + je .L208 + cmpl $3, %r9d + je .L209 + cmpl $4, %r9d + je .L210 + cmpl $5, %r9d + je .L211 + cmpl $6, %r9d + je .L212 + vmovsd %xmm4, 8(%r8) + vmovsd %xmm4, 8(%rdi) + movq $0x000000000, 8(%rdx) + movq $0x000000000, 8(%rsi) + movl $2, %eax +.L212: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax +.L211: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax +.L210: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax +.L209: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax +.L208: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax +.L207: + vmovsd %xmm4, (%r8,%rax,8) + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + incq %rax + cmpl %eax, 88(%rsp) + jl .L45 +.L24: + leaq 1(%rax), %r10 + vmovsd %xmm4, (%r8,%rax,8) + leaq 2(%rax), %r9 + vmovsd %xmm4, (%rdi,%rax,8) + movq $0x000000000, (%rdx,%rax,8) + movq $0x000000000, (%rsi,%rax,8) + vmovsd %xmm4, (%r8,%r10,8) + vmovsd %xmm4, (%rdi,%r10,8) + movq $0x000000000, (%rdx,%r10,8) + movq $0x000000000, (%rsi,%r10,8) + leaq 3(%rax), %r10 + vmovsd %xmm4, (%r8,%r9,8) + vmovsd 
%xmm4, (%rdi,%r9,8) + movq $0x000000000, (%rdx,%r9,8) + movq $0x000000000, (%rsi,%r9,8) + vmovsd %xmm4, (%r8,%r10,8) + leaq 4(%rax), %r9 + vmovsd %xmm4, (%rdi,%r10,8) + movq $0x000000000, (%rdx,%r10,8) + movq $0x000000000, (%rsi,%r10,8) + leaq 5(%rax), %r10 + vmovsd %xmm4, (%r8,%r9,8) + vmovsd %xmm4, (%rdi,%r9,8) + movq $0x000000000, (%rdx,%r9,8) + movq $0x000000000, (%rsi,%r9,8) + vmovsd %xmm4, (%r8,%r10,8) + leaq 6(%rax), %r9 + vmovsd %xmm4, (%rdi,%r10,8) + movq $0x000000000, (%rdx,%r10,8) + movq $0x000000000, (%rsi,%r10,8) + leaq 7(%rax), %r10 + addq $8, %rax + vmovsd %xmm4, (%r8,%r9,8) + vmovsd %xmm4, (%rdi,%r9,8) + movq $0x000000000, (%rdx,%r9,8) + movq $0x000000000, (%rsi,%r9,8) + vmovsd %xmm4, (%r8,%r10,8) + vmovsd %xmm4, (%rdi,%r10,8) + movq $0x000000000, (%rdx,%r10,8) + movq $0x000000000, (%rsi,%r10,8) + cmpl %eax, 88(%rsp) + jge .L24 +.L45: + incl %r11d + vzeroupper + jmp .L10 +.L37: + movl 88(%rsp), %r8d + xorl %r11d, %r11d + leal -2(%r8), %ecx + decl %r8d + movl %r8d, 52(%rsp) + jmp .L10 +.L287: + movl $.LC2, %edi + call _gfortran_os_error +.L286: + movl $.LC1, %edi + xorl %eax, %eax + call _gfortran_runtime_error +.L288: + movl 88(%rsp), %r11d + jmp .L284 + .cfi_endproc +.LFE0: + .size MAIN__, .-MAIN__ + .section .text.startup,"ax",@progbits + .p2align 4 + .globl main + .type main, @function +main: +.LFB1: + .cfi_startproc + subq $8, %rsp + .cfi_def_cfa_offset 16 + call _gfortran_set_args + movl $options.9.4008, %esi + movl $7, %edi + call _gfortran_set_options + call MAIN__ + .cfi_endproc +.LFE1: + .size main, .-main + .section .rodata + .align 16 + .type options.9.4008, @object + .size options.9.4008, 28 +options.9.4008: + .long 2116 + .long 4095 + .long 0 + .long 1 + .long 1 + .long 0 + .long 31 + .section .rodata.cst32,"aM",@progbits,32 + .align 32 +.LC4: + .long 0 + .long 1072693248 + .long 0 + .long 1072693248 + .long 0 + .long 1072693248 + .long 0 + .long 1072693248 + .section .rodata.cst8,"aM",@progbits,8 + .align 8 +.LC5: + .long 0 + .long 1072693248 + .align 8 +.LC6: + .long 0 + .long 1070596096 + .align 8 +.LC7: + .long 2576980378 + .long 1070176665 + .align 8 +.LC10: + .long 2696277389 + .long 1051772663 + .ident "GCC: (GNU) 9.1.0" + .section .note.GNU-stack,"",@progbits diff --git a/examples/gs/gs.s.csx.icc.s b/examples/gs/gs.s.csx.icc.s new file mode 100644 index 0000000..19295fd --- /dev/null +++ b/examples/gs/gs.s.csx.icc.s @@ -0,0 +1,1123 @@ +# mark_description "Intel(R) Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 2019"; +# mark_description "0815"; +# mark_description "-qopenmp-simd -fno-alias -unroll -fno-builtin -xCORE-AVX512 -qopt-zmm-usage=high -Ofast -S -use-msasm -o gs."; +# mark_description "s.csx.icc.s"; + .file "gs.f90" + .text +..TXTST0: +.L_2__routine_start_MAIN___0: +# -- Begin MAIN__ + .text +# mark_begin; + .align 16,0x90 + .globl MAIN__ +# --- HEAT +MAIN__: +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_MAIN__.1: +..L2: + #1.9 + pushq %rbp #1.9 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #1.9 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-128, %rsp #1.9 + pushq %r12 #1.9 + pushq %r13 #1.9 + pushq %r14 #1.9 + pushq %r15 #1.9 + pushq %rbx #1.9 + subq $216, %rsp #1.9 + movq $0x64199d9ffe, %rsi #1.9 + movl $3, %edi #1.9 + call __intel_new_feature_proc_init #1.9 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 
0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.95: # Preds ..B1.1 + # Execution count [1.00e+00] + vstmxcsr (%rsp) #1.9 + movl $__NLITPACK_0.0.1, %edi #1.9 + xorl %eax, %eax #1.9 + orl $32832, (%rsp) #1.9 + vldmxcsr (%rsp) #1.9 +..___tag_value_MAIN__.11: + call for_set_reentrancy #1.9 +..___tag_value_MAIN__.12: + # LOE +..B1.2: # Preds ..B1.95 + # Execution count [1.00e+00] + movl $-4, %esi #12.3 + lea 152(%rsp), %rax #12.3 + movq %rax, -24(%rax) #12.3 + lea (%rsp), %rdi #12.3 + movq $0x1208384ff00, %rdx #12.3 + movl $__STRLITPACK_3.0.1, %ecx #12.3 + xorl %eax, %eax #12.3 + lea 128(%rsp), %r8 #12.3 + movq $0, (%rdi) #12.3 +..___tag_value_MAIN__.13: + call for_read_seq_lis #12.3 +..___tag_value_MAIN__.14: + # LOE +..B1.3: # Preds ..B1.2 + # Execution count [1.00e+00] + movl $__STRLITPACK_4.0.1, %esi #12.3 + lea 156(%rsp), %rax #12.3 + movq %rax, -20(%rax) #12.3 + lea (%rsp), %rdi #12.3 + xorl %eax, %eax #12.3 + lea 136(%rsp), %rdx #12.3 +..___tag_value_MAIN__.15: + call for_read_seq_lis_xmit #12.3 +..___tag_value_MAIN__.16: + # LOE +..B1.4: # Preds ..B1.3 + # Execution count [1.00e+00] + movq 24+heat_$PHI.0.1(%rip), %r9 #15.3 + movq %r9, %r10 #15.3 + andq $-256, %r10 #15.3 + movq $0xf000000000, %r12 #15.3 + shrq $8, %r10 #15.3 + andq %r12, %r9 #15.3 + movl 152(%rsp), %r14d #14.3 + movq $0xffffff0fffffffff, %rbx #15.3 + movslq %r14d, %r12 #15.3 + xorl %esi, %esi #15.3 + shlq $63, %r10 #15.3 + movq %r12, %r15 #15.3 + shrq $55, %r10 #15.3 + movl $8, %r11d #15.3 + addq $133, %r10 #15.3 + sarq $63, %r15 #15.3 + andq %rbx, %r10 #15.3 + movl 156(%rsp), %r13d #13.3 + lea 1(%r12), %rbx #15.3 + andn %rbx, %r15, %rdx #15.3 + movslq %r13d, %rbx #15.3 + movq %rbx, %rdi #15.3 + sarq $63, %rdi #15.3 + shrq $36, %r9 #15.3 + lea (,%rdx,8), %r8 #15.3 + movq %r8, 80+heat_$PHI.0.1(%rip) #15.3 + lea 1(%rbx), %rax #15.3 + andn %rax, %rdi, %rcx #15.3 + lea 144(%rsp), %rdi #15.3 + imulq %rcx, %r8 #15.3 + shlq $60, %r9 #15.3 + xorl %eax, %eax #15.3 + shrq $24, %r9 #15.3 + movq %rsi, 16+heat_$PHI.0.1(%rip) #15.3 + orq %r9, %r10 #15.3 + movq %rsi, 64+heat_$PHI.0.1(%rip) #15.3 + movq %rsi, 88+heat_$PHI.0.1(%rip) #15.3 + movl $3, %esi #15.3 + movq %r8, 104+heat_$PHI.0.1(%rip) #15.3 + movl $16, %r8d #15.3 + movq %r10, 24+heat_$PHI.0.1(%rip) #15.3 + movq %r11, 8+heat_$PHI.0.1(%rip) #15.3 + movq $3, 32+heat_$PHI.0.1(%rip) #15.3 + movq %r11, 56+heat_$PHI.0.1(%rip) #15.3 + movq %rdx, 48+heat_$PHI.0.1(%rip) #15.3 + movq $1, 112+heat_$PHI.0.1(%rip) #15.3 + movq $2, 96+heat_$PHI.0.1(%rip) #15.3 + movq %rcx, 72+heat_$PHI.0.1(%rip) #15.3 +..___tag_value_MAIN__.17: + call for_check_mult_overflow64 #15.3 +..___tag_value_MAIN__.18: + # LOE rbx r12 eax r13d r14d +..B1.5: # Preds ..B1.4 + # Execution count [1.00e+00] + movq $0xfffffff00fffffff, %r8 #15.3 + movq $0xf000000000, %rcx #15.3 + andq 24+heat_$PHI.0.1(%rip), %r8 #15.3 + andl $1, %eax #15.3 + addq $1073741824, %r8 #15.3 + movl $heat_$PHI.0.1, %esi #15.3 + movq %r8, 24+heat_$PHI.0.1(%rip) #15.3 + andq %r8, %rcx #15.3 + movl %r8d, %edx #15.3 + andq $-256, %r8 #15.3 + shrq $8, %r8 #15.3 + andl $1, %edx #15.3 + shll $4, %eax #15.3 + addl %edx, %edx #15.3 + andl $1, %r8d #15.3 + orl %eax, %edx #15.3 + shll $21, %r8d #15.3 + xorl %eax, %eax 
#15.3 + shrq $36, %rcx #15.3 + orl %r8d, %edx #15.3 + andl $-31457281, %edx #15.3 + shll $21, %ecx #15.3 + orl %ecx, %edx #15.3 + addl $262144, %edx #15.3 + movq 144(%rsp), %rdi #15.3 +..___tag_value_MAIN__.19: + call for_alloc_allocatable #15.3 +..___tag_value_MAIN__.20: + # LOE rbx r12 r13d r14d +..B1.6: # Preds ..B1.5 + # Execution count [1.00e+00] + xorl %r8d, %r8d #21.3 + lea -1(%r13), %eax #21.3 + movl %eax, 96(%rsp) #21.3[spill] + testl %eax, %eax #21.3 + jle ..B1.31 # Prob 2% #21.3 + # LOE rbx r8 r12 r13d r14d +..B1.7: # Preds ..B1.6 + # Execution count [9.79e-01] + movq heat_$PHI.0.1(%rip), %r9 #23.9 + lea -1(%r14), %r15d #22.6 + movq 104+heat_$PHI.0.1(%rip), %rcx #23.9 + lea -1(%rbx), %r11 #21.3 + movq 80+heat_$PHI.0.1(%rip), %r10 #23.9 + xorl %edx, %edx #21.3 + movslq %r15d, %rdi #22.6 + vmovdqu .L_2il0floatpacket.0(%rip), %ymm2 #22.6 + lea (%r9,%rcx,2), %rsi #24.9 + vmovdqu .L_2il0floatpacket.1(%rip), %ymm0 #22.6 + movl %r14d, 104(%rsp) #21.3[spill] + movq %rbx, 112(%rsp) #21.3[spill] + movq %r12, 120(%rsp) #21.3[spill] + movl %r13d, 64(%rsp) #21.3[spill] + vpxord %zmm1, %zmm1, %zmm1 #23.9 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 +..B1.8: # Preds ..B1.29 ..B1.7 + # Execution count [5.00e+00] + testl %r15d, %r15d #22.6 + jle ..B1.29 # Prob 50% #22.6 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 +..B1.9: # Preds ..B1.8 + # Execution count [4.89e+00] + movq %rdi, 72(%rsp) #[spill] + movq %r11, 80(%rsp) #[spill] + # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 +..B1.10: # Preds ..B1.90 ..B1.9 + # Execution count [5.33e+00] + cmpl $16, %r15d #22.6 + jl ..B1.92 # Prob 10% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 +..B1.11: # Preds ..B1.10 + # Execution count [5.33e+00] + movq %r10, %r12 #24.9 + subq %rcx, %r12 #24.9 + lea 8(%r12,%rsi), %rbx #22.6 + addq %rdx, %rbx #22.6 + andq $63, %rbx #22.6 + testb $7, %bl #22.6 + je ..B1.13 # Prob 50% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 +..B1.12: # Preds ..B1.11 + # Execution count [2.66e+00] + xorl %ebx, %ebx #22.6 + jmp ..B1.15 # Prob 100% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 +..B1.13: # Preds ..B1.11 + # Execution count [2.66e+00] + testl %ebx, %ebx #22.6 + je ..B1.15 # Prob 50% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 +..B1.14: # Preds ..B1.13 + # Execution count [2.96e+01] + negl %ebx #22.6 + addl $64, %ebx #22.6 + shrl $3, %ebx #22.6 + cmpl %ebx, %r15d #22.6 + cmovl %r15d, %ebx #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 +..B1.15: # Preds ..B1.12 ..B1.14 ..B1.13 + # Execution count [5.44e+00] + movl %r15d, %eax #22.6 + subl %ebx, %eax #22.6 + andl $15, %eax #22.6 + negl %eax #22.6 + addl %r15d, %eax #22.6 + cmpl $1, %ebx #22.6 + jb ..B1.20 # Prob 50% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 eax ebx r15d ymm0 ymm2 zmm1 +..B1.17: # Preds ..B1.15 + # Execution count [5.33e+00] + vmovdqa %ymm2, %ymm4 #22.6 + lea (%r12,%rcx,2), %r13 #24.9 + addq %r9, %r13 #24.9 + lea (%r10,%r9), %r11 #23.9 + vpbroadcastd %ebx, %ymm3 #22.6 + xorl %r14d, %r14d #22.6 + movslq %ebx, %rdi #22.6 + addq %rdx, %r13 #24.9 + addq %rdx, %r11 #23.9 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax ebx r15d ymm0 ymm2 ymm3 ymm4 zmm1 +..B1.18: # Preds ..B1.18 ..B1.17 + # Execution count [2.96e+01] + vpcmpgtd %ymm4, %ymm3, %k1 #22.6 + vpaddd %ymm0, %ymm4, %ymm4 #22.6 + vmovupd %zmm1, 8(%r11,%r14,8){%k1} #23.9 + vmovupd %zmm1, 8(%r13,%r14,8){%k1} #24.9 + addq $8, %r14 #22.6 + cmpq %rdi, %r14 #22.6 + jb ..B1.18 # Prob 82% #22.6 + # LOE 
rdx rcx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax ebx r15d ymm0 ymm2 ymm3 ymm4 zmm1 +..B1.19: # Preds ..B1.18 + # Execution count [5.33e+00] + cmpl %ebx, %r15d #22.6 + je ..B1.90 # Prob 10% #22.6 + # LOE rdx rcx rsi r8 r9 r10 r12 eax ebx r15d ymm0 ymm2 zmm1 +..B1.20: # Preds ..B1.15 ..B1.19 + # Execution count [4.79e+00] + movq 72(%rsp), %rdi #[spill] + movq 80(%rsp), %r11 #[spill] + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 eax ebx r15d ymm0 ymm2 zmm1 +..B1.21: # Preds ..B1.20 + # Execution count [2.96e+01] + lea 16(%rbx), %r13d #22.6 + cmpl %r13d, %eax #22.6 + jl ..B1.25 # Prob 50% #22.6 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 eax ebx r15d ymm0 ymm2 zmm1 +..B1.22: # Preds ..B1.21 + # Execution count [5.33e+00] + movslq %ebx, %rbx #22.6 + lea (%r12,%rcx,2), %r14 #24.9 + addq %r9, %r14 #24.9 + lea (%r10,%r9), %r13 #23.9 + movslq %eax, %r12 #22.6 + addq %rdx, %r14 #24.9 + addq %rdx, %r13 #23.9 + # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax r15d ymm0 ymm2 zmm1 +..B1.23: # Preds ..B1.23 ..B1.22 + # Execution count [2.96e+01] + vmovupd %zmm1, 8(%r13,%rbx,8) #23.9 + vmovupd %zmm1, 8(%r14,%rbx,8) #24.9 + vmovupd %zmm1, 72(%r13,%rbx,8) #23.9 + vmovupd %zmm1, 72(%r14,%rbx,8) #24.9 + addq $16, %rbx #22.6 + cmpq %r12, %rbx #22.6 + jb ..B1.23 # Prob 82% #22.6 + # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax r15d ymm0 ymm2 zmm1 +..B1.25: # Preds ..B1.23 ..B1.21 ..B1.92 + # Execution count [5.44e+00] + lea 1(%rax), %ebx #22.6 + cmpl %r15d, %ebx #22.6 + ja ..B1.29 # Prob 50% #22.6 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 eax r15d ymm0 ymm2 zmm1 +..B1.26: # Preds ..B1.25 + # Execution count [5.33e+00] + movq %r10, %rbx #23.9 + lea (%rcx,%r9), %r14 #23.9 + subq %rcx, %rbx #23.9 + xorl %r13d, %r13d #22.6 + movslq %eax, %r12 #23.9 + negl %eax #22.6 + addl %r15d, %eax #22.6 + vpbroadcastd %eax, %ymm3 #22.6 + vmovdqa %ymm2, %ymm4 #22.6 + lea (%rsi,%rbx), %rax #24.9 + addq %r14, %rbx #23.9 + addq %rdx, %rax #24.9 + addq %rdx, %rbx #23.9 + lea (%rax,%r12,8), %rax #24.9 + lea (%rbx,%r12,8), %rbx #23.9 + negq %r12 #22.6 + addq %rdi, %r12 #22.6 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r15d ymm0 ymm2 ymm3 ymm4 zmm1 +..B1.27: # Preds ..B1.27 ..B1.26 + # Execution count [2.96e+01] + vpcmpgtd %ymm4, %ymm3, %k1 #22.6 + vpaddd %ymm0, %ymm4, %ymm4 #22.6 + vmovupd %zmm1, 8(%rbx,%r13,8){%k1} #23.9 + vmovupd %zmm1, 8(%rax,%r13,8){%k1} #24.9 + addq $8, %r13 #22.6 + cmpq %r12, %r13 #22.6 + jb ..B1.27 # Prob 82% #22.6 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r15d ymm0 ymm2 ymm3 ymm4 zmm1 +..B1.29: # Preds ..B1.27 ..B1.8 ..B1.25 + # Execution count [4.91e+00] + incq %r8 #21.3 + addq %r10, %rdx #21.3 + cmpq %r11, %r8 #21.3 + jb ..B1.8 # Prob 82% #21.3 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 +..B1.30: # Preds ..B1.90 ..B1.29 + # Execution count [8.83e-01] + movl 104(%rsp), %r14d #[spill] + movq 112(%rsp), %rbx #[spill] + movq 120(%rsp), %r12 #[spill] + movl 64(%rsp), %r13d #[spill] + # LOE rbx r12 r13d r14d +..B1.31: # Preds ..B1.6 ..B1.30 + # Execution count [1.00e+00] + xorl %eax, %eax #29.3 + testl %r14d, %r14d #29.3 + jl ..B1.40 # Prob 50% #29.3 + # LOE rbx r12 eax r13d r14d +..B1.32: # Preds ..B1.31 + # Execution count [4.35e-01] + movq 80+heat_$PHI.0.1(%rip), %r8 #30.6 + lea 1(%r14), %edx #14.3 + movq 104+heat_$PHI.0.1(%rip), %rdi #30.6 + movq heat_$PHI.0.1(%rip), %rcx #30.6 + cmpl $8, %edx #29.3 + jl ..B1.89 # Prob 10% #29.3 + # LOE rcx rbx rdi r8 r12 eax edx r13d r14d +..B1.33: # Preds ..B1.32 + # Execution count [4.35e-01] + movq %rbx, %r10 #30.6 + movq %rcx, %rax 
#31.6 + imulq %r8, %r10 #30.6 + vmovupd .L_2il0floatpacket.2(%rip), %ymm1 #30.6 + subq %rdi, %rax #31.6 + movl %edx, %esi #29.3 + andl $-8, %esi #29.3 + subq %rdi, %r10 #30.6 + vxorpd %ymm0, %ymm0, %ymm0 #31.6 + lea (%rdi,%rcx), %r9 #30.6 + xorl %r11d, %r11d #29.3 + lea (%rcx,%rdi,2), %r15 #30.6 + addq %r10, %r9 #30.6 + lea (%rax,%rdi,2), %rax #31.6 + addq %r15, %r10 #30.6 + movslq %esi, %r15 #29.3 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r15 edx esi r13d r14d ymm0 ymm1 +..B1.34: # Preds ..B1.34 ..B1.33 + # Execution count [4.90e+00] + vmovupd %ymm1, (%r9,%r11,8) #30.6 + vmovupd %ymm0, (%rcx,%r11,8) #31.6 + vmovupd %ymm1, (%r10,%r11,8) #30.6 + vmovupd %ymm0, (%rax,%r11,8) #31.6 + vmovupd %ymm1, 32(%r9,%r11,8) #30.6 + vmovupd %ymm0, 32(%rcx,%r11,8) #31.6 + vmovupd %ymm1, 32(%r10,%r11,8) #30.6 + vmovupd %ymm0, 32(%rax,%r11,8) #31.6 + addq $8, %r11 #29.3 + cmpq %r15, %r11 #29.3 + jb ..B1.34 # Prob 91% #29.3 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r15 edx esi r13d r14d ymm0 ymm1 +..B1.35: # Preds ..B1.34 + # Execution count [4.35e-01] + movl %esi, %eax #32.3 + # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d +..B1.36: # Preds ..B1.35 ..B1.89 + # Execution count [1.00e+00] + lea 1(%rsi), %r9d #29.3 + cmpl %edx, %r9d #29.3 + ja ..B1.40 # Prob 50% #29.3 + # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d +..B1.37: # Preds ..B1.36 + # Execution count [4.35e-01] + imulq %rbx, %r8 #30.6 + vmovupd .L_2il0floatpacket.2(%rip), %ymm2 #30.6 + vmovdqu .L_2il0floatpacket.5(%rip), %xmm4 #29.3 + vmovdqu .L_2il0floatpacket.6(%rip), %xmm3 #29.3 + movq %r8, %r9 #30.6 + movq %rcx, %r11 #31.6 + subq %rdi, %r9 #30.6 + subq %rdi, %r11 #31.6 + addq %rcx, %r8 #30.6 + xorl %r10d, %r10d #29.3 + vxorpd %ymm1, %ymm1, %ymm1 #31.6 + lea (%r9,%rdi,2), %rax #30.6 + addq %rcx, %rax #30.6 + lea (%r11,%rdi,2), %r15 #31.6 + movslq %esi, %rdi #30.6 + negl %esi #29.3 + addl %edx, %esi #29.3 + vpbroadcastd %esi, %xmm0 #29.3 + lea (%rcx,%rdi,8), %r11 #31.6 + movslq %edx, %rcx #29.3 + subq %rdi, %rcx #29.3 + lea (%r15,%rdi,8), %r9 #31.6 + lea (%rax,%rdi,8), %rax #30.6 + lea (%r8,%rdi,8), %r8 #30.6 + # LOE rax rcx rbx r8 r9 r10 r11 r12 edx r13d r14d xmm0 xmm3 xmm4 ymm1 ymm2 +..B1.38: # Preds ..B1.38 ..B1.37 + # Execution count [4.90e+00] + vpcmpgtd %xmm3, %xmm0, %k1 #29.3 + vpaddd %xmm4, %xmm3, %xmm3 #29.3 + vmovupd %ymm2, (%r8,%r10,8){%k1} #30.6 + vmovupd %ymm1, (%r11,%r10,8){%k1} #31.6 + vmovupd %ymm2, (%rax,%r10,8){%k1} #30.6 + vmovupd %ymm1, (%r9,%r10,8){%k1} #31.6 + addq $4, %r10 #29.3 + cmpq %rcx, %r10 #29.3 + jb ..B1.38 # Prob 91% #29.3 + # LOE rax rcx rbx r8 r9 r10 r11 r12 edx r13d r14d xmm0 xmm3 xmm4 ymm1 ymm2 +..B1.39: # Preds ..B1.38 + # Execution count [4.35e-01] + movl %edx, %eax #32.3 + # LOE rbx r12 eax r13d r14d +..B1.40: # Preds ..B1.39 ..B1.36 ..B1.31 + # Execution count [1.00e+00] + testl %r13d, %r13d #33.3 + jl ..B1.49 # Prob 50% #33.3 + # LOE rbx r12 eax r13d r14d +..B1.41: # Preds ..B1.40 + # Execution count [4.35e-01] + movq 80+heat_$PHI.0.1(%rip), %r9 #34.6 + incl %r13d #13.3 + movq 104+heat_$PHI.0.1(%rip), %r15 #34.6 + movl 152(%rsp), %r11d #34.27 + movq heat_$PHI.0.1(%rip), %r10 #34.6 + testq %r9, %r9 #55.82 + je ..B1.79 # Prob 10% #55.82 + # LOE rbx r9 r10 r12 r15 eax r11d r13d r14d +..B1.42: # Preds ..B1.41 + # Execution count [4.35e-01] + cmpl $8, %r13d #33.3 + jl ..B1.78 # Prob 10% #33.3 + # LOE rbx r9 r10 r12 r15 eax r11d r13d r14d +..B1.43: # Preds ..B1.42 + # Execution count [4.35e-01] + vxorpd %xmm1, %xmm1, %xmm1 #34.19 + vxorpd %xmm0, %xmm0, %xmm0 #34.27 + vcvtsi2sd %eax, %xmm1, %xmm1 #34.19 + 
vcvtsi2sd %r11d, %xmm0, %xmm0 #34.27 + vpbroadcastd %r9d, %zmm3 #34.6 + vdivsd %xmm0, %xmm1, %xmm2 #34.6 + movq %r10, %rsi #34.6 + movl %r13d, %r8d #33.3 + subq %r15, %rsi #34.6 + andl $-8, %r8d #33.3 + movslq %r8d, %r8 #33.3 + lea (,%r12,8), %rdi #35.6 + xorl %ecx, %ecx #33.3 + subq %r15, %rdi #35.6 + movl %r11d, 80(%rsp) #34.6[spill] + lea (%rsi,%r15,2), %rdx #34.6 + movq %rdx, 64(%rsp) #34.6[spill] + lea (%r15,%r10), %rsi #35.6 + movl %eax, 88(%rsp) #34.6[spill] + lea (%r10,%r15,2), %rdx #35.6 + vbroadcastsd %xmm2, %zmm1 #34.6 + addq %rdi, %rsi #35.6 + vpmuldq .L_2il0floatpacket.8(%rip), %zmm3, %zmm0 #34.6 + movq %r15, 72(%rsp) #34.6[spill] + addq %rdx, %rdi #35.6 + movq %r8, %r11 #34.6 + xorl %edx, %edx #33.3 + movq 64(%rsp), %rax #34.6[spill] + .align 16,0x90 + # LOE rax rdx rcx rbx rsi rdi r9 r10 r11 r12 r8d r13d r14d zmm0 zmm1 +..B1.44: # Preds ..B1.44 ..B1.43 + # Execution count [4.90e+00] + vpcmpeqb %xmm0, %xmm0, %k1 #34.6 + lea (%r10,%rdx), %r15 #34.6 + vpcmpeqb %xmm0, %xmm0, %k2 #35.6 + vpcmpeqb %xmm0, %xmm0, %k3 #34.6 + vpcmpeqb %xmm0, %xmm0, %k4 #35.6 + vscatterqpd %zmm1, (%r15,%zmm0){%k1} #34.6 + addq $8, %rcx #33.3 + lea (%rsi,%rdx), %r15 #35.6 + vscatterqpd %zmm1, (%r15,%zmm0){%k2} #35.6 + lea (%rax,%rdx), %r15 #34.6 + vscatterqpd %zmm1, (%r15,%zmm0){%k3} #34.6 + lea (%rdi,%rdx), %r15 #35.6 + vscatterqpd %zmm1, (%r15,%zmm0){%k4} #35.6 + lea (%rdx,%r9,8), %rdx #33.3 + cmpq %r11, %rcx #33.3 + jb ..B1.44 # Prob 91% #33.3 + # LOE rax rdx rcx rbx rsi rdi r9 r10 r11 r12 r8d r13d r14d zmm0 zmm1 +..B1.45: # Preds ..B1.44 + # Execution count [4.35e-01] + movq 72(%rsp), %r15 #[spill] + movl 80(%rsp), %r11d #[spill] + movl 88(%rsp), %eax #[spill] + # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d +..B1.46: # Preds ..B1.45 ..B1.78 + # Execution count [9.56e-01] + lea 1(%r8), %edx #33.3 + cmpl %r13d, %edx #33.3 + ja ..B1.49 # Prob 50% #33.3 + # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d +..B1.47: # Preds ..B1.46 + # Execution count [4.35e-01] + vxorpd %xmm1, %xmm1, %xmm1 #34.19 + vxorpd %xmm2, %xmm2, %xmm2 #34.27 + vcvtsi2sd %eax, %xmm1, %xmm1 #34.19 + vcvtsi2sd %r11d, %xmm2, %xmm2 #34.27 + vdivsd %xmm2, %xmm1, %xmm3 #34.6 + subl %r8d, %r13d #33.3 + movq %r10, %rax #34.6 + movslq %r8d, %r8 #34.6 + lea (,%r12,8), %rdi #35.6 + imulq %r9, %r8 #34.6 + vpbroadcastd %r13d, %ymm0 #33.3 + vpbroadcastd %r9d, %zmm4 #34.6 + vbroadcastsd %xmm3, %zmm6 #34.6 + vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k4 #33.3 + vpmuldq .L_2il0floatpacket.8(%rip), %zmm4, %zmm5 #34.6 + subq %r15, %rax #34.6 + subq %r15, %rdi #35.6 + kmovw %k4, %k1 #34.6 + lea (%r15,%r10), %rcx #35.6 + addq %rdi, %rcx #35.6 + lea (%r10,%r8), %rdx #34.6 + kmovw %k4, %k2 #35.6 + lea (%r10,%r15,2), %r10 #35.6 + addq %r10, %rdi #35.6 + lea (%rax,%r15,2), %rsi #34.6 + addq %r8, %rcx #35.6 + addq %r8, %rsi #34.6 + addq %r8, %rdi #35.6 + kmovw %k4, %k3 #34.6 + vscatterqpd %zmm6, (%rdx,%zmm5){%k1} #34.6 + vscatterqpd %zmm6, (%rcx,%zmm5){%k2} #35.6 + vscatterqpd %zmm6, (%rsi,%zmm5){%k3} #34.6 + vscatterqpd %zmm6, (%rdi,%zmm5){%k4} #35.6 + # LOE rbx r12 r14d +..B1.49: # Preds ..B1.79 ..B1.40 ..B1.80 ..B1.83 ..B1.47 + # ..B1.46 + # Execution count [8.00e-01] + decl %r14d #54.9 + decq %rbx #53.6 + movl %r14d, %r13d #54.9 + decq %r12 #54.9 + shrl $2, %r13d #54.9 + movl $10, %r15d #43.3 + movl %r13d, %eax #54.9 + movq %rbx, 112(%rsp) #54.9[spill] + vmovsd .L_2il0floatpacket.3(%rip), %xmm1 #44.17 + vmovsd .L_2il0floatpacket.4(%rip), %xmm0 #55.31 + movq %rax, 80(%rsp) #54.9[spill] + movq %r12, 120(%rsp) #54.9[spill] + movl 96(%rsp), %ebx 
#54.9[spill] + # LOE ebx r13d r14d r15d +..B1.50: # Preds ..B1.87 ..B1.49 ..B1.69 + # Execution count [2.33e+00] + xorl %eax, %eax #47.8 + lea 168(%rsp), %rdi #47.8 + addl %r15d, %r15d #45.3 + lea 176(%rsp), %rsi #47.8 + vzeroupper #47.8 +..___tag_value_MAIN__.46: + call timing_ #47.8 +..___tag_value_MAIN__.47: + # LOE ebx r13d r14d r15d +..B1.51: # Preds ..B1.50 + # Execution count [2.28e+00] + movl $1, %r12d #50.3 + testl %r15d, %r15d #50.3 + jle ..B1.86 # Prob 0% #50.3 + # LOE ebx r12d r13d r14d r15d +..B1.52: # Preds ..B1.51 + # Execution count [2.28e+00] + movq 80+heat_$PHI.0.1(%rip), %rsi #55.35 + xorl %r10d, %r10d #50.3 + movq heat_$PHI.0.1(%rip), %r9 #55.12 + movq %rsi, %rcx #55.50 + movq 104+heat_$PHI.0.1(%rip), %rax #55.35 + subq %rax, %rcx #55.50 + addq %r9, %rax #55.50 + xorl %r11d, %r11d #55.66 + vmovsd .L_2il0floatpacket.4(%rip), %xmm0 #55.66 + lea (%rsi,%r9), %rdi #55.50 + addq %rax, %rcx #55.50 + lea (%r9,%rsi,2), %r8 #55.66 + # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 +..B1.53: # Preds ..B1.66 ..B1.52 + # Execution count [1.27e+01] + movq %r11, %rdx #53.6 + movq %rdx, %rax #53.6 + testl %ebx, %ebx #53.6 + jle ..B1.66 # Prob 2% #53.6 + # LOE rax rdx rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 +..B1.54: # Preds ..B1.53 + # Execution count [1.24e+01] + movl %r10d, 64(%rsp) #[spill] + movl %r15d, 72(%rsp) #[spill] + # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 +..B1.55: # Preds ..B1.64 ..B1.54 + # Execution count [6.88e+01] + testl %r14d, %r14d #54.9 + jle ..B1.64 # Prob 50% #54.9 + # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 +..B1.56: # Preds ..B1.55 + # Execution count [6.88e+01] + xorl %r15d, %r15d #54.9 + movl $1, %r12d #54.9 + xorl %r11d, %r11d #54.9 + testl %r13d, %r13d #54.9 + je ..B1.60 # Prob 2% #54.9 + # LOE rax rdx rcx rsi rdi r8 r9 r11 r15 r12d r13d r14d xmm0 +..B1.57: # Preds ..B1.56 + # Execution count [6.74e+01] + movl %r14d, 104(%rsp) #55.66[spill] + lea (%rdi,%rax), %r12 #55.50 + vmovsd (%rax,%rcx), %xmm1 #55.50 + lea (%r9,%rax), %r10 #55.35 + movq 80(%rsp), %r14 #55.66[spill] + lea (%r8,%rax), %rbx #55.66 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.58: # Preds ..B1.58 ..B1.57 + # Execution count [9.36e+01] + vmovsd 8(%r11,%r10), %xmm2 #55.35 + incq %r15 #54.9 + vaddsd 16(%r11,%r12), %xmm2, %xmm3 #55.12 + vaddsd 8(%r11,%rbx), %xmm3, %xmm4 #55.12 + vaddsd %xmm1, %xmm4, %xmm1 #55.12 + vmulsd %xmm1, %xmm0, %xmm5 #55.12 + vmovsd %xmm5, 8(%r11,%r12) #55.12 + vaddsd 16(%r11,%r10), %xmm5, %xmm6 #55.48 + vaddsd 24(%r11,%r12), %xmm6, %xmm7 #55.63 + vaddsd 16(%r11,%rbx), %xmm7, %xmm8 #55.79 + vmulsd %xmm8, %xmm0, %xmm9 #55.12 + vmovsd %xmm9, 16(%r11,%r12) #55.12 + vaddsd 24(%r11,%r10), %xmm9, %xmm10 #55.48 + vaddsd 32(%r11,%r12), %xmm10, %xmm11 #55.63 + vaddsd 24(%r11,%rbx), %xmm11, %xmm12 #55.79 + vmulsd %xmm12, %xmm0, %xmm13 #55.12 + vmovsd %xmm13, 24(%r11,%r12) #55.12 + vaddsd 32(%r11,%r10), %xmm13, %xmm14 #55.48 + vaddsd 40(%r11,%r12), %xmm14, %xmm15 #55.63 + vaddsd 32(%r11,%rbx), %xmm15, %xmm16 #55.79 + vmulsd %xmm16, %xmm0, %xmm1 #55.12 + vmovsd %xmm1, 32(%r11,%r12) #55.12 + addq $32, %r11 #54.9 + cmpq %r14, %r15 #54.9 + jb ..B1.58 # Prob 28% #54.9 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # 
INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 +..B1.59: # Preds ..B1.58 + # Execution count [6.74e+01] + movl 104(%rsp), %r14d #[spill] + lea 1(,%r15,4), %r12d #55.12 + # LOE rax rdx rcx rsi rdi r8 r9 r12d r13d r14d xmm0 +..B1.60: # Preds ..B1.59 ..B1.56 + # Execution count [6.88e+01] + movslq %r12d, %r12 #54.9 + decq %r12 #54.9 + cmpq 120(%rsp), %r12 #54.9[spill] + jae ..B1.64 # Prob 2% #54.9 + # LOE rax rdx rcx rsi rdi r8 r9 r12 r13d r14d xmm0 +..B1.61: # Preds ..B1.60 + # Execution count [6.74e+01] + movq 120(%rsp), %r15 #55.66[spill] + lea (%rdi,%rax), %r11 #55.50 + lea (%r9,%rax), %r10 #55.35 + lea (%r8,%rax), %rbx #55.66 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r15 r13d r14d xmm0 +..B1.62: # Preds ..B1.62 ..B1.61 + # Execution count [2.02e+02] + vmovsd 8(%r10,%r12,8), %xmm1 #55.35 + vaddsd 16(%r11,%r12,8), %xmm1, %xmm2 #55.48 + vaddsd 8(%rbx,%r12,8), %xmm2, %xmm3 #55.63 + vaddsd (%r11,%r12,8), %xmm3, %xmm4 #55.79 + vmulsd %xmm4, %xmm0, %xmm5 #55.12 + vmovsd %xmm5, 8(%r11,%r12,8) #55.12 + incq %r12 #54.9 + cmpq %r15, %r12 #54.9 + jb ..B1.62 # Prob 66% #54.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r15 r13d r14d xmm0 +..B1.64: # Preds ..B1.62 ..B1.55 ..B1.60 + # Execution count [6.88e+01] + incq %rdx #53.6 + addq %rsi, %rax #53.6 + cmpq 112(%rsp), %rdx #53.6[spill] + jb ..B1.55 # Prob 82% #53.6 + # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 +..B1.65: # Preds ..B1.64 + # Execution count [1.24e+01] + movl 64(%rsp), %r10d #[spill] + xorl %r11d, %r11d # + movl 72(%rsp), %r15d #[spill] + movl 96(%rsp), %ebx #[spill] + # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 +..B1.66: # Preds ..B1.65 ..B1.53 + # Execution count [1.27e+01] + incl %r10d #50.3 + cmpl %r15d, %r10d #50.3 + jb ..B1.53 # Prob 82% #50.3 + # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 +..B1.67: # Preds ..B1.66 + # Execution count [2.28e+00] + xorl %eax, %eax #66.8 + lea 184(%rsp), %rdi #66.8 + lea 160(%rsp), %rsi #66.8 + lea 1(%r15), %r12d #50.3 +..___tag_value_MAIN__.59: + call timing_ #66.8 +..___tag_value_MAIN__.60: + # LOE ebx r12d r13d r14d r15d +..B1.68: # Preds ..B1.67 + # Execution count [2.33e+00] + vmovsd 184(%rsp), %xmm16 #67.3 + vmovsd .L_2il0floatpacket.3(%rip), %xmm0 #44.17 + vsubsd 168(%rsp), %xmm16, %xmm1 #67.3 + vcomisd %xmm1, %xmm0 #44.17 + jbe ..B1.71 # Prob 18% #44.17 + # LOE ebx r12d r13d r14d r15d +..B1.69: # Preds ..B1.68 + # Execution count [1.91e+00] + cmpl $1000000000, %r15d #44.36 + jl ..B1.50 # Prob 80% #44.36 + # LOE ebx r12d r13d r14d r15d +..B1.71: # Preds ..B1.87 ..B1.68 ..B1.69 + # Execution count [1.00e+00] + cmpl %r12d, %r15d #70.8 + lea (%rsp), %rdi #72.3 + movq $0x1208384ff00, %rdx #72.3 + movl $__STRLITPACK_5.0.1, %ecx #72.3 + lea 64(%rsp), %r8 #72.3 + cmovl %r15d, %r12d #70.8 + movl $-1, %esi #70.8 + xorl %eax, %eax #70.8 + movq $0, (%rdi) #72.3 + movq $14, 64(%rdi) #72.3 + movq $__STRLITPACK_2, 72(%rdi) #72.3 +..___tag_value_MAIN__.61: + call for_write_seq_lis #72.3 +..___tag_value_MAIN__.62: + # LOE r12d +..B1.72: # Preds ..B1.71 + # Execution count [1.00e+00] + movl $__STRLITPACK_6.0.1, %esi #72.3 + lea (%rsp), %rdi #72.3 + xorl %eax, %eax #72.3 + lea 112(%rsp), %rdx #72.3 + movl %r12d, (%rdx) #72.3 +..___tag_value_MAIN__.63: + call for_write_seq_lis_xmit #72.3 +..___tag_value_MAIN__.64: + # LOE r12d +..B1.73: # Preds ..B1.72 + # Execution count [1.00e+00] + movl $__STRLITPACK_7.0.1, %esi #72.3 + lea (%rsp), %rdi #72.3 + 
xorl %eax, %eax #72.3 + lea 80(%rsp), %rdx #72.3 + movq $14, (%rdx) #72.3 + movq $__STRLITPACK_1, 8(%rdx) #72.3 +..___tag_value_MAIN__.65: + call for_write_seq_lis_xmit #72.3 +..___tag_value_MAIN__.66: + # LOE r12d +..B1.74: # Preds ..B1.73 + # Execution count [1.00e+00] + movl 152(%rsp), %eax #72.3 + vxorpd %xmm0, %xmm0, %xmm0 #72.49 + decl %eax #72.49 + vxorpd %xmm2, %xmm2, %xmm2 #72.60 + vcvtsi2sd %eax, %xmm0, %xmm0 #72.49 + movl 156(%rsp), %edx #72.49 + vxorpd %xmm7, %xmm7, %xmm7 #72.71 + decl %edx #72.60 + lea (%rsp), %rdi #72.3 + vcvtsi2sd %edx, %xmm2, %xmm2 #72.60 + vcvtsi2sd %r12d, %xmm7, %xmm7 #72.71 + vmulsd .L_2il0floatpacket.7(%rip), %xmm0, %xmm1 #72.59 + vmovsd 184(%rdi), %xmm3 #72.70 + lea 120(%rsp), %rdx #72.3 + vmulsd %xmm2, %xmm1, %xmm4 #72.70 + vsubsd 48(%rdx), %xmm3, %xmm5 #72.83 + vdivsd %xmm5, %xmm4, %xmm6 #72.79 + vmulsd %xmm7, %xmm6, %xmm8 #72.3 + movl $__STRLITPACK_8.0.1, %esi #72.3 + xorl %eax, %eax #72.3 + vmovsd %xmm8, (%rdx) #72.3 +..___tag_value_MAIN__.67: + call for_write_seq_lis_xmit #72.3 +..___tag_value_MAIN__.68: + # LOE +..B1.75: # Preds ..B1.74 + # Execution count [1.00e+00] + movl $__STRLITPACK_9.0.1, %esi #72.3 + lea (%rsp), %rdi #72.3 + xorl %eax, %eax #72.3 + lea 96(%rsp), %rdx #72.3 + movq $6, (%rdx) #72.3 + movq $__STRLITPACK_0, 8(%rdx) #72.3 +..___tag_value_MAIN__.69: + call for_write_seq_lis_xmit #72.3 +..___tag_value_MAIN__.70: + # LOE +..B1.76: # Preds ..B1.75 + # Execution count [1.00e+00] + xorl %esi, %esi #73.3 + movl $__STRLITPACK_10, %edi #73.3 + movq $0x1208384ff00, %rdx #73.3 + xorl %ecx, %ecx #73.3 + xorl %r8d, %r8d #73.3 + xorl %eax, %eax #73.3 +..___tag_value_MAIN__.71: + call for_stop_core #73.3 +..___tag_value_MAIN__.72: + # LOE +..B1.77: # Preds ..B1.76 + # Execution count [1.00e+00] + xorl %eax, %eax #74.3 + addq $216, %rsp #74.3 + .cfi_restore 3 + popq %rbx #74.3 + .cfi_restore 15 + popq %r15 #74.3 + .cfi_restore 14 + popq %r14 #74.3 + .cfi_restore 13 + popq %r13 #74.3 + .cfi_restore 12 + popq %r12 #74.3 + movq %rbp, %rsp #74.3 + popq %rbp #74.3 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #74.3 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.78: # Preds ..B1.42 + # Execution count [4.35e-02]: Infreq + xorl %r8d, %r8d #33.3 + jmp ..B1.46 # Prob 100% #33.3 + # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d +..B1.79: # Preds ..B1.41 + # Execution count [4.35e-02]: Infreq + cmpl $1, %r13d #33.3 + jb ..B1.49 # Prob 50% #33.3 + # LOE rbx r10 r12 r15 eax r11d r13d r14d +..B1.80: # Preds ..B1.79 + # Execution count [4.35e-01]: Infreq + xorl %ecx, %ecx #33.3 + testl %r13d, %r13d #33.3 + je ..B1.49 # Prob 56% #33.3 + # LOE rcx rbx r10 r12 r15 eax r11d r13d r14d +..B1.81: # Preds ..B1.80 + # Execution count [4.35e-01]: Infreq + vxorpd %xmm0, %xmm0, %xmm0 #34.19 + vxorpd %xmm1, %xmm1, %xmm1 #34.27 + vcvtsi2sd %eax, %xmm0, %xmm0 #34.19 + vcvtsi2sd %r11d, %xmm1, %xmm1 #34.27 + movq %r15, %rax #34.6 + lea (,%r12,8), %rdx #35.6 + negq %rax #34.6 + movq %rdx, %rsi #35.6 + 
vdivsd %xmm1, %xmm0, %xmm0 #34.6 + movslq %r13d, %r13 #33.3 + subq %r15, %rsi #35.6 + lea (%rax,%r15,2), %rax #34.6 + # LOE rax rdx rcx rbx rsi r10 r12 r13 r15 r14d xmm0 +..B1.82: # Preds ..B1.82 ..B1.81 + # Execution count [4.90e+00]: Infreq + incq %rcx #33.3 + vmovsd %xmm0, (%r10) #34.6 + vmovsd %xmm0, (%r10,%rdx) #35.6 + vmovsd %xmm0, (%rax,%r10) #34.6 + cmpq %r13, %rcx #33.3 + jb ..B1.82 # Prob 91% #33.3 + # LOE rax rdx rcx rbx rsi r10 r12 r13 r15 r14d xmm0 +..B1.83: # Preds ..B1.82 + # Execution count [4.35e-01]: Infreq + lea (%rsi,%r15,2), %rax #35.6 + vmovsd %xmm0, (%rax,%r10) #35.6 + jmp ..B1.49 # Prob 100% #35.6 + # LOE rbx r12 r14d +..B1.86: # Preds ..B1.51 + # Execution count [4.82e-02]: Infreq + xorl %eax, %eax #66.8 + lea 184(%rsp), %rdi #66.8 + lea 160(%rsp), %rsi #66.8 +..___tag_value_MAIN__.87: + call timing_ #66.8 +..___tag_value_MAIN__.88: + # LOE ebx r12d r13d r14d r15d +..B1.87: # Preds ..B1.86 + # Execution count [0.00e+00]: Infreq + vmovsd 184(%rsp), %xmm16 #67.3 + vmovsd .L_2il0floatpacket.3(%rip), %xmm0 #44.17 + vsubsd 168(%rsp), %xmm16, %xmm1 #67.3 + vcomisd %xmm1, %xmm0 #44.17 + ja ..B1.50 # Prob 82% #44.17 + jmp ..B1.71 # Prob 100% #44.17 + # LOE ebx r12d r13d r14d r15d +..B1.89: # Preds ..B1.32 + # Execution count [4.35e-02]: Infreq + xorl %esi, %esi #29.3 + jmp ..B1.36 # Prob 100% #29.3 + # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d +..B1.90: # Preds ..B1.19 + # Execution count [5.33e-01]: Infreq + incq %r8 #21.3 + addq %r10, %rdx #21.3 + cmpq 80(%rsp), %r8 #21.3[spill] + jb ..B1.10 # Prob 82% #21.3 + jmp ..B1.30 # Prob 100% #21.3 + # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 +..B1.92: # Preds ..B1.10 + # Execution count [5.33e-01]: Infreq + movq 72(%rsp), %rdi #[spill] + xorl %eax, %eax #22.6 + movq 80(%rsp), %r11 #[spill] + jmp ..B1.25 # Prob 100% # + .align 16,0x90 + # LOE rdx rcx rsi rdi r8 r9 r10 r11 eax r15d ymm0 ymm2 zmm1 + .cfi_endproc +# mark_end; + .type MAIN__,@function + .size MAIN__,.-MAIN__ +..LNMAIN__.0: + .data + .align 32 + .align 32 +heat_$PHI.0.1: + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x40000080,0x00000000 + .long 0x00000003,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .long 0x00000000,0x00000000 + .section .rodata, "a" + .align 64 + .align 4 +__NLITPACK_0.0.1: + .long 2 + .align 4 +__STRLITPACK_3.0.1: + .long 131849 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_4.0.1: + .long 66313 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_5.0.1: + .long 132152 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_6.0.1: + .long 131337 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_7.0.1: + .long 132152 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_8.0.1: + .long 131376 + .byte 0 + .space 3, 0x00 # pad + .align 4 +__STRLITPACK_9.0.1: + .long 66616 + .byte 0 + .data +# -- End MAIN__ + .section .rodata, "a" + .space 7, 0x00 # pad + .align 64 +.L_2il0floatpacket.8: + .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000 + .type .L_2il0floatpacket.8,@object + .size .L_2il0floatpacket.8,64 + .align 32 +.L_2il0floatpacket.0: + .long 
0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,32 + .align 32 +.L_2il0floatpacket.1: + .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,32 + .align 32 +.L_2il0floatpacket.2: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,32 + .align 16 +.L_2il0floatpacket.5: + .long 0x00000004,0x00000004,0x00000004,0x00000004 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,16 + .align 16 +.L_2il0floatpacket.6: + .long 0x00000000,0x00000001,0x00000002,0x00000003 + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,16 + .align 8 +.L_2il0floatpacket.3: + .long 0x9999999a,0x3fc99999 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .align 8 +.L_2il0floatpacket.4: + .long 0x00000000,0x3fd00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,8 + .align 8 +.L_2il0floatpacket.7: + .long 0xa0b5ed8d,0x3eb0c6f7 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,8 + .align 8 +.L_2il0floatpacket.9: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.9,@object + .size .L_2il0floatpacket.9,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +__STRLITPACK_2: + .long 1950949411 + .long 1952543333 + .long 1936617321 + .word 8250 + .byte 0 + .type __STRLITPACK_2,@object + .size __STRLITPACK_2,15 + .space 1, 0x00 # pad + .align 4 +__STRLITPACK_1: + .long 1919242272 + .long 1836216166 + .long 1701015137 + .word 8250 + .byte 0 + .type __STRLITPACK_1,@object + .size __STRLITPACK_1,15 + .space 1, 0x00 # pad + .align 4 +__STRLITPACK_0: + .long 1431063840 + .word 29520 + .byte 0 + .type __STRLITPACK_0,@object + .size __STRLITPACK_0,7 + .space 1, 0x00 # pad + .align 4 +__STRLITPACK_10: + .byte 0 + .type __STRLITPACK_10,@object + .size __STRLITPACK_10,1 + .data + .section .note.GNU-stack, "" +# End diff --git a/examples/gs/gs.s.tx2.clang.s b/examples/gs/gs.s.tx2.clang.s new file mode 100644 index 0000000..efce506 --- /dev/null +++ b/examples/gs/gs.s.tx2.clang.s @@ -0,0 +1,1194 @@ + .text + .file "gs-e4c67a.ll" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 // -- Begin function MAIN_ +.LCPI0_0: + .xword 4596373779694328218 // double 0.20000000000000001 +.LCPI0_1: + .xword 4696837146684686336 // double 1.0E+6 + .text + .globl MAIN_ + .p2align 6 + .type MAIN_,@function +MAIN_: // @MAIN_ + .cfi_startproc +// %bb.0: // %L.entry + stp d9, d8, [sp, #-112]! 
// 16-byte Folded Spill + stp x28, x27, [sp, #16] // 16-byte Folded Spill + stp x26, x25, [sp, #32] // 16-byte Folded Spill + stp x24, x23, [sp, #48] // 16-byte Folded Spill + stp x22, x21, [sp, #64] // 16-byte Folded Spill + stp x20, x19, [sp, #80] // 16-byte Folded Spill + stp x29, x30, [sp, #96] // 16-byte Folded Spill + sub sp, sp, #432 // =432 + .cfi_def_cfa_offset 544 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + .cfi_offset w19, -24 + .cfi_offset w20, -32 + .cfi_offset w21, -40 + .cfi_offset w22, -48 + .cfi_offset w23, -56 + .cfi_offset w24, -64 + .cfi_offset w25, -72 + .cfi_offset w26, -80 + .cfi_offset w27, -88 + .cfi_offset w28, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + adrp x19, .C283_MAIN_ + add x19, x19, :lo12:.C283_MAIN_ + mov x0, x19 + bl fort_init + adrp x0, .C329_MAIN_ + adrp x1, .C327_MAIN_ + add x0, x0, :lo12:.C329_MAIN_ + add x1, x1, :lo12:.C327_MAIN_ + orr w2, wzr, #0x6 + str xzr, [sp, #424] + bl f90io_src_info03a + adrp x0, .C330_MAIN_ + mov x1, xzr + mov x2, x19 + mov x3, x19 + add x0, x0, :lo12:.C330_MAIN_ + bl f90io_ldr_init + adrp x20, .C334_MAIN_ + adrp x21, .C285_MAIN_ + add x20, x20, :lo12:.C334_MAIN_ + add x21, x21, :lo12:.C285_MAIN_ + mov x0, x20 + mov x1, x21 + mov x2, x19 + add x3, sp, #420 // =420 + bl f90io_ldra + mov x0, x20 + mov x1, x21 + mov x2, x19 + add x3, sp, #416 // =416 + bl f90io_ldra + bl f90io_ldr_end + ldrsw x24, [sp, #416] + ldr w22, [sp, #420] + sxtw x21, w22 + and x8, x24, #0xffffffff + str x8, [sp, #160] // 8-byte Folded Spill + add x9, x21, #1 // =1 + add x8, x24, #1 // =1 + adrp x1, .C366_MAIN_ + mul x23, x9, x8 + stp xzr, xzr, [sp] + adrp x2, .C365_MAIN_ + adrp x6, .C286_MAIN_ + adrp x7, .C284_MAIN_ + mov x3, xzr + mov x5, xzr + add x1, x1, :lo12:.C366_MAIN_ + add x2, x2, :lo12:.C365_MAIN_ + add x6, x6, :lo12:.C286_MAIN_ + add x7, x7, :lo12:.C284_MAIN_ + add x0, sp, #408 // =408 + lsl x20, x23, #1 + add x4, sp, #424 // =424 + str x9, [sp, #360] // 8-byte Folded Spill + str x20, [sp, #408] + bl f90_alloc04_chka_i8 + str x22, [sp, #200] // 8-byte Folded Spill + cmp w24, #2 // =2 + b.lt .LBB0_30 +// %bb.1: // %L.LB1_367.preheader + cmp w22, #2 // =2 + b.lt .LBB0_30 +// %bb.2: // %L.LB1_367.preheader64 + mvn w9, w22 + orr w10, wzr, #0xfffffffd + ldr x8, [sp, #424] + cmn w9, #3 // =3 + csinv w9, w10, w22, le + ldr x18, [sp, #160] // 8-byte Folded Reload + add w11, w22, w9 + add x12, x23, x21 + mvn w16, w18 + add w9, w11, #1 // =1 + add x10, x9, #1 // =1 + add x13, x12, x9 + add x9, x21, x9 + add x15, x8, x13, lsl #3 + add x13, x8, x9, lsl #3 + add x4, x8, x21, lsl #3 + add x9, x8, x12, lsl #3 + add x14, x4, #16 // =16 + add x15, x15, #24 // =24 + add x12, x13, #24 // =24 + add x13, x9, #16 // =16 + and x16, x16, #0x1 + cmp w18, #2 // =2 + b.ne .LBB0_10 +// %bb.3: + orr w9, wzr, #0x1 + cbz w16, .LBB0_30 +.LBB0_4: // %L.LB1_367.epil + cmp x10, #8 // =8 + b.lo .LBB0_7 +// %bb.5: // %vector.memcheck.epil + cmp x14, x15 + b.hs .LBB0_27 +// %bb.6: // %vector.memcheck.epil + cmp x13, x12 + b.hs .LBB0_27 +.LBB0_7: + orr w10, wzr, #0x1 + mov w11, w22 +.LBB0_8: // %L.LB1_370.preheader.epil + ldr x14, [sp, #360] // 8-byte Folded Reload + add x13, x9, x24 + add x12, x8, x10, lsl #3 + lsl x13, x13, #3 + add x13, x13, #8 // =8 + madd x9, x9, x14, x10 + madd x12, x13, x14, x12 + add x8, x8, x9, lsl #3 + add w9, w11, #1 // =1 + .p2align 6 +.LBB0_9: // %L.LB1_370.epil + // =>This Inner Loop Header: Depth=1 + str xzr, [x8], #8 + str xzr, [x12], #8 + sub w9, w9, #1 // =1 + cmp w9, #2 // =2 + b.gt .LBB0_9 + b .LBB0_30 +.LBB0_10: // 
%L.LB1_367.preheader64.new + mvn x17, x16 + cmp x14, x15 + add w1, w11, #2 // =2 + add x5, x23, x21, lsl #1 + add x17, x17, x18 + cset w18, lo + cmp x13, x12 + cset w0, lo + and w18, w18, w0 + and w0, w1, #0x7 + sub x1, x10, x0 + add x6, x9, #8 // =8 + ldr x9, [sp, #360] // 8-byte Folded Reload + movi v0.2d, #0000000000000000 + sub w3, w22, w1 + add x22, x8, x5, lsl #3 + lsl x5, x21, #4 + add x25, x8, x5 + add x7, x8, x9, lsl #3 + add x2, x1, #1 // =1 + add x4, x4, #64 // =64 + add x5, x5, #16 // =16 + add x19, x25, #40 // =40 + add x22, x22, #16 // =16 + add x25, x25, #16 // =16 + orr w9, wzr, #0x1 + .p2align 6 +.LBB0_11: // %L.LB1_367 + // =>This Loop Header: Depth=1 + // Child Loop BB0_14 Depth 2 + // Child Loop BB0_17 Depth 2 + // Child Loop BB0_21 Depth 2 + // Child Loop BB0_24 Depth 2 + cmp x10, #8 // =8 + cset w26, lo + orr w26, w26, w18 + tbz w26, #0, .LBB0_13 +// %bb.12: // in Loop: Header=BB0_11 Depth=1 + ldr x28, [sp, #200] // 8-byte Folded Reload + orr w27, wzr, #0x1 + mov w29, w28 + b .LBB0_16 + .p2align 6 +.LBB0_13: // %vector.ph + // in Loop: Header=BB0_11 Depth=1 + mov x27, x4 + mov x28, x1 + .p2align 6 +.LBB0_14: // %vector.body + // Parent Loop BB0_11 Depth=1 + // => This Inner Loop Header: Depth=2 + add x29, x27, x23, lsl #3 + stp q0, q0, [x27, #-48] + stp q0, q0, [x27, #-16] + add x27, x27, #64 // =64 + stp q0, q0, [x29, #-48] + stp q0, q0, [x29, #-16] + subs x28, x28, #8 // =8 + b.ne .LBB0_14 +// %bb.15: // %middle.block + // in Loop: Header=BB0_11 Depth=1 + mov x27, x2 + mov w29, w3 + cbz w0, .LBB0_18 +.LBB0_16: // %L.LB1_370.preheader + // in Loop: Header=BB0_11 Depth=1 + lsl x28, x27, #3 + add x27, x6, x28 + add x28, x7, x28 + add w29, w29, #1 // =1 + .p2align 6 +.LBB0_17: // %L.LB1_370 + // Parent Loop BB0_11 Depth=1 + // => This Inner Loop Header: Depth=2 + str xzr, [x28], #8 + str xzr, [x27], #8 + sub w29, w29, #1 // =1 + cmp w29, #2 // =2 + b.gt .LBB0_17 +.LBB0_18: // %L.LB1_371 + // in Loop: Header=BB0_11 Depth=1 + tbz w26, #0, .LBB0_20 +// %bb.19: // in Loop: Header=BB0_11 Depth=1 + ldr x27, [sp, #200] // 8-byte Folded Reload + orr w26, wzr, #0x1 + mov w28, w27 + b .LBB0_23 + .p2align 6 +.LBB0_20: // %vector.ph.1 + // in Loop: Header=BB0_11 Depth=1 + mov x26, x19 + mov x27, x1 + .p2align 6 +.LBB0_21: // %vector.body.1 + // Parent Loop BB0_11 Depth=1 + // => This Inner Loop Header: Depth=2 + add x28, x26, x23, lsl #3 + stp q0, q0, [x26] + stur q0, [x26, #-16] + str q0, [x26, #32] + add x26, x26, #64 // =64 + stp q0, q0, [x28, #-16] + stp q0, q0, [x28, #16] + subs x27, x27, #8 // =8 + b.ne .LBB0_21 +// %bb.22: // %middle.block.1 + // in Loop: Header=BB0_11 Depth=1 + mov x26, x2 + mov w28, w3 + cbz w0, .LBB0_25 +.LBB0_23: // %L.LB1_370.preheader.1 + // in Loop: Header=BB0_11 Depth=1 + lsl x27, x26, #3 + add x26, x22, x27 + add x27, x25, x27 + add w28, w28, #1 // =1 + .p2align 6 +.LBB0_24: // %L.LB1_370.1 + // Parent Loop BB0_11 Depth=1 + // => This Inner Loop Header: Depth=2 + str xzr, [x27], #8 + str xzr, [x26], #8 + sub w28, w28, #1 // =1 + cmp w28, #2 // =2 + b.gt .LBB0_24 +.LBB0_25: // %L.LB1_371.1 + // in Loop: Header=BB0_11 Depth=1 + add x4, x4, x5 + add x6, x6, x5 + add x7, x7, x5 + add x19, x19, x5 + add x22, x22, x5 + add x9, x9, #2 // =2 + add x25, x25, x5 + subs x17, x17, #2 // =2 + b.ne .LBB0_11 +// %bb.26: // %L.LB1_368.loopexit.unr-lcssa.loopexit + ldr x22, [sp, #200] // 8-byte Folded Reload + cbnz w16, .LBB0_4 + b .LBB0_30 +.LBB0_27: // %vector.ph.epil + ldr x16, [sp, #360] // 8-byte Folded Reload + add x12, x9, x24 + movi v0.2d, #0000000000000000 
+ lsl x12, x12, #3 + add x15, x12, #8 // =8 + mul x14, x9, x16 + add w11, w11, #2 // =2 + and w12, w11, #0x7 + madd x15, x15, x16, x8 + sub x13, x10, x12 + sub w11, w22, w13 + add x10, x13, #1 // =1 + add x14, x8, x14, lsl #3 + add x14, x14, #40 // =40 + add x15, x15, #40 // =40 + .p2align 6 +.LBB0_28: // %vector.body.epil + // =>This Inner Loop Header: Depth=1 + stp q0, q0, [x14, #-32] + stp q0, q0, [x14], #64 + stp q0, q0, [x15, #-32] + stp q0, q0, [x15], #64 + subs x13, x13, #8 // =8 + b.ne .LBB0_28 +// %bb.29: // %middle.block.epil + cbnz w12, .LBB0_8 +.LBB0_30: // %L.LB1_368 + tbnz w22, #31, .LBB0_33 +// %bb.31: // %L.LB1_373.preheader + orr w8, wzr, #0xfffffffe + sub w12, w8, w22 + ldr x10, [sp, #424] + cmn w12, #2 // =2 + csel w8, w12, w8, gt + add w13, w22, w8 + mvn x11, x21 + add w14, w13, #2 // =2 + add w9, w22, #1 // =1 + add x12, x14, #1 // =1 + cmp x12, #8 // =8 + b.hs .LBB0_34 +// %bb.32: + ldr x6, [sp, #160] // 8-byte Folded Reload + mov x8, xzr + b .LBB0_43 +.LBB0_33: + ldr x6, [sp, #160] // 8-byte Folded Reload + fmov d0, xzr + tbz w6, #31, .LBB0_47 + b .LBB0_49 +.LBB0_34: // %vector.memcheck159 + add x16, x23, x14 + add x14, x20, x14 + add x17, x10, x16, lsl #3 + sub x16, x16, x21 + add x15, x23, x11 + add x6, x17, #8 // =8 + sub x14, x14, x21 + add x18, x10, x16, lsl #3 + add x16, x20, x11 + add x15, x10, x15, lsl #3 + add x2, x10, x14, lsl #3 + add x4, x10, x12, lsl #3 + add x0, x10, x16, lsl #3 + cmp x15, x2 + cset w7, lo + cmp x0, x18 + cset w19, lo + cmp x15, x4 + cset w14, lo + add x5, x10, x23, lsl #3 + cmp x10, x18 + cset w16, lo + cmp x15, x6 + cset w15, lo + cmp x5, x18 + cset w18, lo + cmp x0, x4 + cset w17, lo + cmp x10, x2 + cset w1, lo + cmp x0, x6 + cset w0, lo + cmp x5, x2 + cset w3, lo + cmp x10, x6 + cset w2, lo + ldr x6, [sp, #160] // 8-byte Folded Reload + mov x8, xzr + cmp x5, x4 + cset w4, lo + and w5, w7, w19 + tbnz w5, #0, .LBB0_43 +// %bb.35: // %vector.memcheck159 + and w14, w14, w16 + tbnz w14, #0, .LBB0_43 +// %bb.36: // %vector.memcheck159 + and w14, w15, w18 + tbnz w14, #0, .LBB0_43 +// %bb.37: // %vector.memcheck159 + and w14, w17, w1 + tbnz w14, #0, .LBB0_43 +// %bb.38: // %vector.memcheck159 + and w14, w0, w3 + tbnz w14, #0, .LBB0_43 +// %bb.39: // %vector.memcheck159 + and w14, w2, w4 + tbnz w14, #0, .LBB0_43 +// %bb.40: // %vector.ph160 + add w8, w13, #3 // =3 + and w13, w8, #0x7 + fmov v0.2d, #1.00000000 + movi v1.2d, #0000000000000000 + sub x8, x12, x13 + lsl x14, x23, #4 + lsl x15, x21, #3 + lsl x12, x23, #3 + sub x14, x14, x15 + sub w9, w9, w8 + sub x12, x12, x15 + mov x15, x10 + mov x16, x8 + .p2align 6 +.LBB0_41: // %vector.body115 + // =>This Inner Loop Header: Depth=1 + add x17, x15, x12 + stur q0, [x17, #-8] + stur q0, [x17, #8] + stur q0, [x17, #24] + stur q0, [x17, #40] + add x17, x15, x14 + stur q0, [x17, #-8] + stur q0, [x17, #8] + stur q0, [x17, #24] + stur q0, [x17, #40] + add x17, x15, x23, lsl #3 + stp q1, q1, [x15] + stp q1, q1, [x15, #32] + add x15, x15, #64 // =64 + stp q1, q1, [x17] + stp q1, q1, [x17, #32] + subs x16, x16, #8 // =8 + b.ne .LBB0_41 +// %bb.42: // %middle.block116 + cbz w13, .LBB0_46 +.LBB0_43: // %L.LB1_373.preheader189 + add x15, x8, x23 + add x16, x8, x20 + add x14, x15, x11 + add x11, x16, x11 + mov x12, xzr + add w9, w9, #1 // =1 + add x13, x10, x8, lsl #3 + add x14, x10, x14, lsl #3 + add x11, x10, x11, lsl #3 + add x10, x10, x15, lsl #3 + orr x15, xzr, #0x3ff0000000000000 + .p2align 6 +.LBB0_44: // %L.LB1_373 + // =>This Inner Loop Header: Depth=1 + lsl x16, x12, #3 + add x12, x12, #1 // =1 + 
sub w9, w9, #1 // =1 + str x15, [x14, x16] + str x15, [x11, x16] + str xzr, [x13, x16] + str xzr, [x10, x16] + cmp w9, #1 // =1 + b.gt .LBB0_44 +// %bb.45: // %L.LB1_374.loopexit.loopexit + add w8, w8, w12 +.LBB0_46: // %L.LB1_374.loopexit + scvtf d0, w8 + tbnz w6, #31, .LBB0_49 +.LBB0_47: // %L.LB1_382.preheader + ldr s1, [sp, #420] + ldr x8, [sp, #424] + lsl x10, x21, #3 + add x9, x10, #8 // =8 + sshll v1.2d, v1.2s, #0 + add x10, x10, x23, lsl #3 + add w11, w24, #2 // =2 + scvtf d1, d1 + fdiv d0, d0, d1 + .p2align 6 +.LBB0_48: // %L.LB1_382 + // =>This Inner Loop Header: Depth=1 + str d0, [x8] + sub w11, w11, #1 // =1 + str d0, [x8, x23, lsl #3] + str d0, [x8, x21, lsl #3] + str d0, [x8, x10] + add x8, x8, x9 + cmp w11, #1 // =1 + b.gt .LBB0_48 +.LBB0_49: // %L.LB1_383 + sub w9, w6, #1 // =1 + and w25, w9, #0x7 + mvn x9, x25 + add x9, x9, x6 + str x9, [sp, #168] // 8-byte Folded Spill + lsl x9, x21, #6 + lsl x28, x21, #1 + mov w19, #10 + add x29, x9, #64 // =64 + add x9, x21, #2 // =2 + str x9, [sp, #152] // 8-byte Folded Spill + add x9, x28, #4 // =4 + str x9, [sp, #144] // 8-byte Folded Spill + add x9, x28, x21 + add x10, x9, #4 // =4 + str x10, [sp, #136] // 8-byte Folded Spill + add x10, x28, #3 // =3 + str x10, [sp, #128] // 8-byte Folded Spill + add x10, x9, #5 // =5 + lsl x9, x9, #1 + str x10, [sp, #120] // 8-byte Folded Spill + lsl x10, x21, #2 + add x11, x10, #5 // =5 + str x11, [sp, #112] // 8-byte Folded Spill + add x11, x10, #6 // =6 + add x10, x10, x21 + str x11, [sp, #104] // 8-byte Folded Spill + add x11, x10, #6 // =6 + add x10, x10, #7 // =7 + lsl x8, x21, #3 + add x24, x8, #8 // =8 + fmov d9, #0.25000000 + sub x23, x6, #2 // =2 + add w20, w22, #1 // =1 + stp x25, x23, [sp, #176] // 16-byte Folded Spill + stp x10, x11, [sp, #88] // 16-byte Folded Spill + add x10, x9, #7 // =7 + add x9, x9, #8 // =8 + stp x9, x10, [sp, #72] // 16-byte Folded Spill + sub x9, x8, x21 + add x10, x9, #8 // =8 + add x9, x9, #9 // =9 + stp x9, x10, [sp, #56] // 16-byte Folded Spill + add x9, x8, #9 // =9 + str x9, [sp, #48] // 8-byte Folded Spill + add x9, x8, #10 // =10 + add x8, x8, x21 + add x8, x8, #10 // =10 + stp x8, x9, [sp, #32] // 16-byte Folded Spill + adrp x8, .LCPI0_0 + ldr d8, [x8, :lo12:.LCPI0_0] + .p2align 6 +.LBB0_50: // %L.LB1_471 + // =>This Loop Header: Depth=1 + // Child Loop BB0_55 Depth 2 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_60 Depth 4 + // Child Loop BB0_62 Depth 4 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_68 Depth 4 + // Child Loop BB0_70 Depth 4 + // Child Loop BB0_72 Depth 4 + // Child Loop BB0_74 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_79 Depth 4 + lsl w8, w19, #1 + add x0, sp, #400 // =400 + add x1, sp, #392 // =392 + str w8, [sp, #196] // 4-byte Folded Spill + bl timing_ + cbz w19, .LBB0_53 +// %bb.51: // %L.LB1_392.preheader + // in Loop: Header=BB0_50 Depth=1 + ldr x8, [sp, #160] // 8-byte Folded Reload + cmp w8, #2 // =2 + b.ge .LBB0_54 +// %bb.52: // %L.LB1_392.us.preheader + // in Loop: Header=BB0_50 Depth=1 + ldr w9, [sp, #196] // 4-byte Folded Reload + mvn w8, w9 + cmn w8, #2 // =2 + orr w8, wzr, #0xfffffffe + csinv w8, w8, w9, le + add w8, w8, w9 + add w26, w8, #3 // =3 + b .LBB0_82 + .p2align 6 +.LBB0_53: // in Loop: Header=BB0_50 Depth=1 + orr w26, wzr, #0x1 + b .LBB0_82 + .p2align 6 +.LBB0_54: // %L.LB1_392.preheader90 + // in Loop: Header=BB0_50 Depth=1 + ldr x10, [sp, #424] + ldr x8, [sp, #360] // 8-byte Folded Reload + add x9, x10, x8, lsl #3 + orr w26, wzr, #0x1 + ldr x8, [sp, 
#152] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #328] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #320] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #312] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #304] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #296] // 8-byte Folded Spill + ldr x8, [sp, #112] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #288] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #280] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #64] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #240] // 8-byte Folded Spill + ldr x8, [sp, #56] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #48] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #40] // 8-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #216] // 8-byte Folded Spill + add x8, x10, #8 // =8 + str x8, [sp, #352] // 8-byte Folded Spill + add x8, x10, #16 // =16 + stp x10, x8, [sp, #336] // 16-byte Folded Spill + ldr x8, [sp, #32] // 8-byte Folded Reload + ldr w30, [sp, #196] // 4-byte Folded Reload + add x8, x10, x8, lsl #3 + str x8, [sp, #208] // 8-byte Folded Spill + .p2align 6 +.LBB0_55: // %L.LB1_392 + // Parent Loop BB0_50 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_60 Depth 4 + // Child Loop BB0_62 Depth 4 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_68 Depth 4 + // Child Loop BB0_70 Depth 4 + // Child Loop BB0_72 Depth 4 + // Child Loop BB0_74 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_79 Depth 4 + cmp w22, #2 // =2 + b.lt .LBB0_81 +// %bb.56: // %L.LB1_395.preheader + // in Loop: Header=BB0_55 Depth=2 + cmp x23, #7 // =7 + b.hs .LBB0_58 +// %bb.57: // in Loop: Header=BB0_55 Depth=2 + mov x11, xzr + orr w12, wzr, #0x1 + cbnz w25, .LBB0_77 + b .LBB0_81 + .p2align 6 +.LBB0_58: // %L.LB1_395.preheader199 + // in Loop: Header=BB0_55 Depth=2 + ldp x10, x5, [sp, #208] // 16-byte Folded Reload + ldp x4, x3, [sp, #224] // 16-byte Folded Reload + ldp x2, x1, [sp, #240] // 16-byte Folded Reload + ldp x0, x18, [sp, #256] // 16-byte Folded Reload + ldp x17, x16, [sp, #272] // 16-byte Folded Reload + ldp x15, x14, [sp, #288] // 16-byte Folded Reload + ldp x13, x25, [sp, #304] // 16-byte Folded Reload + ldp x19, x27, [sp, #320] // 16-byte Folded Reload + ldr x8, [sp, #336] // 8-byte Folded Reload + ldr x6, [sp, #168] // 8-byte Folded Reload + mov x11, xzr + orr w12, wzr, #0x1 + str w26, [sp, #372] // 4-byte Folded Spill + .p2align 6 +.LBB0_59: // %L.LB1_395 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_60 Depth 4 + // 
Child Loop BB0_62 Depth 4 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_68 Depth 4 + // Child Loop BB0_70 Depth 4 + // Child Loop BB0_72 Depth 4 + // Child Loop BB0_74 Depth 4 + mul x7, x24, x11 + mov w22, w20 + ldr d0, [x9, x7] + mov x7, x8 + .p2align 6 +.LBB0_60: // %L.LB1_398 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x7, x28, lsl #3 + add x26, x7, x21, lsl #3 + ldr d1, [x23, #24] + ldr d2, [x26, #24] + ldr d3, [x7, #8]! + fadd d0, d1, d0 + fadd d1, d2, d3 + sub w22, w22, #1 // =1 + fadd d0, d0, d1 + fmul d0, d0, d9 + str d0, [x26, #16] + cmp w22, #2 // =2 + b.gt .LBB0_60 +// %bb.61: // %L.LB1_399 + // in Loop: Header=BB0_59 Depth=3 + orr x7, x11, #0x1 + mov x22, x19 + mul x7, x24, x7 + mov x23, x27 + mov w26, w20 + ldr d0, [x9, x7] + mov x7, x25 + .p2align 6 + // OSACA-BEGIN +.LBB0_62: // %L.LB1_398.1 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr d1, [x7], #8 + fadd d0, d1, d0 + ldr d2, [x22] + ldr d3, [x23], #8 + fadd d2, d2, d3 + fadd d0, d0, d2 + sub w26, w26, #1 // =1 + fmul d0, d0, d9 + stur d0, [x22, #-8] + add x22, x22, #8 // =8 + cmp w26, #2 // =2 + b.gt .LBB0_62 + // OSACA-END +// %bb.63: // %L.LB1_399.1 + // in Loop: Header=BB0_59 Depth=3 + orr x7, x11, #0x2 + mov x22, x14 + mul x7, x24, x7 + mov x23, x13 + mov w26, w20 + ldr d0, [x9, x7] + mov x7, x15 + .p2align 6 +.LBB0_64: // %L.LB1_398.2 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr d1, [x7], #8 + fadd d0, d1, d0 + ldr d2, [x22] + ldr d3, [x23], #8 + fadd d2, d2, d3 + fadd d0, d0, d2 + sub w26, w26, #1 // =1 + fmul d0, d0, d9 + stur d0, [x22, #-8] + add x22, x22, #8 // =8 + cmp w26, #2 // =2 + b.gt .LBB0_64 +// %bb.65: // %L.LB1_399.2 + // in Loop: Header=BB0_59 Depth=3 + orr x22, x11, #0x3 + mov x7, xzr + mul x22, x24, x22 + ldr d0, [x9, x22] + mov w22, w20 + .p2align 6 +.LBB0_66: // %L.LB1_398.3 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x16, x7 + sub w22, w22, #1 // =1 + ldr d1, [x17, x7] + ldr d2, [x25, x7] + ldr d3, [x23] + fadd d2, d3, d2 + fadd d0, d1, d0 + add x7, x7, #8 // =8 + fadd d0, d0, d2 + fmul d0, d0, d9 + stur d0, [x23, #-8] + cmp w22, #2 // =2 + b.gt .LBB0_66 +// %bb.67: // %L.LB1_399.3 + // in Loop: Header=BB0_59 Depth=3 + orr x22, x11, #0x4 + mov x7, xzr + mul x22, x24, x22 + ldr d0, [x9, x22] + mov w22, w20 + .p2align 6 +.LBB0_68: // %L.LB1_398.4 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x18, x7 + sub w22, w22, #1 // =1 + ldr d1, [x0, x7] + ldr d2, [x15, x7] + ldr d3, [x23] + fadd d2, d3, d2 + fadd d0, d1, d0 + add x7, x7, #8 // =8 + fadd d0, d0, d2 + fmul d0, d0, d9 + stur d0, [x23, #-8] + cmp w22, #2 // =2 + b.gt .LBB0_68 +// %bb.69: // %L.LB1_399.4 + // in Loop: Header=BB0_59 Depth=3 + mov w22, #5 + orr x22, x11, x22 + mul x22, x24, x22 + mov x7, xzr + ldr d0, [x9, x22] + mov w22, w20 + .p2align 6 +.LBB0_70: // %L.LB1_398.5 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x1, x7 + sub w22, w22, #1 // =1 + ldr d1, [x2, x7] + ldr d2, [x17, x7] + ldr d3, [x23] + fadd d2, d3, 
d2 + fadd d0, d1, d0 + add x7, x7, #8 // =8 + fadd d0, d0, d2 + fmul d0, d0, d9 + stur d0, [x23, #-8] + cmp w22, #2 // =2 + b.gt .LBB0_70 +// %bb.71: // %L.LB1_399.5 + // in Loop: Header=BB0_59 Depth=3 + orr x22, x11, #0x6 + mov x7, xzr + mul x22, x24, x22 + ldr d0, [x9, x22] + mov w22, w20 + .p2align 6 +.LBB0_72: // %L.LB1_398.6 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x3, x7 + sub w22, w22, #1 // =1 + ldr d1, [x4, x7] + ldr d2, [x0, x7] + ldr d3, [x23] + fadd d2, d3, d2 + fadd d0, d1, d0 + add x7, x7, #8 // =8 + fadd d0, d0, d2 + fmul d0, d0, d9 + stur d0, [x23, #-8] + cmp w22, #2 // =2 + b.gt .LBB0_72 +// %bb.73: // %L.LB1_399.6 + // in Loop: Header=BB0_59 Depth=3 + orr x22, x11, #0x7 + mov x7, xzr + mul x22, x24, x22 + add x12, x12, #8 // =8 + ldr d0, [x9, x22] + mov w22, w20 + .p2align 6 +.LBB0_74: // %L.LB1_398.7 + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_59 Depth=3 + // => This Inner Loop Header: Depth=4 + add x23, x5, x7 + sub w22, w22, #1 // =1 + ldr d1, [x10, x7] + ldr d2, [x2, x7] + ldr d3, [x23] + fadd d2, d3, d2 + fadd d0, d1, d0 + add x7, x7, #8 // =8 + fadd d0, d0, d2 + fmul d0, d0, d9 + stur d0, [x23, #-8] + cmp w22, #2 // =2 + b.gt .LBB0_74 +// %bb.75: // %L.LB1_399.7 + // in Loop: Header=BB0_59 Depth=3 + add x8, x8, x29 + add x27, x27, x29 + add x19, x19, x29 + add x25, x25, x29 + add x13, x13, x29 + add x11, x11, #8 // =8 + add x14, x14, x29 + add x15, x15, x29 + add x16, x16, x29 + add x17, x17, x29 + add x18, x18, x29 + add x0, x0, x29 + add x1, x1, x29 + add x2, x2, x29 + add x3, x3, x29 + add x4, x4, x29 + add x5, x5, x29 + add x10, x10, x29 + subs x6, x6, #8 // =8 + b.ne .LBB0_59 +// %bb.76: // %L.LB1_396.loopexit.unr-lcssa.loopexit + // in Loop: Header=BB0_55 Depth=2 + ldp x25, x23, [sp, #176] // 16-byte Folded Reload + ldr x22, [sp, #200] // 8-byte Folded Reload + ldr w26, [sp, #372] // 4-byte Folded Reload + cbz w25, .LBB0_81 +.LBB0_77: // %L.LB1_395.epil.preheader + // in Loop: Header=BB0_55 Depth=2 + ldr x13, [sp, #360] // 8-byte Folded Reload + mul x8, x13, x12 + ldr x14, [sp, #344] // 8-byte Folded Reload + sub x10, x12, #1 // =1 + add x12, x12, #1 // =1 + mul x10, x13, x10 + mul x12, x13, x12 + add x8, x14, x8, lsl #3 + mov x13, x25 + ldr x14, [sp, #352] // 8-byte Folded Reload + add x10, x14, x10, lsl #3 + add x12, x14, x12, lsl #3 + .p2align 6 +.LBB0_78: // %L.LB1_395.epil + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_79 Depth 4 + mul x14, x24, x11 + mov x15, x8 + mov x16, x10 + ldr d0, [x9, x14] + mov x14, x12 + mov w17, w20 + .p2align 6 +.LBB0_79: // %L.LB1_398.epil + // Parent Loop BB0_50 Depth=1 + // Parent Loop BB0_55 Depth=2 + // Parent Loop BB0_78 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr d1, [x14], #8 + fadd d0, d1, d0 + ldr d2, [x15] + ldr d3, [x16], #8 + fadd d2, d2, d3 + fadd d0, d0, d2 + sub w17, w17, #1 // =1 + fmul d0, d0, d9 + stur d0, [x15, #-8] + add x15, x15, #8 // =8 + cmp w17, #2 // =2 + b.gt .LBB0_79 +// %bb.80: // %L.LB1_399.epil + // in Loop: Header=BB0_78 Depth=3 + add x10, x10, x24 + add x8, x8, x24 + add x12, x12, x24 + add x11, x11, #1 // =1 + subs x13, x13, #1 // =1 + b.ne .LBB0_78 +.LBB0_81: // %L.LB1_396 + // in Loop: Header=BB0_55 Depth=2 + add w26, w26, #1 // =1 + subs w30, w30, #1 // =1 + b.gt .LBB0_55 +.LBB0_82: // %L.LB1_393 + // in Loop: Header=BB0_50 Depth=1 + add x0, sp, #384 // =384 + add 
x1, sp, #376 // =376 + bl timing_ + ldr d0, [sp, #384] + ldr d1, [sp, #400] + fsub d0, d0, d1 + ldr w19, [sp, #196] // 4-byte Folded Reload + mov w8, #51712 + movk w8, #15258, lsl #16 + fcmp d0, d8 + ccmp w19, w8, #2, lt + b.lo .LBB0_50 +// %bb.83: // %L.LB1_391 + adrp x0, .C345_MAIN_ + adrp x1, .C327_MAIN_ + cmp w26, w19 + add x0, x0, :lo12:.C345_MAIN_ + add x1, x1, :lo12:.C327_MAIN_ + orr w2, wzr, #0x6 + csel w19, w19, w26, gt + bl f90io_src_info03a + adrp x20, .C283_MAIN_ + add x20, x20, :lo12:.C283_MAIN_ + adrp x0, .C326_MAIN_ + mov x1, xzr + mov x2, x20 + mov x3, x20 + add x0, x0, :lo12:.C326_MAIN_ + bl f90io_print_init + adrp x0, .C348_MAIN_ + add x0, x0, :lo12:.C348_MAIN_ + orr w1, wzr, #0xe + orr w2, wzr, #0xe + bl f90io_sc_ch_ldw + mov w0, w19 + mov w1, #25 + bl f90io_sc_i_ldw + adrp x0, .C349_MAIN_ + add x0, x0, :lo12:.C349_MAIN_ + orr w1, wzr, #0xe + orr w2, wzr, #0xe + bl f90io_sc_ch_ldw + ldr w8, [sp, #416] + sub w8, w8, #1 // =1 + orr w0, wzr, #0x1c + scvtf d0, w19 + scvtf d1, w8 + ldr w8, [sp, #420] + sub w8, w8, #1 // =1 + scvtf d2, w8 + fmul d0, d1, d0 + ldr d1, [sp, #384] + adrp x8, .LCPI0_1 + fmul d0, d0, d2 + ldr d2, [sp, #400] + fsub d1, d1, d2 + ldr d2, [x8, :lo12:.LCPI0_1] + fmul d1, d1, d2 + fdiv d0, d0, d1 + bl f90io_sc_d_ldw + adrp x0, .C351_MAIN_ + add x0, x0, :lo12:.C351_MAIN_ + orr w1, wzr, #0xe + orr w2, wzr, #0x6 + bl f90io_sc_ch_ldw + bl f90io_ldw_end + mov x0, x20 + mov x1, xzr + mov x2, xzr + bl f90_stop08a + add sp, sp, #432 // =432 + ldp x29, x30, [sp, #96] // 16-byte Folded Reload + ldp x20, x19, [sp, #80] // 16-byte Folded Reload + ldp x22, x21, [sp, #64] // 16-byte Folded Reload + ldp x24, x23, [sp, #48] // 16-byte Folded Reload + ldp x26, x25, [sp, #32] // 16-byte Folded Reload + ldp x28, x27, [sp, #16] // 16-byte Folded Reload + ldp d9, d8, [sp], #112 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size MAIN_, .Lfunc_end0-MAIN_ + .cfi_endproc + // -- End function + .type .C351_MAIN_,@object // @.C351_MAIN_ + .section .rodata,"a",@progbits + .p2align 2 +.C351_MAIN_: + .asciz " MLUPs" + .size .C351_MAIN_, 7 + + .type .C349_MAIN_,@object // @.C349_MAIN_ + .p2align 2 +.C349_MAIN_: + .asciz " Performance: " + .size .C349_MAIN_, 15 + + .type .C348_MAIN_,@object // @.C348_MAIN_ + .p2align 2 +.C348_MAIN_: + .asciz "# Iterations: " + .size .C348_MAIN_, 15 + + .type .C326_MAIN_,@object // @.C326_MAIN_ + .p2align 2 +.C326_MAIN_: + .word 6 // 0x6 + .size .C326_MAIN_, 4 + + .type .C345_MAIN_,@object // @.C345_MAIN_ + .p2align 2 +.C345_MAIN_: + .word 72 // 0x48 + .size .C345_MAIN_, 4 + + .type .C366_MAIN_,@object // @.C366_MAIN_ + .p2align 3 +.C366_MAIN_: + .xword 28 // 0x1c + .size .C366_MAIN_, 8 + + .type .C365_MAIN_,@object // @.C365_MAIN_ + .p2align 3 +.C365_MAIN_: + .xword 8 // 0x8 + .size .C365_MAIN_, 8 + + .type .C286_MAIN_,@object // @.C286_MAIN_ + .p2align 3 +.C286_MAIN_: + .xword 1 // 0x1 + .size .C286_MAIN_, 8 + + .type .C285_MAIN_,@object // @.C285_MAIN_ + .p2align 2 +.C285_MAIN_: + .word 1 // 0x1 + .size .C285_MAIN_, 4 + + .type .C334_MAIN_,@object // @.C334_MAIN_ + .p2align 2 +.C334_MAIN_: + .word 25 // 0x19 + .size .C334_MAIN_, 4 + + .type .C330_MAIN_,@object // @.C330_MAIN_ + .p2align 2 +.C330_MAIN_: + .word 5 // 0x5 + .size .C330_MAIN_, 4 + + .type .C327_MAIN_,@object // @.C327_MAIN_ + .p2align 2 +.C327_MAIN_: + .asciz "gs.f90" + .size .C327_MAIN_, 7 + + .type .C329_MAIN_,@object // @.C329_MAIN_ + .p2align 2 +.C329_MAIN_: + .word 12 // 0xc + .size .C329_MAIN_, 4 + + .type .C284_MAIN_,@object // @.C284_MAIN_ + .p2align 3 +.C284_MAIN_: + .xword 0 
// 0x0 + .size .C284_MAIN_, 8 + + .type .C283_MAIN_,@object // @.C283_MAIN_ + .p2align 2 +.C283_MAIN_: + .word 0 // 0x0 + .size .C283_MAIN_, 4 + + + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym .C351_MAIN_ + .addrsig_sym .C349_MAIN_ + .addrsig_sym .C348_MAIN_ + .addrsig_sym .C326_MAIN_ + .addrsig_sym .C345_MAIN_ + .addrsig_sym .C366_MAIN_ + .addrsig_sym .C365_MAIN_ + .addrsig_sym .C286_MAIN_ + .addrsig_sym .C285_MAIN_ + .addrsig_sym .C334_MAIN_ + .addrsig_sym .C330_MAIN_ + .addrsig_sym .C327_MAIN_ + .addrsig_sym .C329_MAIN_ + .addrsig_sym .C284_MAIN_ + .addrsig_sym .C283_MAIN_ diff --git a/examples/gs/gs.s.tx2.gcc.s b/examples/gs/gs.s.tx2.gcc.s new file mode 100644 index 0000000..d8ddc17 --- /dev/null +++ b/examples/gs/gs.s.tx2.gcc.s @@ -0,0 +1,737 @@ + .arch armv8.1-a+crypto+crc + .file "gs.f90" + .text + .align 2 + .p2align 4,,15 + .type MAIN__, %function +MAIN__: +.LFB0: + .cfi_startproc + sub sp, sp, #720 + .cfi_def_cfa_offset 720 + mov x0, 128 + mov w1, 12 + stp x29, x30, [sp] + .cfi_offset 29, -720 + .cfi_offset 30, -712 + mov x29, sp + movk x0, 0x5, lsl 32 + stp x19, x20, [sp, 16] + .cfi_offset 19, -704 + .cfi_offset 20, -696 + adrp x19, .LC0 + add x19, x19, :lo12:.LC0 + stp x21, x22, [sp, 32] + stp x0, x19, [sp, 192] + add x0, sp, 192 + stp x23, x24, [sp, 48] + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + str w1, [sp, 208] + .cfi_offset 21, -688 + .cfi_offset 22, -680 + .cfi_offset 23, -672 + .cfi_offset 24, -664 + .cfi_offset 25, -656 + .cfi_offset 26, -648 + .cfi_offset 27, -640 + .cfi_offset 28, -632 + bl _gfortran_st_read + mov w2, 4 + add x1, sp, 144 + add x0, sp, 192 + bl _gfortran_transfer_integer + mov w2, 4 + add x1, sp, 148 + add x0, sp, 192 + bl _gfortran_transfer_integer + add x0, sp, 192 + bl _gfortran_st_read_done + ldp w24, w23, [sp, 144] + mov x3, -1 + mov x5, 4611686018427387904 + mov x2, 2305843009213693951 + sxtw x25, w24 + sxtw x20, w23 + cmp x25, 0 + csel x21, x25, x3, ge + cmp x20, 0 + csel x4, x20, x3, ge + add x21, x21, 1 + add x6, x4, 1 + mul x26, x6, x21 + cmp x26, x5 + lsl x27, x26, 1 + lsl x7, x26, 4 + cset w8, eq + cmp x27, x2 + cinc w9, w8, gt + cmp x25, 0 + ccmp x20, 0, 1, ge + csel x10, x7, xzr, ge + cbnz w9, .L159 + cmp x10, 0 + mov x28, 1 + csel x0, x10, x28, ne + bl malloc + stp d8, d9, [sp, 96] + .cfi_offset 73, -616 + .cfi_offset 72, -624 + cbz x0, .L160 + cmp w23, 1 + ble .L5 + cmp w24, 1 + ble .L6 + sub w12, w24, #2 + sub x4, x27, x26 + lsl x22, x21, 3 + mov w8, w28 + add x13, x21, x12 + mvn x14, x12 + add x10, x4, x21 + mov x6, x12 + add x15, x0, x13, lsl 3 + lsl x17, x14, 3 + mov x9, x21 + add x5, x15, 16 +.L10: + add x1, x17, x5 + sub x18, x10, x9 + sub x16, x5, x1 + sub x30, x16, #8 + lsr x3, x30, 3 + add x2, x3, 1 + ands x7, x2, 7 + beq .L7 + cmp x7, 1 + beq .L104 + cmp x7, 2 + beq .L105 + cmp x7, 3 + beq .L106 + cmp x7, 4 + beq .L107 + cmp x7, 5 + beq .L108 + cmp x7, 6 + beq .L109 + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L109: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L108: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L107: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L106: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L105: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 +.L104: + str xzr, [x1] + str xzr, [x1, x18, lsl 3] + add x1, x1, 8 + cmp x1, x5 + beq .L155 +.L7: + str xzr, [x1] + add x28, x1, 8 + add x16, x1, 16 + add x15, x1, 24 + str xzr, [x1, x18, lsl 3] + add x14, x1, 32 + add x13, x1, 40 + add x12, 
x1, 48 + str xzr, [x1, 8] + add x11, x1, 56 + add x1, x1, 64 + str xzr, [x28, x18, lsl 3] + str xzr, [x1, -48] + str xzr, [x16, x18, lsl 3] + str xzr, [x1, -40] + str xzr, [x15, x18, lsl 3] + str xzr, [x1, -32] + str xzr, [x14, x18, lsl 3] + str xzr, [x1, -24] + str xzr, [x13, x18, lsl 3] + str xzr, [x1, -16] + str xzr, [x12, x18, lsl 3] + str xzr, [x1, -8] + str xzr, [x11, x18, lsl 3] + cmp x1, x5 + bne .L7 +.L155: + add w8, w8, 1 + add x10, x10, x21 + add x9, x9, x21 + add x5, x5, x22 + cmp w23, w8 + bne .L10 +.L9: + mul x20, x21, x20 + fmov d0, 1.0e+0 + sub x17, x26, x27 + and w18, w24, 7 + mov x2, 1 + add x30, x4, x20 + neg x3, x20, lsl 3 + add x7, x0, x30, lsl 3 + str d0, [x7, x17, lsl 3] + add x1, x7, 8 + str d0, [x7] + str xzr, [x0] + str xzr, [x7, x3] + cmp w24, 1 + blt .L151 + cbz w18, .L13 + cmp w18, 1 + beq .L119 + cmp w18, 2 + beq .L120 + cmp w18, 3 + beq .L121 + cmp w18, 4 + beq .L122 + cmp w18, 5 + beq .L123 + cmp w18, 6 + beq .L124 + str d0, [x1, x17, lsl 3] + mov x2, 2 + str d0, [x1] + str xzr, [x0, 8] + str xzr, [x1, x3] + add x1, x1, 8 +.L124: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 +.L123: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 +.L122: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 +.L121: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 +.L120: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 +.L119: + str d0, [x1, x17, lsl 3] + str d0, [x1] + str xzr, [x0, x2, lsl 3] + add x2, x2, 1 + str xzr, [x1, x3] + add x1, x1, 8 + cmp w24, w2 + blt .L151 +.L13: + str d0, [x1, x17, lsl 3] + add x28, x1, 8 + add x15, x2, 1 + add x16, x1, 16 + str d0, [x1] + add x13, x2, 2 + add x14, x1, 24 + add x12, x2, 3 + str xzr, [x0, x2, lsl 3] + add x9, x1, 32 + add x4, x2, 4 + add x8, x1, 40 + str xzr, [x1, x3] + add x11, x2, 5 + add x5, x1, 48 + add x10, x2, 6 + str d0, [x28, x17, lsl 3] + add x20, x1, 56 + add x18, x2, 7 + add x2, x2, 8 + str d0, [x1, 8] + add x1, x1, 64 + str xzr, [x0, x15, lsl 3] + str xzr, [x28, x3] + str d0, [x16, x17, lsl 3] + str d0, [x1, -48] + str xzr, [x0, x13, lsl 3] + str xzr, [x16, x3] + str d0, [x14, x17, lsl 3] + str d0, [x1, -40] + str xzr, [x0, x12, lsl 3] + str xzr, [x14, x3] + str d0, [x9, x17, lsl 3] + str d0, [x1, -32] + str xzr, [x0, x4, lsl 3] + str xzr, [x9, x3] + str d0, [x8, x17, lsl 3] + str d0, [x1, -24] + str xzr, [x0, x11, lsl 3] + str xzr, [x8, x3] + str d0, [x5, x17, lsl 3] + str d0, [x1, -16] + str xzr, [x0, x10, lsl 3] + str xzr, [x5, x3] + str d0, [x20, x17, lsl 3] + str d0, [x1, -8] + str xzr, [x0, x18, lsl 3] + str xzr, [x20, x3] + cmp w24, w2 + bge .L13 +.L151: + cmp w24, 0 + csel w17, w24, wzr, ge + add w11, w17, 1 +.L8: + tbnz w23, #31, .L11 +.L12: + scvtf d2, w11 + scvtf d1, w24 + sub x30, x27, x26 + sub x25, x25, x26 + add x26, x25, x26 + add x27, x25, x27 + mov w3, 1 + and w7, w23, 7 + add x2, x0, x22 + fdiv d3, d2, d1 + str d3, [x0] + str d3, [x0, x30, lsl 3] + str d3, [x0, x26, lsl 3] + str d3, [x0, x27, lsl 3] + cmp w23, w3 + blt .L11 + cbz w7, .L15 + cmp w7, 1 + beq .L113 + cmp w7, 2 + beq .L114 + cmp w7, 3 + beq .L115 + cmp w7, 4 + beq .L116 + cmp w7, 5 + beq .L117 + cmp w7, 6 + beq .L118 + str d3, [x2] + mov w3, 2 + str d3, [x2, x30, lsl 3] + str d3, 
[x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L118: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L117: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L116: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L115: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L114: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 +.L113: + str d3, [x2] + add w3, w3, 1 + str d3, [x2, x30, lsl 3] + str d3, [x2, x26, lsl 3] + str d3, [x2, x27, lsl 3] + add x2, x2, x22 + cmp w23, w3 + blt .L11 +.L15: + str d3, [x2] + add x1, x2, x22 + add w3, w3, 8 + str d3, [x2, x30, lsl 3] + add x28, x1, x22 + str d3, [x2, x26, lsl 3] + add x15, x28, x22 + str d3, [x2, x27, lsl 3] + add x14, x15, x22 + str d3, [x1] + add x16, x14, x22 + str d3, [x1, x30, lsl 3] + add x13, x16, x22 + str d3, [x1, x26, lsl 3] + add x12, x13, x22 + str d3, [x1, x27, lsl 3] + add x2, x12, x22 + str d3, [x28] + str d3, [x28, x30, lsl 3] + str d3, [x28, x26, lsl 3] + str d3, [x28, x27, lsl 3] + str d3, [x15] + str d3, [x15, x30, lsl 3] + str d3, [x15, x26, lsl 3] + str d3, [x15, x27, lsl 3] + str d3, [x14] + str d3, [x14, x30, lsl 3] + str d3, [x14, x26, lsl 3] + str d3, [x14, x27, lsl 3] + str d3, [x16] + str d3, [x16, x30, lsl 3] + str d3, [x16, x26, lsl 3] + str d3, [x16, x27, lsl 3] + str d3, [x13] + str d3, [x13, x30, lsl 3] + str d3, [x13, x26, lsl 3] + str d3, [x13, x27, lsl 3] + str d3, [x12] + str d3, [x12, x30, lsl 3] + str d3, [x12, x26, lsl 3] + str d3, [x12, x27, lsl 3] + cmp w23, w3 + bge .L15 +.L11: + add x6, x21, x6, uxtw + adrp x4, .LC6 + add x9, x22, 8 + fmov d9, 2.5e-1 + ldr d8, [x4, #:lo12:.LC6] + add x27, x0, x9 + mov w20, 51711 + add x0, x0, x6, lsl 3 + lsl x28, x21, 1 + mov w26, 10 + movk w20, 0x3b9a, lsl 16 + add x25, x0, 16 +.L14: + add x0, sp, 176 + add x1, sp, 160 + lsl w26, w26, 1 + bl timing_ + mov w0, 0 + .p2align 4 +.L18: + cmp w23, 1 + ble .L21 + cmp w24, 1 + ble .L21 + mov x11, 0 + mov w10, 1 + mov x7, x25 + mov x9, x28 + mov x8, x21 + mov x6, x27 + .p2align 4 +.L22: + sub x5, x7, x6 + add w10, w10, 1 + mov x15, x6 + sub x18, x11, x8 + sub x17, x5, #8 + sub x30, x9, x8 + ldr d30, [x6, -8] + lsr x3, x17, 3 + add x2, x3, 1 + ands x1, x2, 3 + beq .L20 + cmp x1, 1 + beq .L111 + cmp x1, 2 + beq .L112 + ldr d4, [x6, x18, lsl 3] + ldr d6, [x6, 8] + ldr d5, [x6, x30, lsl 3] + fadd d7, d4, d6 + fadd d16, d7, d30 + fadd d17, d16, d5 + fmul d30, d17, d9 + str d30, [x15], 8 +.L112: + ldr d18, [x15, x18, lsl 3] + ldr d20, [x15, 8] + ldr d19, [x15, x30, lsl 3] + fadd d21, d18, d20 + fadd d22, d21, d30 + fadd d23, d22, d19 + fmul d30, d23, d9 + str d30, [x15], 8 +.L111: + ldr d24, [x15, x18, lsl 3] + ldr d26, [x15, 8] + ldr d25, [x15, x30, lsl 3] + fadd d27, d24, d26 + fadd d28, d27, d30 + fadd d29, d28, d25 + fmul d30, d29, d9 + str d30, [x15], 8 + cmp x7, x15 + beq .L154 + // OSACA-BEGIN +.L20: + ldr d31, [x15, x18, lsl 3] + ldr d0, [x15, 8] + mov x14, x15 + add x16, x15, 24 + ldr d2, [x15, x30, lsl 3] + add x15, x15, 32 + fadd d1, d31, d0 + fadd d3, d1, d30 + fadd d4, d3, d2 + fmul d5, d4, d9 + str d5, [x14], 8 + ldr d6, [x14, x18, lsl 3] + ldr d16, [x14, 8] + add x13, x14, 8 + ldr d7, [x14, x30, 
lsl 3] + fadd d17, d6, d16 + fadd d18, d17, d5 + fadd d19, d18, d7 + fmul d20, d19, d9 + str d20, [x15, -24] + ldr d21, [x13, x18, lsl 3] + ldr d23, [x14, 16] + ldr d22, [x13, x30, lsl 3] + fadd d24, d21, d23 + fadd d25, d24, d20 + fadd d26, d25, d22 + fmul d27, d26, d9 + str d27, [x14, 8] + ldr d30, [x15] + ldr d28, [x16, x18, lsl 3] + ldr d29, [x16, x30, lsl 3] + fadd d31, d28, d30 + fadd d2, d31, d27 + fadd d0, d2, d29 + fmul d30, d0, d9 + str d30, [x15, -8] + cmp x7, x15 + bne .L20 + // OSACA-END +.L154: + add x6, x6, x22 + add x11, x11, x21 + add x8, x8, x21 + add x9, x9, x21 + add x7, x7, x22 + cmp w23, w10 + bne .L22 +.L21: + add w4, w0, 1 + cmp w26, w4 + beq .L17 + mov w0, w4 + b .L18 +.L17: + add w12, w0, 2 + add x1, sp, 152 + add x0, sp, 168 + str w12, [sp, 124] + str w12, [sp, 140] + bl timing_ + ldp d3, d1, [sp, 168] + ldr w5, [sp, 124] + fsub d4, d3, d1 + fcmpe d4, d8 + ccmp w26, w20, 0, lt + ble .L14 + cmp w5, w26 + ble .L23 + str w26, [sp, 140] +.L23: + mov x21, 128 + add x0, sp, 192 + mov w22, 72 + movk x21, 0x6, lsl 32 + str w22, [sp, 208] + sub w24, w24, #1 + sub w23, w23, #1 + stp x21, x19, [sp, 192] + bl _gfortran_st_write + adrp x19, .LANCHOR0 + adrp x27, .LC7 + add x28, x19, :lo12:.LANCHOR0 + mov x2, 14 + add x0, sp, 192 + mov x1, x28 + bl _gfortran_transfer_character_write + mov w2, 4 + add x1, sp, 140 + add x0, sp, 192 + bl _gfortran_transfer_integer_write + add x1, x28, 16 + mov x2, 14 + add x0, sp, 192 + bl _gfortran_transfer_character_write + ldr w25, [sp, 140] + scvtf d9, w24 + scvtf d8, w23 + ldr d5, [x27, #:lo12:.LC7] + ldp d18, d19, [sp, 168] + mov w2, 8 + add x1, sp, 184 + add x0, sp, 192 + scvtf d7, w25 + fsub d20, d18, d19 + fmul d6, d9, d8 + fmul d16, d7, d5 + fmul d17, d6, d16 + fdiv d21, d17, d20 + str d21, [sp, 184] + bl _gfortran_transfer_real_write + add x1, x28, 32 + mov x2, 6 + add x0, sp, 192 + bl _gfortran_transfer_character_write + add x0, sp, 192 + bl _gfortran_st_write_done + mov w2, 0 + mov x1, 0 + mov x0, 0 + bl _gfortran_stop_string +.L5: + tbnz w24, #31, .L25 +.L157: + sub x4, x27, x26 + lsl x22, x21, 3 + sub w6, w24, #2 + b .L9 +.L6: + tbz w24, #31, .L157 + mov w11, 0 + lsl x22, x21, 3 + sub w6, w24, #2 + b .L12 +.L159: + .cfi_restore 72 + .cfi_restore 73 + adrp x26, .LC1 + stp d8, d9, [sp, 96] + .cfi_offset 73, -616 + .cfi_offset 72, -624 + add x0, x26, :lo12:.LC1 + bl _gfortran_runtime_error +.L25: + mov w11, 0 + lsl x22, x21, 3 + sub w6, w24, #2 + b .L8 +.L160: + adrp x20, .LC2 + add x0, x20, :lo12:.LC2 + bl _gfortran_os_error + .cfi_endproc +.LFE0: + .size MAIN__, .-MAIN__ + .section .text.startup,"ax",@progbits + .align 2 + .p2align 4,,15 + .global main + .type main, %function +main: +.LFB1: + .cfi_startproc + stp x29, x30, [sp, -16]! + .cfi_def_cfa_offset 16 + .cfi_offset 29, -16 + .cfi_offset 30, -8 + mov x29, sp + bl _gfortran_set_args + adrp x1, .LANCHOR0 + mov w0, 7 + add x2, x1, :lo12:.LANCHOR0 + add x1, x2, 40 + bl _gfortran_set_options + bl MAIN__ + .cfi_endproc +.LFE1: + .size main, .-main + .section .rodata + .align 3 + .set .LANCHOR0,. 
+ 0 +.LC3: + .ascii "# Iterations: " + .zero 2 +.LC4: + .ascii " Performance: " + .zero 2 +.LC5: + .ascii " MLUPs" + .zero 2 + .type options.8.2753, %object + .size options.8.2753, 28 +options.8.2753: + .word 68 + .word 8191 + .word 0 + .word 1 + .word 1 + .word 0 + .word 31 + .section .rodata.cst8,"aM",@progbits,8 + .align 3 +.LC6: + .word 2576980378 + .word 1070176665 +.LC7: + .word 2696277389 + .word 1051772663 + .section .rodata.str1.8,"aMS",@progbits,1 + .align 3 +.LC0: + .string "gs.f90" + .zero 1 +.LC1: + .string "Integer overflow when calculating the amount of memory to allocate" + .zero 5 +.LC2: + .string "Allocation would exceed memory limit" + .ident "GCC: (ARM-build-8) 8.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/examples/gs/gs.s.zen.gcc.s b/examples/gs/gs.s.zen.gcc.s new file mode 100644 index 0000000..e4b854f --- /dev/null +++ b/examples/gs/gs.s.zen.gcc.s @@ -0,0 +1,1073 @@ + .file "gs.f90" + .text + .section .rodata.str1.1,"aMS",@progbits,1 +.LC0: + .string "gs.f90" + .section .rodata.str1.8,"aMS",@progbits,1 + .align 8 +.LC1: + .string "Integer overflow when calculating the amount of memory to allocate" + .align 8 +.LC2: + .string "Allocation would exceed memory limit" + .section .rodata.str1.1 +.LC8: + .string "# Iterations: " +.LC9: + .string " Performance: " +.LC11: + .string " MLUPs" + .text + .p2align 4 + .type MAIN__, @function +MAIN__: +.LFB0: + .cfi_startproc + pushq %r15 + .cfi_def_cfa_offset 16 + .cfi_offset 15, -16 + pushq %r14 + .cfi_def_cfa_offset 24 + .cfi_offset 14, -24 + movabsq $21474836608, %rax + movq $-1, %r14 + pushq %r13 + .cfi_def_cfa_offset 32 + .cfi_offset 13, -32 + pushq %r12 + .cfi_def_cfa_offset 40 + .cfi_offset 12, -40 + pushq %rbp + .cfi_def_cfa_offset 48 + .cfi_offset 6, -48 + pushq %rbx + .cfi_def_cfa_offset 56 + .cfi_offset 3, -56 + subq $664, %rsp + .cfi_def_cfa_offset 720 + leaq 128(%rsp), %rdi + movq %rax, 128(%rsp) + movq $.LC0, 136(%rsp) + movl $12, 144(%rsp) + call _gfortran_st_read + movl $4, %edx + leaq 80(%rsp), %rsi + leaq 128(%rsp), %rdi + call _gfortran_transfer_integer + movl $4, %edx + leaq 84(%rsp), %rsi + leaq 128(%rsp), %rdi + call _gfortran_transfer_integer + leaq 128(%rsp), %rdi + call _gfortran_st_read_done + movslq 80(%rsp), %rdi + movabsq $4611686018427387904, %rcx + movabsq $2305843009213693951, %r8 + movslq 84(%rsp), %rsi + testq %rdi, %rdi + movq %rdi, %r15 + movq %rdi, %r12 + movq %rdi, 16(%rsp) + cmovs %r14, %r15 + testq %rsi, %rsi + movq %rsi, %rbp + movq %rsi, 24(%rsp) + cmovns %rsi, %r14 + leaq 1(%r15), %rbx + xorl %edx, %edx + incq %r14 + imulq %rbx, %r14 + cmpq %rcx, %r14 + leaq (%r14,%r14), %r13 + sete %dl + cmpq %r8, %r13 + setg %r9b + movzbl %r9b, %r10d + addl %r10d, %edx + testq %rdi, %rdi + js .L37 + testq %rsi, %rsi + js .L37 + movq %r14, %r11 + salq $4, %r11 +.L2: + testl %edx, %edx + jne .L282 + testq %r11, %r11 + movl $1, %edi + cmovne %r11, %rdi + call malloc + testq %rax, %rax + je .L283 + cmpl $1, %ebp + jle .L5 + cmpl $1, %r12d + jle .L6 + leal -1(%r12), %r9d + movq %r13, %r10 + leal -2(%r12), %r11d + leaq (%rax,%r14,8), %rdi + movl %r9d, %ecx + movl %r9d, %r8d + subq %r14, %r10 + leaq 8(%rax), %rdx + shrl %ecx + andl $-2, %r8d + movl %r9d, 32(%rsp) + addq %rbx, %r10 + salq $4, %rcx + movl %r8d, 36(%rsp) + addq $2, %r15 + movq %rdi, 40(%rsp) + movq %rcx, 8(%rsp) + orl $1, %r9d + movl $1, (%rsp) + movq %r11, %r8 + movq %r11, 48(%rsp) + movq %rdx, 56(%rsp) + vxorps %xmm0, %xmm0, %xmm0 +.L14: + leaq -1(%r15), %rcx + leaq 1(%r10), %rsi + cmpq %rcx, %rsi + setl %dil + cmpq %r15, %r10 + 
setg %r11b + orl %r11d, %edi + andl $1, %edi + je .L9 + cmpl $3, %r8d + jbe .L9 + movq 8(%rsp), %r11 + leaq 0(,%r15,8), %rsi + xorl %edx, %edx + leaq (%rax,%rsi), %rdi + addq 40(%rsp), %rsi + subq $16, %r11 + shrq $4, %r11 + incq %r11 + andl $7, %r11d + je .L13 + cmpq $1, %r11 + je .L176 + cmpq $2, %r11 + je .L177 + cmpq $3, %r11 + je .L178 + cmpq $4, %r11 + je .L179 + cmpq $5, %r11 + je .L180 + cmpq $6, %r11 + je .L181 + vmovups %xmm0, (%rdi) + movl $16, %edx + vmovups %xmm0, (%rsi) +.L181: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx +.L180: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx +.L179: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx +.L178: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx +.L177: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx +.L176: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + addq $16, %rdx + cmpq 8(%rsp), %rdx + je .L155 +.L13: + vmovups %xmm0, (%rdi,%rdx) + vmovups %xmm0, (%rsi,%rdx) + vmovups %xmm0, 16(%rdi,%rdx) + vmovups %xmm0, 16(%rsi,%rdx) + vmovups %xmm0, 32(%rdi,%rdx) + vmovups %xmm0, 32(%rsi,%rdx) + vmovups %xmm0, 48(%rdi,%rdx) + vmovups %xmm0, 48(%rsi,%rdx) + vmovups %xmm0, 64(%rdi,%rdx) + vmovups %xmm0, 64(%rsi,%rdx) + vmovups %xmm0, 80(%rdi,%rdx) + vmovups %xmm0, 80(%rsi,%rdx) + vmovups %xmm0, 96(%rdi,%rdx) + vmovups %xmm0, 96(%rsi,%rdx) + vmovups %xmm0, 112(%rdi,%rdx) + vmovups %xmm0, 112(%rsi,%rdx) + subq $-128, %rdx + cmpq 8(%rsp), %rdx + jne .L13 +.L155: + movl 36(%rsp), %esi + cmpl %esi, 32(%rsp) + je .L16 + addq %r9, %rcx + movq $0x000000000, (%rax,%rcx,8) + leaq (%r10,%r9), %rcx + movq $0x000000000, (%rax,%rcx,8) +.L16: + incl (%rsp) + addq %rbx, %r10 + addq %rbx, %r15 + movl (%rsp), %r11d + cmpl %r11d, %ebp + jne .L14 +.L11: + movq 24(%rsp), %r11 + movl $0, %edx + movq %r13, %rsi + imulq %rbx, %r11 + testl %r12d, %r12d + cmovns %r12d, %edx + subq %r14, %rsi + movq %r11, %r10 + leaq 1(%r11), %r9 + subq %r14, %r10 + movq %r9, (%rsp) + leaq (%r10,%r13), %rcx + leaq 1(%r13,%r10), %r15 + leaq 1(%rsi), %r10 + cmpq %rcx, %r10 + setl %r9b + cmpq %rsi, %r15 + setl %dil + orl %edi, %r9d + cmpq %rcx, (%rsp) + setl %dil + cmpq %r15, %r11 + setg 8(%rsp) + orw 8(%rsp), %di + andl %edi, %r9d + cmpq %r10, %r11 + setg %dil + cmpq %rsi, (%rsp) + setl %sil + orl %edi, %esi + andl %r9d, %esi + andl $1, %esi + je .L20 + cmpq $2, %r10 + seta %r10b + cmpq $2, %r15 + seta %r15b + andl %r15d, %r10d + cmpq $2, (%rsp) + seta %dil + cmpl $2, %edx + seta %r9b + andl %r9d, %edi + andl %edi, %r10d + andl $1, %r10d + je .L20 + incl %edx + leaq (%rax,%rcx,8), %rdi + xorl %ecx, %ecx + vmovaps .LC4(%rip), %xmm1 + movl %edx, %r15d + leaq (%rax,%r11,8), %r9 + leaq (%rax,%r14,8), %rsi + vxorps %xmm2, %xmm2, %xmm2 + shrl %r15d + salq $4, %r15 + leaq -16(%r15), %r10 + shrq $4, %r10 + incq %r10 + andl $7, %r10d + je .L22 + cmpq $1, %r10 + je .L188 + cmpq $2, %r10 + je .L189 + cmpq $3, %r10 + je .L190 + cmpq $4, %r10 + je .L191 + cmpq $5, %r10 + je .L192 + cmpq $6, %r10 + je .L193 + vmovups %xmm1, (%r9) + movl $16, %ecx + vmovups %xmm1, (%rdi) + vmovups %xmm2, (%rax) + vmovups %xmm2, (%rsi) +.L193: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx +.L192: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx +.L191: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) 
+ vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx +.L190: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx +.L189: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx +.L188: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + addq $16, %rcx + cmpq %r15, %rcx + je .L113 +.L22: + vmovups %xmm1, (%r9,%rcx) + vmovups %xmm1, (%rdi,%rcx) + vmovups %xmm2, (%rax,%rcx) + vmovups %xmm2, (%rsi,%rcx) + vmovups %xmm1, 16(%r9,%rcx) + vmovups %xmm1, 16(%rdi,%rcx) + vmovups %xmm2, 16(%rax,%rcx) + vmovups %xmm2, 16(%rsi,%rcx) + vmovups %xmm1, 32(%r9,%rcx) + vmovups %xmm1, 32(%rdi,%rcx) + vmovups %xmm2, 32(%rax,%rcx) + vmovups %xmm2, 32(%rsi,%rcx) + vmovups %xmm1, 48(%r9,%rcx) + vmovups %xmm1, 48(%rdi,%rcx) + vmovups %xmm2, 48(%rax,%rcx) + vmovups %xmm2, 48(%rsi,%rcx) + vmovups %xmm1, 64(%r9,%rcx) + vmovups %xmm1, 64(%rdi,%rcx) + vmovups %xmm2, 64(%rax,%rcx) + vmovups %xmm2, 64(%rsi,%rcx) + vmovups %xmm1, 80(%r9,%rcx) + vmovups %xmm1, 80(%rdi,%rcx) + vmovups %xmm2, 80(%rax,%rcx) + vmovups %xmm2, 80(%rsi,%rcx) + vmovups %xmm1, 96(%r9,%rcx) + vmovups %xmm1, 96(%rdi,%rcx) + vmovups %xmm2, 96(%rax,%rcx) + vmovups %xmm2, 96(%rsi,%rcx) + vmovups %xmm1, 112(%r9,%rcx) + vmovups %xmm1, 112(%rdi,%rcx) + vmovups %xmm2, 112(%rax,%rcx) + vmovups %xmm2, 112(%rsi,%rcx) + subq $-128, %rcx + cmpq %r15, %rcx + jne .L22 +.L113: + movl %edx, %r9d + andl $-2, %r9d + testb $1, %dl + je .L10 + vmovsd .LC5(%rip), %xmm3 + movslq %r9d, %r15 + movq %r15, %rdi + leaq (%r11,%r15), %r11 + subq %r14, %rdi + leaq 0(%r13,%rdi), %rsi + vmovsd %xmm3, (%rax,%r11,8) + addq %r14, %r11 + vmovsd %xmm3, (%rax,%r11,8) + movq $0x000000000, (%rax,%r15,8) + movq $0x000000000, (%rax,%rsi,8) +.L10: + testl %ebp, %ebp + js .L18 +.L19: + vxorps %xmm5, %xmm5, %xmm5 + movq 16(%rsp), %r15 + leaq 0(,%rbx,8), %rdi + movl $1, %r9d + vcvtsi2sdl %edx, %xmm5, %xmm6 + vcvtsi2sdl %r12d, %xmm5, %xmm7 + vdivsd %xmm7, %xmm6, %xmm8 + leaq (%rax,%rdi), %r10 + movq %r15, %rdx + subq %r14, %rdx + leaq 0(%r13,%rdx), %rcx + subq %r14, %r13 + movl %ebp, %r14d + andl $7, %r14d + vmovsd %xmm8, (%rax) + vmovsd %xmm8, (%rax,%r13,8) + vmovsd %xmm8, (%rax,%r15,8) + vmovsd %xmm8, (%rax,%rcx,8) + cmpl $1, %ebp + jl .L18 + testl %r14d, %r14d + je .L27 + cmpl $1, %r14d + je .L200 + cmpl $2, %r14d + je .L201 + cmpl $3, %r14d + je .L202 + cmpl $4, %r14d + je .L203 + cmpl $5, %r14d + je .L204 + cmpl $6, %r14d + je .L205 + vmovsd %xmm8, (%r10) + movl $2, %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L205: + vmovsd %xmm8, (%r10) + incl %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L204: + vmovsd %xmm8, (%r10) + incl %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L203: + vmovsd %xmm8, (%r10) + incl %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L202: + vmovsd %xmm8, (%r10) + incl %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L201: + vmovsd %xmm8, (%r10) + incl %r9d + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 +.L200: + incl %r9d + vmovsd %xmm8, 
(%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + cmpl %r9d, %ebp + jl .L18 +.L27: + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + addl $8, %r9d + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + vmovsd %xmm8, (%r10) + vmovsd %xmm8, (%r10,%r13,8) + vmovsd %xmm8, (%r10,%r15,8) + vmovsd %xmm8, (%r10,%rcx,8) + addq %rdi, %r10 + cmpl %r9d, %ebp + jge .L27 +.L18: + movl %r8d, %r8d + leaq 0(,%rbx,8), %r13 + leaq (%rbx,%rbx), %rsi + movl $10, (%rsp) + addq %rbx, %r8 + leaq 8(%rax,%r13), %r11 + movq %rsi, 8(%rsp) + leaq 16(%rax,%r8,8), %rax + movq %r11, 16(%rsp) + movq %rax, 24(%rsp) +.L26: + leaq 96(%rsp), %rsi + leaq 112(%rsp), %rdi + xorl %r15d, %r15d + sall (%rsp) + call timing_ + movq .LC6(%rip), %rdx + vmovq %rdx, %xmm9 + .p2align 4 + .p2align 3 +.L30: + cmpl $1, %ebp + jle .L33 + cmpl $1, %r12d + jle .L33 + movq 24(%rsp), %r8 + movq 8(%rsp), %r14 + movq %rbx, %r9 + xorl %r11d, %r11d + movq 16(%rsp), %rdi + movl $1, %r10d + .p2align 4 + .p2align 3 +.L34: + movq %r8, %rdx + movq %r11, %rsi + movq %r14, %rcx + incl %r10d + subq %rdi, %rdx + subq %r9, %rsi + subq %r9, %rcx + vmovsd -8(%rdi), %xmm8 + subq $8, %rdx + movq %rdi, %rax + shrq $3, %rdx + incq %rdx + andl $7, %edx + je .L32 + cmpq $1, %rdx + je .L194 + cmpq $2, %rdx + je .L195 + cmpq $3, %rdx + je .L196 + cmpq $4, %rdx + je .L197 + cmpq $5, %rdx + je .L198 + cmpq $6, %rdx + je .L199 + vmovsd (%rdi,%rsi,8), %xmm10 + vaddsd (%rdi,%rcx,8), %xmm8, %xmm12 + leaq 8(%rdi), %rax + vaddsd 8(%rdi), %xmm10, %xmm11 + vaddsd %xmm12, %xmm11, %xmm13 + vmulsd %xmm9, %xmm13, %xmm8 + vmovsd %xmm8, (%rdi) +.L199: + vmovsd (%rax,%rsi,8), %xmm14 + vaddsd (%rax,%rcx,8), %xmm8, %xmm0 + vaddsd 8(%rax), %xmm14, %xmm15 + addq $8, %rax + vaddsd %xmm0, %xmm15, %xmm1 + vmulsd %xmm9, %xmm1, %xmm8 + vmovsd %xmm8, -8(%rax) +.L198: + vmovsd (%rax,%rsi,8), %xmm2 + vaddsd (%rax,%rcx,8), %xmm8, %xmm4 + vaddsd 8(%rax), %xmm2, %xmm3 + addq $8, %rax + vaddsd %xmm4, %xmm3, %xmm5 + vmulsd %xmm9, %xmm5, %xmm8 + vmovsd %xmm8, -8(%rax) +.L197: + vmovsd (%rax,%rsi,8), %xmm6 + vaddsd (%rax,%rcx,8), %xmm8, %xmm8 + vaddsd 8(%rax), %xmm6, %xmm7 + addq $8, %rax + vaddsd %xmm8, %xmm7, %xmm10 + vmulsd %xmm9, %xmm10, %xmm8 + vmovsd %xmm8, -8(%rax) +.L196: + vmovsd (%rax,%rsi,8), %xmm11 + vaddsd (%rax,%rcx,8), %xmm8, %xmm13 + vaddsd 8(%rax), %xmm11, %xmm12 + addq $8, %rax + vaddsd %xmm13, %xmm12, %xmm14 + vmulsd %xmm9, %xmm14, %xmm8 + vmovsd %xmm8, -8(%rax) +.L195: + vmovsd (%rax,%rsi,8), %xmm15 + vaddsd (%rax,%rcx,8), %xmm8, %xmm0 + vaddsd 8(%rax), %xmm15, %xmm1 + addq $8, %rax + vaddsd %xmm0, %xmm1, %xmm2 + vmulsd %xmm9, %xmm2, %xmm8 + vmovsd %xmm8, -8(%rax) +.L194: + vmovsd (%rax,%rsi,8), %xmm3 + vaddsd 
(%rax,%rcx,8), %xmm8, %xmm5 + vaddsd 8(%rax), %xmm3, %xmm4 + addq $8, %rax + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, -8(%rax) + cmpq %r8, %rax + je .L266 +# OSACA-BEGIN +.L32: + vmovsd (%rax,%rsi,8), %xmm7 + leaq 8(%rax), %rdx + vaddsd (%rax,%rcx,8), %xmm8, %xmm11 + vaddsd 8(%rax), %xmm7, %xmm10 + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, (%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + leaq 16(%rax), %rdx + vaddsd 16(%rax), %xmm14, %xmm15 + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm3 + vmovsd %xmm3, 8(%rax) + vmovsd (%rdx,%rsi,8), %xmm2 + vaddsd (%rdx,%rcx,8), %xmm3, %xmm5 + leaq 24(%rax), %rdx + vaddsd 24(%rax), %xmm2, %xmm4 + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, 16(%rax) + vmovsd (%rdx,%rsi,8), %xmm7 + vaddsd (%rdx,%rcx,8), %xmm8, %xmm11 + leaq 32(%rax), %rdx + vaddsd 32(%rax), %xmm7, %xmm10 + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, 24(%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + leaq 40(%rax), %rdx + vaddsd 40(%rax), %xmm14, %xmm15 + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm3 + vmovsd %xmm3, 32(%rax) + vmovsd (%rdx,%rsi,8), %xmm2 + vaddsd (%rdx,%rcx,8), %xmm3, %xmm5 + leaq 48(%rax), %rdx + vaddsd 48(%rax), %xmm2, %xmm4 + vaddsd %xmm5, %xmm4, %xmm6 + vmulsd %xmm9, %xmm6, %xmm8 + vmovsd %xmm8, 40(%rax) + vmovsd (%rdx,%rsi,8), %xmm7 + vaddsd (%rdx,%rcx,8), %xmm8, %xmm11 + leaq 56(%rax), %rdx + vaddsd 56(%rax), %xmm7, %xmm10 + addq $64, %rax + vaddsd %xmm11, %xmm10, %xmm12 + vmulsd %xmm9, %xmm12, %xmm13 + vmovsd %xmm13, -16(%rax) + vmovsd (%rdx,%rsi,8), %xmm14 + vaddsd (%rdx,%rcx,8), %xmm13, %xmm1 + vaddsd (%rax), %xmm14, %xmm15 + vaddsd %xmm1, %xmm15, %xmm0 + vmulsd %xmm9, %xmm0, %xmm8 + vmovsd %xmm8, -8(%rax) + cmpq %r8, %rax + jne .L32 +# OSACA-END +.L266: + addq %r13, %rdi + addq %rbx, %r11 + addq %rbx, %r9 + addq %rbx, %r14 + addq %r13, %r8 + cmpl %r10d, %ebp + jne .L34 +.L33: + leal 1(%r15), %r8d + cmpl (%rsp), %r8d + je .L29 + movl %r8d, %r15d + jmp .L30 +.L9: + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + leaq (%rax,%r15,8), %r11 + leaq (%rdi,%r15), %rdx + movq %r10, %rdi + leaq (%rsi,%rdx,8), %rdx + subq %rcx, %rdi + movq %rdx, %rsi + movq %rdi, %rcx + subq %r11, %rsi + subq $8, %rsi + shrq $3, %rsi + incq %rsi + andl $7, %esi + je .L17 + cmpq $1, %rsi + je .L182 + cmpq $2, %rsi + je .L183 + cmpq $3, %rsi + je .L184 + cmpq $4, %rsi + je .L185 + cmpq $5, %rsi + je .L186 + cmpq $6, %rsi + je .L187 + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rdi,8) + addq $8, %r11 +.L187: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 +.L186: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 +.L185: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 +.L184: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 +.L183: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 +.L182: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + addq $8, %r11 + cmpq %rdx, %r11 + je .L16 +.L17: + movq $0x000000000, (%r11) + movq $0x000000000, (%r11,%rcx,8) + movq $0x000000000, 8(%r11) + movq $0x000000000, 8(%r11,%rcx,8) + movq $0x000000000, 16(%r11) + movq $0x000000000, 16(%r11,%rcx,8) + movq $0x000000000, 24(%r11) + movq $0x000000000, 24(%r11,%rcx,8) + movq $0x000000000, 32(%r11) + movq $0x000000000, 32(%r11,%rcx,8) + movq 
$0x000000000, 40(%r11) + movq $0x000000000, 40(%r11,%rcx,8) + movq $0x000000000, 48(%r11) + movq $0x000000000, 48(%r11,%rcx,8) + movq $0x000000000, 56(%r11) + movq $0x000000000, 56(%r11,%rcx,8) + addq $64, %r11 + cmpq %rdx, %r11 + jne .L17 + jmp .L16 +.L37: + xorl %r11d, %r11d + jmp .L2 + .p2align 4 + .p2align 3 +.L29: + addl $2, %r15d + leaq 88(%rsp), %rsi + leaq 104(%rsp), %rdi + movl %r15d, 76(%rsp) + call timing_ + vmovsd 104(%rsp), %xmm9 + vsubsd 112(%rsp), %xmm9, %xmm3 + vcomisd .LC7(%rip), %xmm3 + jnb .L40 + cmpl $999999999, (%rsp) + jle .L26 +.L40: + movl (%rsp), %ebx + cmpl %ebx, %r15d + jle .L36 + movl %ebx, 76(%rsp) +.L36: + leaq 128(%rsp), %rdi + movabsq $25769803904, %r12 + vmovsd %xmm3, (%rsp) + movq $.LC0, 136(%rsp) + movl $72, 144(%rsp) + movq %r12, 128(%rsp) + decl %ebp + call _gfortran_st_write + movl $14, %edx + movl $.LC8, %esi + leaq 128(%rsp), %rdi + call _gfortran_transfer_character_write + movl $4, %edx + leaq 76(%rsp), %rsi + leaq 128(%rsp), %rdi + call _gfortran_transfer_integer_write + movl $14, %edx + movl $.LC9, %esi + leaq 128(%rsp), %rdi + call _gfortran_transfer_character_write + vxorps %xmm2, %xmm2, %xmm2 + vmovsd (%rsp), %xmm11 + movl $8, %edx + vcvtsi2sdl 76(%rsp), %xmm2, %xmm8 + vmulsd .LC10(%rip), %xmm8, %xmm7 + vcvtsi2sdl 32(%rsp), %xmm2, %xmm4 + vcvtsi2sdl %ebp, %xmm2, %xmm5 + vmulsd %xmm5, %xmm4, %xmm6 + leaq 120(%rsp), %rsi + leaq 128(%rsp), %rdi + vmulsd %xmm7, %xmm6, %xmm10 + vdivsd %xmm11, %xmm10, %xmm12 + vmovsd %xmm12, 120(%rsp) + call _gfortran_transfer_real_write + movl $6, %edx + movl $.LC11, %esi + leaq 128(%rsp), %rdi + call _gfortran_transfer_character_write + leaq 128(%rsp), %rdi + call _gfortran_st_write_done + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + call _gfortran_stop_string +.L5: + testl %r12d, %r12d + js .L38 +.L280: + leal -1(%r12), %r15d + leal -2(%r12), %r8d + movl %r15d, 32(%rsp) + jmp .L11 +.L6: + testl %r12d, %r12d + jns .L280 + leal -1(%r12), %esi + xorl %edx, %edx + leal -2(%r12), %r8d + movl %esi, 32(%rsp) + jmp .L19 +.L20: + vmovsd .LC5(%rip), %xmm4 + imulq $-8, %r14, %r10 + leaq (%rax,%r11,8), %r9 + addq %r13, %r11 + movl $1, %ecx + leaq (%r10,%r11,8), %r15 + leaq (%r10,%r13,8), %rdi + movl %r12d, %r11d + addq %rax, %r15 + addq %rax, %rdi + andl $7, %r11d + vmovsd %xmm4, (%r9) + vmovsd %xmm4, (%r15) + movq $0x000000000, (%rax) + movq $0x000000000, (%rdi) + cmpl $1, %r12d + jl .L45 + testl %r11d, %r11d + je .L25 + cmpl $1, %r11d + je .L206 + cmpl $2, %r11d + je .L207 + cmpl $3, %r11d + je .L208 + cmpl $4, %r11d + je .L209 + cmpl $5, %r11d + je .L210 + cmpl $6, %r11d + je .L211 + vmovsd %xmm4, 8(%r9) + movl $2, %ecx + vmovsd %xmm4, 8(%r15) + movq $0x000000000, 8(%rax) + movq $0x000000000, 8(%rdi) +.L211: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx +.L210: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx +.L209: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx +.L208: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx +.L207: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx +.L206: + vmovsd %xmm4, (%r9,%rcx,8) + vmovsd %xmm4, (%r15,%rcx,8) + movq 
$0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + incq %rcx + cmpl %ecx, %r12d + jl .L45 +.L25: + leaq 1(%rcx), %rsi + vmovsd %xmm4, (%r9,%rcx,8) + leaq 2(%rcx), %r10 + vmovsd %xmm4, (%r15,%rcx,8) + leaq 3(%rcx), %r11 + movq $0x000000000, (%rax,%rcx,8) + movq $0x000000000, (%rdi,%rcx,8) + vmovsd %xmm4, (%r9,%rsi,8) + vmovsd %xmm4, (%r15,%rsi,8) + movq $0x000000000, (%rax,%rsi,8) + movq $0x000000000, (%rdi,%rsi,8) + leaq 4(%rcx), %rsi + vmovsd %xmm4, (%r9,%r10,8) + vmovsd %xmm4, (%r15,%r10,8) + movq $0x000000000, (%rax,%r10,8) + movq $0x000000000, (%rdi,%r10,8) + leaq 5(%rcx), %r10 + vmovsd %xmm4, (%r9,%r11,8) + vmovsd %xmm4, (%r15,%r11,8) + movq $0x000000000, (%rax,%r11,8) + movq $0x000000000, (%rdi,%r11,8) + leaq 6(%rcx), %r11 + vmovsd %xmm4, (%r9,%rsi,8) + vmovsd %xmm4, (%r15,%rsi,8) + movq $0x000000000, (%rax,%rsi,8) + movq $0x000000000, (%rdi,%rsi,8) + leaq 7(%rcx), %rsi + addq $8, %rcx + vmovsd %xmm4, (%r9,%r10,8) + vmovsd %xmm4, (%r15,%r10,8) + movq $0x000000000, (%rax,%r10,8) + movq $0x000000000, (%rdi,%r10,8) + vmovsd %xmm4, (%r9,%r11,8) + vmovsd %xmm4, (%r15,%r11,8) + movq $0x000000000, (%rax,%r11,8) + movq $0x000000000, (%rdi,%r11,8) + vmovsd %xmm4, (%r9,%rsi,8) + vmovsd %xmm4, (%r15,%rsi,8) + movq $0x000000000, (%rax,%rsi,8) + movq $0x000000000, (%rdi,%rsi,8) + cmpl %ecx, %r12d + jge .L25 +.L45: + incl %edx + jmp .L10 +.L282: + movl $.LC1, %edi + xorl %eax, %eax + call _gfortran_runtime_error +.L38: + leal -1(%r12), %r8d + xorl %edx, %edx + movl %r8d, 32(%rsp) + leal -2(%r12), %r8d + jmp .L10 +.L283: + movl $.LC2, %edi + call _gfortran_os_error + .cfi_endproc +.LFE0: + .size MAIN__, .-MAIN__ + .section .text.startup,"ax",@progbits + .p2align 4 + .globl main + .type main, @function +main: +.LFB1: + .cfi_startproc + subq $8, %rsp + .cfi_def_cfa_offset 16 + call _gfortran_set_args + movl $options.9.4008, %esi + movl $7, %edi + call _gfortran_set_options + call MAIN__ + .cfi_endproc +.LFE1: + .size main, .-main + .section .rodata + .align 16 + .type options.9.4008, @object + .size options.9.4008, 28 +options.9.4008: + .long 2116 + .long 4095 + .long 0 + .long 1 + .long 1 + .long 0 + .long 31 + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.LC4: + .long 0 + .long 1072693248 + .long 0 + .long 1072693248 + .section .rodata.cst8,"aM",@progbits,8 + .align 8 +.LC5: + .long 0 + .long 1072693248 + .align 8 +.LC6: + .long 0 + .long 1070596096 + .align 8 +.LC7: + .long 2576980378 + .long 1070176665 + .align 8 +.LC10: + .long 2696277389 + .long 1051772663 + .ident "GCC: (GNU) 9.1.0" + .section .note.GNU-stack,"",@progbits diff --git a/examples/j2d/j2d.s.csx.gcc.AVX.s b/examples/j2d/j2d.s.csx.gcc.AVX.s new file mode 100644 index 0000000..fafe556 --- /dev/null +++ b/examples/j2d/j2d.s.csx.gcc.AVX.s @@ -0,0 +1,40 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L21: + vmovupd (%r8,%rax), %ymm11 + vmovupd (%rsi,%rax), %ymm13 + vaddpd (%r9,%rax), %ymm11, %ymm12 + vaddpd (%rdi,%rax), %ymm13, %ymm14 + vmovupd 32(%r8,%rax), %ymm1 + vmovupd 32(%rsi,%rax), %ymm2 + vaddpd %ymm14, %ymm12, %ymm15 + vaddpd 32(%r9,%rax), %ymm1, %ymm5 + vaddpd 32(%rdi,%rax), %ymm2, %ymm7 + vmulpd %ymm8, %ymm15, %ymm0 + vmovupd 64(%r8,%rax), %ymm10 + vaddpd %ymm7, %ymm5, %ymm6 + vmovupd 64(%rsi,%rax), %ymm12 + vmovupd 96(%rsi,%rax), %ymm5 + vmovupd %ymm0, (%rdx,%rax) + vmovupd 96(%r8,%rax), %ymm0 + vaddpd 64(%r9,%rax), 
%ymm10, %ymm11 + vaddpd 64(%rdi,%rax), %ymm12, %ymm13 + vaddpd 96(%r9,%rax), %ymm0, %ymm1 + vaddpd 96(%rdi,%rax), %ymm5, %ymm2 + vaddpd %ymm13, %ymm11, %ymm14 + vmulpd %ymm8, %ymm6, %ymm9 + vaddpd %ymm2, %ymm1, %ymm7 + vmulpd %ymm8, %ymm14, %ymm15 + vmulpd %ymm8, %ymm7, %ymm6 + vmovupd %ymm9, 32(%rdx,%rax) + vmovupd %ymm15, 64(%rdx,%rax) + vmovupd %ymm6, 96(%rdx,%rax) + subq $-128, %rax + cmpq %rax, %r15 + jne .L21 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/j2d/j2d.s.csx.gcc.SSE.s b/examples/j2d/j2d.s.csx.gcc.SSE.s new file mode 100644 index 0000000..5872046 --- /dev/null +++ b/examples/j2d/j2d.s.csx.gcc.SSE.s @@ -0,0 +1,46 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L28: + movupd 16(%r8,%rax), %xmm11 + movupd 16(%rdi,%rax), %xmm12 + movupd 16(%rsi,%rax), %xmm13 + addpd %xmm11, %xmm15 + addpd %xmm13, %xmm12 + movupd 32(%rdi,%rax), %xmm14 + movupd 32(%rsi,%rax), %xmm0 + addpd %xmm15, %xmm12 + movupd 32(%r8,%rax), %xmm15 + addpd %xmm0, %xmm14 + addpd %xmm15, %xmm11 + movupd 48(%rdi,%rax), %xmm1 + movupd 48(%rsi,%rax), %xmm7 + addpd %xmm11, %xmm14 + addpd %xmm7, %xmm1 + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + movups %xmm12, 16(%rcx,%rax) + movups %xmm14, 32(%rcx,%rax) + movupd 48(%r8,%rax), %xmm14 + addpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + mulpd %xmm2, %xmm1 + movups %xmm1, 48(%rcx,%rax) + addq $64, %rax +.L21: + movupd (%r8,%rax), %xmm15 + movupd (%rdi,%rax), %xmm0 + movupd (%rsi,%rax), %xmm1 + addpd %xmm15, %xmm14 + addpd %xmm1, %xmm0 + leaq 16(%rax), %r10 + addpd %xmm0, %xmm14 + mulpd %xmm2, %xmm14 + movups %xmm14, (%rcx,%rax) + cmpq %r10, %r14 + jne .L28 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/j2d/j2d.s.csx.icc.AVX.s b/examples/j2d/j2d.s.csx.icc.AVX.s new file mode 100644 index 0000000..bb94516 --- /dev/null +++ b/examples/j2d/j2d.s.csx.icc.AVX.s @@ -0,0 +1,37 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.47: # Preds ..B1.47 ..B1.46 + # Execution count [1.15e+04] + vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5 + vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5 + vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5 + vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5 + vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5 + vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5 + vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5 + vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5 + vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5 + vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5 + vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5 + vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5 + vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5 + vmulpd %ymm4, %ymm0, %ymm5 #94.5 + vmulpd %ymm9, %ymm0, %ymm10 #94.5 + vmulpd %ymm14, %ymm0, %ymm15 #94.5 + vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5 + vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5 + vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5 + vmovupd 
%ymm15, 10080(%r9,%rcx,8) #94.5 + vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5 + vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5 + vmulpd %ymm4, %ymm0, %ymm5 #94.5 + vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5 + addq $16, %rcx #94.5 + cmpq %r14, %rcx #94.5 + jb ..B1.47 # Prob 82% #94.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/j2d/j2d.s.csx.icc.AVX512.s b/examples/j2d/j2d.s.csx.icc.AVX512.s new file mode 100644 index 0000000..82e98cd --- /dev/null +++ b/examples/j2d/j2d.s.csx.icc.AVX512.s @@ -0,0 +1,69 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.47: # Preds ..B1.63 ..B1.46 + # Execution count [1.15e+04] + lea (%r12,%r11), %r8 #94.5 + # LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4 +..B1.48: # Preds ..B1.47 + # Execution count [1.73e+04] + vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5 + vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5 + # LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4 +..B1.51: # Preds ..B1.48 + # Execution count [1.15e+04] + lea (%r12,%r11), %r8 #94.5 + vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5 + vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5 + vaddpd %zmm2, %zmm1, %zmm2 #94.5 + vmulpd %zmm2, %zmm4, %zmm3 #94.5 + vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5 + # LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4 +..B1.52: # Preds ..B1.51 + # Execution count [1.73e+04] + vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5 + vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5 + # LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4 +..B1.55: # Preds ..B1.52 + # Execution count [1.15e+04] + lea (%r12,%r11), %r8 #94.5 + vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5 + vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5 + vaddpd %zmm2, %zmm1, %zmm2 #94.5 + vmulpd %zmm2, %zmm4, %zmm3 #94.5 + vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5 + # LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4 +..B1.56: # Preds ..B1.55 + # Execution count [1.73e+04] + vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5 + vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5 + # LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4 +..B1.59: # Preds ..B1.56 + # Execution count [1.15e+04] + lea (%r12,%r11), %r8 #94.5 + vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5 + vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5 + vaddpd %zmm2, %zmm1, %zmm2 #94.5 + vmulpd %zmm2, %zmm4, %zmm3 #94.5 + vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5 + # LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4 +..B1.60: # Preds ..B1.59 + # Execution count [1.73e+04] + vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5 + vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5 + # LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4 +..B1.63: # Preds ..B1.60 + # Execution count [1.15e+04] + vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5 + vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5 + vaddpd %zmm2, %zmm1, %zmm2 #94.5 + vmulpd %zmm2, %zmm4, %zmm3 #94.5 + vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5 + addq $32, %rcx #94.5 + cmpq %r14, %rcx #94.5 + jb ..B1.47 # Prob 82% #94.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/j2d/j2d.s.csx.icc.SSE.s 
b/examples/j2d/j2d.s.csx.icc.SSE.s new file mode 100644 index 0000000..a560be1 --- /dev/null +++ b/examples/j2d/j2d.s.csx.icc.SSE.s @@ -0,0 +1,40 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.42: # Preds ..B1.42 ..B1.41 + # Execution count [1.15e+04] + movups 10016(%r8,%rcx,8), %xmm0 #94.5 + addpd 16(%r12,%rcx,8), %xmm0 #94.5 + addpd 20032(%r10,%rcx,8), %xmm0 #94.5 + movups 10032(%r8,%rcx,8), %xmm2 #94.5 + movups 32(%r12,%rcx,8), %xmm1 #94.5 + addpd %xmm2, %xmm0 #94.5 + addpd %xmm1, %xmm2 #94.5 + mulpd %xmm7, %xmm0 #94.5 + addpd 20048(%r10,%rcx,8), %xmm2 #94.5 + movups 10048(%r8,%rcx,8), %xmm4 #94.5 + movups 48(%r12,%rcx,8), %xmm3 #94.5 + addpd %xmm4, %xmm2 #94.5 + addpd %xmm3, %xmm4 #94.5 + mulpd %xmm7, %xmm2 #94.5 + addpd 20064(%r10,%rcx,8), %xmm4 #94.5 + movups 10064(%r8,%rcx,8), %xmm6 #94.5 + movups 64(%r12,%rcx,8), %xmm5 #94.5 + addpd %xmm6, %xmm4 #94.5 + addpd %xmm5, %xmm6 #94.5 + mulpd %xmm7, %xmm4 #94.5 + addpd 20080(%r10,%rcx,8), %xmm6 #94.5 + addpd 10080(%r8,%rcx,8), %xmm6 #94.5 + mulpd %xmm7, %xmm6 #94.5 + movups %xmm0, 10016(%r9,%rcx,8) #94.5 + movups %xmm2, 10032(%r9,%rcx,8) #94.5 + movups %xmm4, 10048(%r9,%rcx,8) #94.5 + movups %xmm6, 10064(%r9,%rcx,8) #94.5 + addq $8, %rcx #94.5 + cmpq %r14, %rcx #94.5 + jb ..B1.42 # Prob 82% #94.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/j2d/j2d.s.tx2.clang.s b/examples/j2d/j2d.s.tx2.clang.s new file mode 100644 index 0000000..8a8eb8a --- /dev/null +++ b/examples/j2d/j2d.s.tx2.clang.s @@ -0,0 +1,131 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_16 Depth=1 + // Parent Loop BB1_19 Depth=2 + // Parent Loop BB1_24 Depth=3 + // => This Inner Loop Header: Depth=4 + add x0, x5, x16 + add x18, x21, x16 + ldp q4, q5, [x0, #16] + ldp q6, q7, [x0, #48] + ldur q0, [x18, #8] + ldur q1, [x18, #24] + ldur q2, [x18, #40] + ldur q3, [x18, #56] + add x1, x28, x16 + add x15, x15, #32 // =32 + fadd v0.2d, v4.2d, v0.2d + fadd v4.2d, v5.2d, v1.2d + fadd v5.2d, v6.2d, v2.2d + fadd v6.2d, v7.2d, v3.2d + ldp q7, q16, [x1, #16] + fadd v1.2d, v7.2d, v1.2d + ldp q17, q18, [x1, #48] + ldur q19, [x18, #72] + fadd v0.2d, v0.2d, v1.2d + fadd v1.2d, v16.2d, v2.2d + fadd v2.2d, v17.2d, v3.2d + fadd v3.2d, v18.2d, v19.2d + ldp q16, q17, [x0, #80] + ldp q18, q19, [x0, #112] + fadd v1.2d, v4.2d, v1.2d + fadd v2.2d, v5.2d, v2.2d + fadd v3.2d, v6.2d, v3.2d + ldur q4, [x18, #72] + ldur q5, [x18, #88] + ldur q6, [x18, #104] + ldur q7, [x18, #120] + fadd v4.2d, v16.2d, v4.2d + fadd v16.2d, v17.2d, v5.2d + fadd v17.2d, v18.2d, v6.2d + fadd v18.2d, v19.2d, v7.2d + ldp q19, q20, [x1, #80] + fadd v5.2d, v19.2d, v5.2d + ldp q21, q22, [x1, #112] + ldur q23, [x18, #136] + fadd v4.2d, v4.2d, v5.2d + fadd v5.2d, v20.2d, v6.2d + fadd v6.2d, v21.2d, v7.2d + fadd v7.2d, v22.2d, v23.2d + ldp q20, q21, [x0, #144] + ldp q22, q23, [x0, #176] + fadd v5.2d, v16.2d, v5.2d + fadd v6.2d, v17.2d, v6.2d + fadd v7.2d, v18.2d, v7.2d + ldur q16, [x18, #136] + ldur q17, [x18, #152] + ldur q18, [x18, #168] + ldur q19, [x18, #184] + fadd v16.2d, v20.2d, v16.2d + fadd v20.2d, v21.2d, v17.2d + fadd v21.2d, v22.2d, v18.2d + fadd v22.2d, v23.2d, v19.2d + ldp q23, q24, [x1, #144] + fadd v17.2d, v23.2d, v17.2d + ldp 
q25, q26, [x1, #176] + fadd v16.2d, v16.2d, v17.2d + fadd v17.2d, v24.2d, v18.2d + fadd v18.2d, v25.2d, v19.2d + ldp q24, q25, [x0, #208] + ldur q23, [x18, #200] + fadd v17.2d, v20.2d, v17.2d + fadd v18.2d, v21.2d, v18.2d + ldur q20, [x18, #200] + ldur q21, [x18, #216] + fadd v19.2d, v26.2d, v23.2d + fadd v20.2d, v24.2d, v20.2d + fadd v24.2d, v25.2d, v21.2d + ldp q25, q26, [x1, #208] + fadd v21.2d, v25.2d, v21.2d + fadd v20.2d, v20.2d, v21.2d + ldp q21, q25, [x0, #240] + fadd v19.2d, v22.2d, v19.2d + ldur q22, [x18, #232] + fadd v21.2d, v21.2d, v22.2d + fadd v22.2d, v26.2d, v22.2d + fadd v22.2d, v24.2d, v22.2d + ldp q24, q26, [x1, #240] + ldur q23, [x18, #248] + fadd v25.2d, v25.2d, v23.2d + fadd v23.2d, v24.2d, v23.2d + add x18, x18, #264 // =264 + fmul v0.2d, v0.2d, v28.2d + fmul v1.2d, v1.2d, v28.2d + fmul v2.2d, v2.2d, v28.2d + fmul v5.2d, v5.2d, v28.2d + fadd v21.2d, v21.2d, v23.2d + ldr q23, [x18] + add x18, x25, x16 + stur q0, [x18, #8] + stur q1, [x18, #24] + fmul v3.2d, v3.2d, v28.2d + stur q2, [x18, #40] + fadd v23.2d, v26.2d, v23.2d + stur q5, [x18, #88] + fmul v4.2d, v4.2d, v28.2d + stur q3, [x18, #56] + fmul v6.2d, v6.2d, v28.2d + stur q4, [x18, #72] + fmul v0.2d, v7.2d, v28.2d + stur q6, [x18, #104] + fmul v1.2d, v16.2d, v28.2d + stur q0, [x18, #120] + fmul v2.2d, v17.2d, v28.2d + stur q1, [x18, #136] + fmul v4.2d, v19.2d, v28.2d + stur q2, [x18, #152] + fadd v5.2d, v25.2d, v23.2d + stur q4, [x18, #184] + fmul v3.2d, v18.2d, v28.2d + stur q3, [x18, #168] + fmul v6.2d, v20.2d, v28.2d + stur q6, [x18, #200] + fmul v0.2d, v22.2d, v28.2d + stur q0, [x18, #216] + fmul v1.2d, v21.2d, v28.2d + stur q1, [x18, #232] + add x16, x16, #256 // =256 + fmul v2.2d, v5.2d, v28.2d + stur q2, [x18, #248] + adds x17, x17, #4 // =4 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/j2d/j2d.s.tx2.gcc.s b/examples/j2d/j2d.s.tx2.gcc.s new file mode 100644 index 0000000..d75591e --- /dev/null +++ b/examples/j2d/j2d.s.tx2.gcc.s @@ -0,0 +1,43 @@ + // OSACA-BEGIN +.L93: + add x5, x0, 16 + ldr q2, [x14, x0] + ldr q5, [x25, x0] + add x7, x0, 32 + ldr q13, [x22, x0] + ldr q4, [x25, x5] + add x6, x0, 48 + ldr x9, [sp, 144] + ldr q19, [x22, x5] + ldr q7, [x14, x5] + ldr q6, [x14, x7] + ldr q3, [x25, x7] + ldr q18, [x22, x7] + fadd v17.2d, v2.2d, v30.2d + ldr q16, [x14, x6] + ldr q20, [x25, x6] + fadd v23.2d, v5.2d, v13.2d + ldr q22, [x22, x6] + fadd v24.2d, v4.2d, v19.2d + fadd v25.2d, v7.2d, v2.2d + fadd v27.2d, v6.2d, v7.2d + fadd v26.2d, v3.2d, v18.2d + fadd v28.2d, v16.2d, v6.2d + mov v30.16b, v16.16b + fadd v29.2d, v20.2d, v22.2d + fadd v31.2d, v23.2d, v17.2d + fadd v0.2d, v24.2d, v25.2d + fadd v2.2d, v26.2d, v27.2d + fadd v1.2d, v29.2d, v28.2d + fmul v5.2d, v31.2d, v21.2d + fmul v13.2d, v0.2d, v21.2d + fmul v4.2d, v2.2d, v21.2d + fmul v19.2d, v1.2d, v21.2d + str q5, [x28, x0] + add x0, x0, 64 + str q13, [x28, x5] + str q4, [x28, x7] + str q19, [x28, x6] + cmp x9, x0 + bne .L93 + // OSACA-END diff --git a/examples/j2d/j2d.s.zen.gcc.s b/examples/j2d/j2d.s.zen.gcc.s new file mode 100644 index 0000000..420d610 --- /dev/null +++ b/examples/j2d/j2d.s.zen.gcc.s @@ -0,0 +1,36 @@ + # OSACA-BEGIN +.L28: + vmovups (%r10,%rcx), %xmm5 + vmovups 32(%r10,%rax), %xmm13 + vmovups (%rdi,%rcx), %xmm1 + vmovups 32(%rdi,%rax), %xmm14 + vmovups 48(%rdi,%rax), %xmm9 + vaddpd (%r8,%rcx), %xmm1, %xmm10 + vaddpd 32(%r8,%rax), %xmm14, %xmm15 + vaddpd 48(%r8,%rax), %xmm9, %xmm1 + vaddpd %xmm5, %xmm8, %xmm8 + vaddpd %xmm13, %xmm5, %xmm6 + vmovups 48(%r10,%rax), %xmm5 + vaddpd %xmm8, %xmm10, %xmm11 + vaddpd %xmm6, %xmm15, %xmm0 + 
vmulpd %xmm2, %xmm11, %xmm12 + vaddpd %xmm5, %xmm13, %xmm4 + vmulpd %xmm2, %xmm0, %xmm7 + vaddpd %xmm4, %xmm1, %xmm10 + vmovups %xmm12, (%rsi,%rcx) + vmovups %xmm7, 32(%rsi,%rax) + vmulpd %xmm2, %xmm10, %xmm8 + vmovups %xmm8, 48(%rsi,%rax) + addq $64, %rax +.L21: + vmovups (%r10,%rax), %xmm8 + leaq 16(%rax), %rcx + vmovups (%rdi,%rax), %xmm9 + vaddpd (%r8,%rax), %xmm9, %xmm10 + vaddpd %xmm8, %xmm5, %xmm11 + vaddpd %xmm11, %xmm10, %xmm12 + vmulpd %xmm2, %xmm12, %xmm13 + vmovups %xmm13, (%rsi,%rax) + cmpq %rcx, %r14 + jne .L28 + # OSACA-END diff --git a/examples/striad/striad.s.csx.gcc.s b/examples/striad/striad.s.csx.gcc.s new file mode 100644 index 0000000..3904a89 --- /dev/null +++ b/examples/striad/striad.s.csx.gcc.s @@ -0,0 +1,44 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vmovupd (%r15,%rax), %ymm5 + vmovupd 0(%r13,%rax), %ymm6 + vmovupd 32(%r15,%rax), %ymm8 + vmovupd 32(%r13,%rax), %ymm7 + vmovupd 64(%r15,%rax), %ymm9 + vmovupd 64(%r13,%rax), %ymm10 + vmovupd 96(%r15,%rax), %ymm11 + vmovupd 96(%r13,%rax), %ymm12 + vmovupd 128(%r15,%rax), %ymm13 + vmovupd 128(%r13,%rax), %ymm14 + vmovupd 160(%r15,%rax), %ymm15 + vmovupd 160(%r13,%rax), %ymm2 + vmovupd 192(%r15,%rax), %ymm0 + vmovupd 192(%r13,%rax), %ymm1 + vmovupd 224(%r15,%rax), %ymm3 + vmovupd 224(%r13,%rax), %ymm4 + vfmadd132pd (%r14,%rax), %ymm6, %ymm5 + vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8 + vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9 + vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11 + vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13 + vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15 + vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0 + vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3 + vmovupd %ymm5, (%r12,%rax) + vmovupd %ymm8, 32(%r12,%rax) + vmovupd %ymm9, 64(%r12,%rax) + vmovupd %ymm11, 96(%r12,%rax) + vmovupd %ymm13, 128(%r12,%rax) + vmovupd %ymm15, 160(%r12,%rax) + vmovupd %ymm0, 192(%r12,%rax) + vmovupd %ymm3, 224(%r12,%rax) + addq $256, %rax + cmpq %rax, %r8 + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/striad/striad.s.csx.icc.s b/examples/striad/striad.s.csx.icc.s new file mode 100644 index 0000000..7946191 --- /dev/null +++ b/examples/striad/striad.s.csx.icc.s @@ -0,0 +1,21 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.41: # Preds ..B1.41 ..B1.40 + # Execution count [2.22e+03] + vmovups (%rcx,%rax,8), %zmm2 #80.5 + vmovups 64(%rcx,%rax,8), %zmm4 #80.5 + vmovups (%r14,%rax,8), %zmm1 #80.5 + vmovups 64(%r14,%rax,8), %zmm3 #80.5 + vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5 + vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5 + vmovupd %zmm2, (%r13,%rax,8) #80.5 + vmovupd %zmm4, 64(%r13,%rax,8) #80.5 + addq $16, %rax #80.5 + cmpq %r12, %rax #80.5 + jb ..B1.41 # Prob 82% #80.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/striad/striad.s.tx2.clang.s 
b/examples/striad/striad.s.tx2.clang.s new file mode 100644 index 0000000..be76d46 --- /dev/null +++ b/examples/striad/striad.s.tx2.clang.s @@ -0,0 +1,112 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q0, q1, [x9, #-256] + ldp q2, q3, [x9, #-224] + ldp q4, q5, [x10, #-256] + ldp q6, q7, [x10, #-224] + ldp q16, q17, [x11, #-256] + ldp q18, q19, [x11, #-224] + fmla v0.2d, v16.2d, v4.2d + fmla v1.2d, v17.2d, v5.2d + stp q1, q0, [sp, #96] // 32-byte Folded Spill + fmla v2.2d, v18.2d, v6.2d + fmla v3.2d, v19.2d, v7.2d + ldp q4, q5, [x9, #-192] + ldp q6, q7, [x9, #-160] + ldp q16, q17, [x10, #-192] + ldp q18, q19, [x10, #-160] + ldp q20, q21, [x11, #-192] + ldp q22, q23, [x11, #-160] + fmla v4.2d, v20.2d, v16.2d + stp q3, q4, [x12, #-208] + fmla v5.2d, v21.2d, v17.2d + fmla v6.2d, v22.2d, v18.2d + stp q5, q6, [x12, #-176] + fmla v7.2d, v23.2d, v19.2d + ldp q16, q18, [x9, #-128] + ldp q17, q19, [x9, #-96] + ldp q20, q21, [x10, #-128] + ldp q22, q23, [x10, #-96] + ldp q24, q25, [x11, #-128] + ldp q26, q27, [x11, #-96] + fmla v16.2d, v24.2d, v20.2d + stp q7, q16, [x12, #-144] + fmla v18.2d, v25.2d, v21.2d + fmla v17.2d, v26.2d, v22.2d + stp q18, q17, [x12, #-112] + fmla v19.2d, v27.2d, v23.2d + ldp q22, q23, [x9, #-64] + ldp q20, q21, [x9, #-32] + ldp q24, q25, [x10, #-64] + ldp q26, q27, [x10, #-32] + ldp q28, q29, [x11, #-64] + ldp q30, q31, [x11, #-32] + fmla v22.2d, v28.2d, v24.2d + stp q19, q22, [x12, #-80] + fmla v23.2d, v29.2d, v25.2d + fmla v20.2d, v30.2d, v26.2d + stp q23, q20, [x12, #-48] + fmla v21.2d, v31.2d, v27.2d + stur q21, [x12, #-16] + ldp q24, q25, [x9] + ldp q26, q27, [x9, #32] + ldp q28, q29, [x10] + ldp q30, q31, [x10, #32] + ldp q8, q10, [x11] + ldp q11, q12, [x11, #32] + fmla v24.2d, v8.2d, v28.2d + fmla v25.2d, v10.2d, v29.2d + stp q24, q25, [x12] + fmla v26.2d, v11.2d, v30.2d + fmla v27.2d, v12.2d, v31.2d + stp q26, q27, [x12, #32] + ldp q28, q29, [x9, #64] + ldp q30, q31, [x9, #96] + ldp q8, q10, [x10, #64] + ldp q11, q12, [x10, #96] + ldp q13, q14, [x11, #64] + ldp q15, q9, [x11, #96] + fmla v28.2d, v13.2d, v8.2d + fmla v29.2d, v14.2d, v10.2d + stp q28, q29, [x12, #64] + fmla v30.2d, v15.2d, v11.2d + fmla v31.2d, v9.2d, v12.2d + stp q30, q31, [x12, #96] + ldp q8, q9, [x9, #128] + ldp q12, q13, [x10, #128] + ldp q14, q15, [x11, #128] + ldp q10, q11, [x9, #160] + fmla v8.2d, v14.2d, v12.2d + ldp q12, q14, [x10, #160] + fmla v9.2d, v15.2d, v13.2d + stp q8, q9, [x12, #128] + ldp q13, q15, [x11, #160] + fmla v10.2d, v13.2d, v12.2d + fmla v11.2d, v15.2d, v14.2d + stp q10, q11, [x12, #160] + ldp q12, q13, [x9, #192] + ldp q14, q15, [x10, #192] + ldp q0, q1, [x11, #192] + fmla v12.2d, v0.2d, v14.2d + ldr q0, [sp, #112] // 16-byte Folded Reload + stur q0, [x12, #-256] + ldr q0, [sp, #96] // 16-byte Folded Reload + stp q0, q2, [x12, #-240] + ldp q0, q2, [x9, #224] + ldp q3, q4, [x10, #224] + ldp q5, q6, [x11, #224] + fmla v13.2d, v1.2d, v15.2d + stp q12, q13, [x12, #192] + fmla v0.2d, v5.2d, v3.2d + fmla v2.2d, v6.2d, v4.2d + stp q0, q2, [x12, #224] + add x8, x8, #64 // =64 + add x12, x12, #512 // =512 + add x11, x11, #512 // =512 + add x10, x10, #512 // =512 + add x9, x9, #512 // =512 + adds x13, x13, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/striad/striad.s.tx2.gcc.s b/examples/striad/striad.s.tx2.gcc.s new file mode 100644 index 0000000..8c455bb --- /dev/null +++ b/examples/striad/striad.s.tx2.gcc.s @@ -0,0 +1,53 @@ + // OSACA-BEGIN +.L17: + add x12, x11, 16 + ldr 
q29, [x22, x11] + ldr q30, [x20, x11] + add x7, x11, 32 + ldr q31, [x21, x11] + ldr q7, [x22, x12] + add x6, x11, 48 + add x5, x11, 64 + ldr q6, [x20, x12] + ldr q2, [x21, x12] + add x8, x11, 80 + add x0, x11, 96 + ldr q9, [x22, x7] + ldr q5, [x20, x7] + add x13, x11, 112 + ldr q1, [x21, x7] + ldr q16, [x22, x6] + ldr q4, [x20, x6] + ldr q0, [x21, x6] + fmla v30.2d, v29.2d, v31.2d + ldr q23, [x22, x5] + ldr q3, [x20, x5] + fmla v6.2d, v7.2d, v2.2d + ldr q22, [x21, x5] + ldr q21, [x22, x8] + ldr q24, [x20, x8] + ldr q20, [x21, x8] + fmla v5.2d, v9.2d, v1.2d + ldr q19, [x22, x0] + ldr q25, [x20, x0] + fmla v4.2d, v16.2d, v0.2d + ldr q18, [x21, x0] + ldr q17, [x22, x13] + ldr q26, [x20, x13] + ldr q27, [x21, x13] + fmla v3.2d, v23.2d, v22.2d + fmla v24.2d, v21.2d, v20.2d + str q30, [x19, x11] + add x11, x11, 128 + str q6, [x19, x12] + fmla v25.2d, v19.2d, v18.2d + str q5, [x19, x7] + fmla v26.2d, v17.2d, v27.2d + str q4, [x19, x6] + str q3, [x19, x5] + str q24, [x19, x8] + str q25, [x19, x0] + str q26, [x19, x13] + cmp x25, x11 + bne .L17 + // OSACA-END diff --git a/examples/striad/striad.s.zen.gcc.s b/examples/striad/striad.s.zen.gcc.s new file mode 100644 index 0000000..711ba1d --- /dev/null +++ b/examples/striad/striad.s.zen.gcc.s @@ -0,0 +1,38 @@ + # OSACA-BEGIN +.L19: + vmovups (%r14,%rax), %xmm0 + vmovups (%r12,%rax), %xmm5 + vmovups 16(%r14,%rax), %xmm3 + vmovups 16(%r12,%rax), %xmm6 + vmovups 32(%r14,%rax), %xmm4 + vmovups 32(%r12,%rax), %xmm7 + vmovups 48(%r14,%rax), %xmm8 + vmovups 48(%r12,%rax), %xmm9 + vmovups 64(%r14,%rax), %xmm10 + vmovups 64(%r12,%rax), %xmm11 + vmovups 80(%r14,%rax), %xmm12 + vmovups 80(%r12,%rax), %xmm13 + vmovups 96(%r14,%rax), %xmm14 + vmovups 96(%r12,%rax), %xmm15 + vmovups 112(%r14,%rax), %xmm2 + vmovups 112(%r12,%rax), %xmm1 + vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0 + vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3 + vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4 + vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8 + vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10 + vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12 + vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14 + vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2 + vmovups %xmm0, 0(%rbp,%rax) + vmovups %xmm3, 16(%rbp,%rax) + vmovups %xmm4, 32(%rbp,%rax) + vmovups %xmm8, 48(%rbp,%rax) + vmovups %xmm10, 64(%rbp,%rax) + vmovups %xmm12, 80(%rbp,%rax) + vmovups %xmm14, 96(%rbp,%rax) + vmovups %xmm2, 112(%rbp,%rax) + subq $-128, %rax + cmpq %rcx, %rax + jne .L19 + # OSACA-END diff --git a/examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s b/examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s new file mode 100644 index 0000000..08c0523 --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s @@ -0,0 +1,46 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + # LLVM-MCA-BEGIN +.L19: + vmovupd (%rcx), %ymm4 + vmovupd 32(%rcx), %ymm13 + vaddsd %xmm4, %xmm0, %xmm6 + vunpckhpd %xmm4, %xmm4, %xmm3 + vextractf64x2 $0x1, %ymm4, %xmm8 + vaddsd %xmm6, %xmm3, %xmm7 + vunpckhpd %xmm8, %xmm8, %xmm11 + vunpckhpd %xmm13, %xmm13, %xmm1 + vaddsd %xmm7, %xmm8, %xmm10 + vextractf64x2 $0x1, %ymm13, %xmm2 + vunpckhpd %xmm2, %xmm2, %xmm3 + vaddsd %xmm11, %xmm10, %xmm12 + vmovupd 64(%rcx), %ymm8 + vmovupd 96(%rcx), %ymm5 + vaddsd %xmm13, %xmm12, %xmm0 + vunpckhpd %xmm8, %xmm8, %xmm12 + vextractf64x2 $0x1, %ymm8, %xmm14 + vaddsd %xmm0, %xmm1, %xmm4 + vunpckhpd %xmm14, %xmm14, %xmm0 + 
vextractf64x2 $0x1, %ymm5, %xmm9 + vaddsd %xmm4, %xmm2, %xmm6 + subq $-128, %rcx + vaddsd %xmm3, %xmm6, %xmm7 + vaddsd %xmm8, %xmm7, %xmm11 + vunpckhpd %xmm5, %xmm5, %xmm7 + vaddsd %xmm11, %xmm12, %xmm13 + vunpckhpd %xmm9, %xmm9, %xmm12 + vaddsd %xmm13, %xmm14, %xmm1 + vaddsd %xmm0, %xmm1, %xmm4 + vaddsd %xmm5, %xmm4, %xmm3 + vaddsd %xmm3, %xmm7, %xmm8 + vaddsd %xmm8, %xmm9, %xmm11 + vaddsd %xmm12, %xmm11, %xmm0 + cmpq %rcx, %r15 + jne .L19 + # LLVM-MCA-END + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/sum_reduction/sum_reduction.s.csx.gcc.s b/examples/sum_reduction/sum_reduction.s.csx.gcc.s new file mode 100644 index 0000000..cc000a5 --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.csx.gcc.s @@ -0,0 +1,20 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vaddpd (%rcx), %ymm3, %ymm4 + addq $256, %rcx + vaddpd -224(%rcx), %ymm4, %ymm5 + vaddpd -192(%rcx), %ymm5, %ymm6 + vaddpd -160(%rcx), %ymm6, %ymm8 + vaddpd -128(%rcx), %ymm8, %ymm9 + vaddpd -96(%rcx), %ymm9, %ymm10 + vaddpd -64(%rcx), %ymm10, %ymm11 + vaddpd -32(%rcx), %ymm11, %ymm3 + cmpq %rcx, %r15 + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/sum_reduction/sum_reduction.s.csx.icc.s b/examples/sum_reduction/sum_reduction.s.csx.icc.s new file mode 100644 index 0000000..7a7038c --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.csx.icc.s @@ -0,0 +1,17 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.38: # Preds ..B1.38 ..B1.37 + # Execution count [2.22e+03] + vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5 + vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5 + vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5 + vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5 + addq $32, %rax #76.5 + cmpq %r14, %rax #76.5 + jb ..B1.38 # Prob 82% #76.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/sum_reduction/sum_reduction.s.tx2.clang.s b/examples/sum_reduction/sum_reduction.s.tx2.clang.s new file mode 100644 index 0000000..83658c5 --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.tx2.clang.s @@ -0,0 +1,57 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q4, q5, [x9, #-256] + fadd v0.2d, v4.2d, v0.2d + fadd v1.2d, v5.2d, v1.2d + ldp q4, q5, [x9, #-192] + ldp q16, q17, [x9, #-128] + fadd v4.2d, v4.2d, v16.2d + ldp q6, q7, [x9, #-224] + fadd v2.2d, v6.2d, v2.2d + fadd v3.2d, v7.2d, v3.2d + fadd v0.2d, v0.2d, v4.2d + fadd v4.2d, v5.2d, v17.2d + ldp q6, q7, [x9, #-160] + ldp q18, q19, [x9, #-96] + ldp q16, q17, [x9] + add x8, x8, #64 // =64 + fadd v1.2d, v1.2d, v4.2d + fadd 
v4.2d, v6.2d, v18.2d + fadd v2.2d, v2.2d, v4.2d + fadd v4.2d, v7.2d, v19.2d + ldp q6, q7, [x9, #-32] + ldp q18, q19, [x9, #32] + fadd v6.2d, v6.2d, v18.2d + fadd v7.2d, v7.2d, v19.2d + fadd v3.2d, v3.2d, v4.2d + ldp q4, q5, [x9, #-64] + fadd v4.2d, v4.2d, v16.2d + fadd v5.2d, v5.2d, v17.2d + ldp q16, q17, [x9, #64] + fadd v4.2d, v4.2d, v16.2d + fadd v5.2d, v5.2d, v17.2d + ldp q16, q17, [x9, #128] + fadd v0.2d, v0.2d, v16.2d + fadd v1.2d, v1.2d, v17.2d + ldp q16, q17, [x9, #192] + ldp q18, q19, [x9, #96] + fadd v6.2d, v6.2d, v18.2d + fadd v7.2d, v7.2d, v19.2d + fadd v4.2d, v4.2d, v16.2d + ldp q18, q19, [x9, #160] + fadd v2.2d, v2.2d, v18.2d + fadd v3.2d, v3.2d, v19.2d + fadd v0.2d, v0.2d, v4.2d + fadd v4.2d, v5.2d, v17.2d + ldp q18, q19, [x9, #224] + add x9, x9, #512 // =512 + fadd v1.2d, v1.2d, v4.2d + fadd v4.2d, v6.2d, v18.2d + fadd v2.2d, v2.2d, v4.2d + fadd v4.2d, v7.2d, v19.2d + fadd v3.2d, v3.2d, v4.2d + adds x10, x10, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s b/examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s new file mode 100644 index 0000000..b192341 --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s @@ -0,0 +1,47 @@ + // OSACA-BEGIN +.L17: + mov x17, x16 + ldr q4, [x17], 16 + ldr q5, [x16, 16] + add x16, x16, 128 + ldr q3, [x16, -80] + ldr q2, [x16, -64] + ldr q0, [x16, -48] + ldr q1, [x16, -32] + ldr q7, [x16, -16] + dup d16, v4.d[0] + dup d6, v4.d[1] + ldr q4, [x17, 16] + dup d22, v5.d[0] + dup d5, v5.d[1] + dup d20, v3.d[0] + dup d3, v3.d[1] + dup d19, v2.d[0] + dup d2, v2.d[1] + dup d21, v4.d[0] + dup d4, v4.d[1] + fadd d10, d8, d16 + dup d18, v0.d[0] + dup d0, v0.d[1] + dup d8, v1.d[0] + dup d1, v1.d[1] + dup d17, v7.d[0] + dup d7, v7.d[1] + fadd d23, d6, d10 + fadd d24, d23, d22 + fadd d25, d5, d24 + fadd d26, d25, d21 + fadd d27, d4, d26 + fadd d28, d27, d20 + fadd d29, d3, d28 + fadd d30, d29, d19 + fadd d31, d2, d30 + fadd d16, d31, d18 + fadd d6, d0, d16 + fadd d22, d6, d8 + fadd d5, d1, d22 + fadd d20, d5, d17 + fadd d8, d7, d20 + cmp x22, x16 + bne .L17 + // OSACA-END diff --git a/examples/sum_reduction/sum_reduction.s.tx2.gcc.s b/examples/sum_reduction/sum_reduction.s.tx2.gcc.s new file mode 100644 index 0000000..752cd5b --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.tx2.gcc.s @@ -0,0 +1,23 @@ + // OSACA-BEGIN +.L17: + mov x17, x16 + ldr q10, [x17], 16 + ldr q16, [x16, 16] + add x16, x16, 128 + ldr q17, [x16, -80] + ldr q18, [x16, -64] + ldr q19, [x16, -48] + ldr q20, [x16, -32] + ldr q21, [x16, -16] + fadd v22.2d, v1.2d, v10.2d + ldr q23, [x17, 16] + fadd v24.2d, v22.2d, v16.2d + fadd v25.2d, v24.2d, v23.2d + fadd v26.2d, v25.2d, v17.2d + fadd v27.2d, v26.2d, v18.2d + fadd v28.2d, v27.2d, v19.2d + fadd v29.2d, v28.2d, v20.2d + fadd v1.2d, v29.2d, v21.2d + cmp x22, x16 + bne .L17 + // OSACA-END diff --git a/examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s b/examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s new file mode 100644 index 0000000..ce152c5 --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s @@ -0,0 +1,38 @@ + # OSACA-BEGIN +.L19: + vmovsd (%r10), %xmm8 + vmovsd 8(%r10), %xmm10 + subq $-128, %r10 + vmovsd -112(%r10), %xmm12 + vmovsd -104(%r10), %xmm14 + vmovsd -96(%r10), %xmm1 + vmovsd -88(%r10), %xmm2 + vmovsd -80(%r10), %xmm3 + vmovsd -72(%r10), %xmm6 + vaddsd %xmm8, %xmm7, %xmm9 + vmovsd -64(%r10), %xmm8 + vaddsd %xmm9, %xmm10, %xmm11 + vmovsd -56(%r10), %xmm10 + vaddsd %xmm12, %xmm11, %xmm13 + vmovsd -48(%r10), %xmm12 + vaddsd 
%xmm13, %xmm14, %xmm15 + vmovsd -40(%r10), %xmm14 + vaddsd %xmm1, %xmm15, %xmm4 + vmovsd -32(%r10), %xmm1 + vaddsd %xmm4, %xmm2, %xmm0 + vmovsd -24(%r10), %xmm2 + vaddsd %xmm3, %xmm0, %xmm5 + vmovsd -16(%r10), %xmm3 + vaddsd %xmm5, %xmm6, %xmm7 + vmovsd -8(%r10), %xmm6 + vaddsd %xmm8, %xmm7, %xmm9 + vaddsd %xmm9, %xmm10, %xmm11 + vaddsd %xmm12, %xmm11, %xmm13 + vaddsd %xmm13, %xmm14, %xmm15 + vaddsd %xmm1, %xmm15, %xmm4 + vaddsd %xmm4, %xmm2, %xmm0 + vaddsd %xmm3, %xmm0, %xmm5 + vaddsd %xmm5, %xmm6, %xmm7 + cmpq %r10, %r14 + jne .L19 + # OSACA-END diff --git a/examples/sum_reduction/sum_reduction.s.zen.gcc.s b/examples/sum_reduction/sum_reduction.s.zen.gcc.s new file mode 100644 index 0000000..4cb1ece --- /dev/null +++ b/examples/sum_reduction/sum_reduction.s.zen.gcc.s @@ -0,0 +1,14 @@ + # OSACA-BEGIN +.L19: + vaddpd (%r10), %xmm3, %xmm1 + subq $-128, %r10 + vaddpd -112(%r10), %xmm1, %xmm4 + vaddpd -96(%r10), %xmm4, %xmm5 + vaddpd -80(%r10), %xmm5, %xmm6 + vaddpd -64(%r10), %xmm6, %xmm8 + vaddpd -48(%r10), %xmm8, %xmm9 + vaddpd -32(%r10), %xmm9, %xmm10 + vaddpd -16(%r10), %xmm10, %xmm3 + cmpq %r10, %r14 + jne .L19 + # OSACA-END diff --git a/examples/triad/triad.s.csx.gcc.s b/examples/triad/triad.s.csx.gcc.s new file mode 100644 index 0000000..ae160e5 --- /dev/null +++ b/examples/triad/triad.s.csx.gcc.s @@ -0,0 +1,36 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vmovupd (%r14,%rsi), %ymm14 + vmovupd 32(%r14,%rsi), %ymm15 + vmovupd 64(%r14,%rsi), %ymm1 + vmovupd 96(%r14,%rsi), %ymm0 + vmovupd 128(%r14,%rsi), %ymm3 + vmovupd 160(%r14,%rsi), %ymm4 + vmovupd 192(%r14,%rsi), %ymm5 + vmovupd 224(%r14,%rsi), %ymm7 + vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14 + vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15 + vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1 + vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0 + vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3 + vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4 + vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5 + vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7 + vmovupd %ymm14, (%r12,%rsi) + vmovupd %ymm15, 32(%r12,%rsi) + vmovupd %ymm1, 64(%r12,%rsi) + vmovupd %ymm0, 96(%r12,%rsi) + vmovupd %ymm3, 128(%r12,%rsi) + vmovupd %ymm4, 160(%r12,%rsi) + vmovupd %ymm5, 192(%r12,%rsi) + vmovupd %ymm7, 224(%r12,%rsi) + addq $256, %rsi + cmpq %rsi, %rcx + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/triad/triad.s.csx.icc.s b/examples/triad/triad.s.csx.icc.s new file mode 100644 index 0000000..9cf06fb --- /dev/null +++ b/examples/triad/triad.s.csx.icc.s @@ -0,0 +1,16 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.40: # Preds ..B1.40 ..B1.39 + # Execution count [2.22e+03] + vmovups (%r13,%rax,8), %zmm1 #78.5 + vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5 + vmovupd %zmm1, (%r14,%rax,8) #78.5 + addq $8, %rax #78.5 + cmpq %r12, %rax #78.5 + jb ..B1.40 # Prob 82% #78.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER 
UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/triad/triad.s.tx2.clang.s b/examples/triad/triad.s.tx2.clang.s new file mode 100644 index 0000000..c7e78e0 --- /dev/null +++ b/examples/triad/triad.s.tx2.clang.s @@ -0,0 +1,118 @@ + // OSACA-BEGIN +.LBB1_29: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q2, q3, [x9, #-256] + ldp q0, q1, [x9, #-224] + ldp q4, q5, [x10, #-256] + ldp q6, q7, [x10, #-224] + fmla v2.2d, v4.2d, v16.2d + fmla v3.2d, v5.2d, v16.2d + stp q2, q3, [x11, #-256] + fmla v0.2d, v6.2d, v16.2d + fmla v1.2d, v7.2d, v16.2d + stp q0, q1, [x11, #-224] + ldp q6, q7, [x9, #-192] + ldp q16, q17, [x10, #-192] + ldr q20, [sp, #80] // 16-byte Folded Reload + fmla v6.2d, v16.2d, v20.2d + ldr q16, [sp, #80] // 16-byte Folded Reload + ldp q4, q5, [x9, #-160] + ldp q18, q19, [x10, #-160] + fmla v7.2d, v17.2d, v16.2d + stp q6, q7, [x11, #-192] + ldr q16, [sp, #80] // 16-byte Folded Reload + fmla v4.2d, v18.2d, v16.2d + ldr q16, [sp, #80] // 16-byte Folded Reload + fmla v5.2d, v19.2d, v16.2d + stp q4, q5, [x11, #-160] + ldp q17, q19, [x9, #-128] + ldp q20, q21, [x10, #-128] + ldr q24, [sp, #80] // 16-byte Folded Reload + fmla v17.2d, v20.2d, v24.2d + ldr q20, [sp, #80] // 16-byte Folded Reload + ldp q16, q18, [x9, #-96] + ldp q22, q23, [x10, #-96] + fmla v19.2d, v21.2d, v20.2d + stp q17, q19, [x11, #-128] + ldr q20, [sp, #80] // 16-byte Folded Reload + fmla v16.2d, v22.2d, v20.2d + ldr q20, [sp, #80] // 16-byte Folded Reload + ldp q24, q25, [x10, #-64] + fmla v18.2d, v23.2d, v20.2d + stp q16, q18, [x11, #-96] + ldp q20, q22, [x9, #-64] + ldr q28, [sp, #80] // 16-byte Folded Reload + fmla v20.2d, v24.2d, v28.2d + ldr q24, [sp, #80] // 16-byte Folded Reload + ldp q21, q23, [x9, #-32] + ldp q26, q27, [x10, #-32] + fmla v22.2d, v25.2d, v24.2d + stp q20, q22, [x11, #-64] + ldr q24, [sp, #80] // 16-byte Folded Reload + fmla v21.2d, v26.2d, v24.2d + ldr q24, [sp, #80] // 16-byte Folded Reload + ldp q28, q29, [x10] + ldr q8, [sp, #80] // 16-byte Folded Reload + ldp q30, q31, [x10, #32] + ldr q9, [sp, #80] // 16-byte Folded Reload + fmla v23.2d, v27.2d, v24.2d + stp q21, q23, [x11, #-32] + ldp q24, q25, [x9] + fmla v24.2d, v28.2d, v8.2d + ldr q28, [sp, #80] // 16-byte Folded Reload + ldp q26, q27, [x9, #32] + ldp q8, q10, [x10, #64] + ldp q11, q12, [x10, #96] + fmla v25.2d, v29.2d, v28.2d + stp q24, q25, [x11] + ldr q28, [sp, #80] // 16-byte Folded Reload + fmla v26.2d, v30.2d, v28.2d + ldr q28, [sp, #80] // 16-byte Folded Reload + ldp q13, q14, [x10, #128] + ldr q2, [sp, #80] // 16-byte Folded Reload + ldp q1, q3, [x10, #192] + fmla v27.2d, v31.2d, v28.2d + stp q26, q27, [x11, #32] + ldp q28, q29, [x9, #64] + fmla v28.2d, v8.2d, v9.2d + ldr q8, [sp, #80] // 16-byte Folded Reload + ldp q30, q31, [x9, #96] + ldr q9, [sp, #80] // 16-byte Folded Reload + ldr q6, [sp, #80] // 16-byte Folded Reload + ldr q5, [sp, #80] // 16-byte Folded Reload + fmla v29.2d, v10.2d, v8.2d + stp q28, q29, [x11, #64] + ldr q8, [sp, #80] // 16-byte Folded Reload + fmla v30.2d, v11.2d, v8.2d + ldr q8, [sp, #80] // 16-byte Folded Reload + ldr q16, [sp, #80] // 16-byte Folded Reload + add x8, x8, #64 // =64 + fmla v31.2d, v12.2d, v8.2d + stp q30, q31, [x11, #96] + ldp q8, q10, [x9, #128] + fmla v8.2d, v13.2d, v9.2d + ldr q9, [sp, #80] // 16-byte Folded Reload + ldp q11, q12, [x9, #160] + fmla v10.2d, v14.2d, v9.2d + stp q8, q10, [x11, #128] + ldp q13, q14, [x10, #160] + fmla v12.2d, v14.2d, v2.2d + ldp q2, q0, [x9, #192] + ldr q9, 
[sp, #80] // 16-byte Folded Reload + fmla v2.2d, v1.2d, v6.2d + ldp q1, q4, [x9, #224] + fmla v0.2d, v3.2d, v5.2d + stp q2, q0, [x11, #192] + ldp q3, q5, [x10, #224] + fmla v11.2d, v13.2d, v9.2d + stp q11, q12, [x11, #160] + fmla v1.2d, v3.2d, v16.2d + fmla v4.2d, v5.2d, v16.2d + stp q1, q4, [x11, #224] + add x11, x11, #512 // =512 + add x10, x10, #512 // =512 + add x9, x9, #512 // =512 + adds x12, x12, #8 // =8 + b.ne .LBB1_29 + // OSACA-END diff --git a/examples/triad/triad.s.tx2.gcc.s b/examples/triad/triad.s.tx2.gcc.s new file mode 100644 index 0000000..9fe4798 --- /dev/null +++ b/examples/triad/triad.s.tx2.gcc.s @@ -0,0 +1,45 @@ + // OSACA-BEGIN +.L17: + add x0, x10, 16 + ldr q23, [x20, x10] + ldr q24, [x21, x10] + add x7, x10, 32 + ldr q25, [x20, x0] + ldr q26, [x21, x0] + add x6, x10, 48 + add x5, x10, 64 + ldr q27, [x20, x7] + ldr q28, [x21, x7] + add x4, x10, 80 + add x11, x10, 96 + ldr q29, [x20, x6] + ldr q30, [x21, x6] + add x2, x10, 112 + fmla v23.2d, v3.2d, v24.2d + ldr q31, [x20, x5] + ldr q4, [x21, x5] + fmla v25.2d, v3.2d, v26.2d + ldr q2, [x20, x4] + ldr q5, [x21, x4] + fmla v27.2d, v3.2d, v28.2d + ldr q1, [x20, x11] + ldr q6, [x21, x11] + fmla v29.2d, v3.2d, v30.2d + ldr q0, [x20, x2] + ldr q7, [x21, x2] + fmla v31.2d, v3.2d, v4.2d + fmla v2.2d, v3.2d, v5.2d + fmla v1.2d, v3.2d, v6.2d + str q23, [x19, x10] + add x10, x10, 128 + fmla v0.2d, v3.2d, v7.2d + str q25, [x19, x0] + str q27, [x19, x7] + str q29, [x19, x6] + str q31, [x19, x5] + str q2, [x19, x4] + str q1, [x19, x11] + str q0, [x19, x2] + cmp x24, x10 + bne .L17 + // OSACA-END diff --git a/examples/triad/triad.s.zen.gcc.s b/examples/triad/triad.s.zen.gcc.s new file mode 100644 index 0000000..4513061 --- /dev/null +++ b/examples/triad/triad.s.zen.gcc.s @@ -0,0 +1,30 @@ + # OSACA-BEGIN +.L19: + vmovups 0(%r13,%rax), %xmm12 + vmovups 16(%r13,%rax), %xmm13 + vmovups 32(%r13,%rax), %xmm14 + vmovups 48(%r13,%rax), %xmm15 + vmovups 64(%r13,%rax), %xmm1 + vmovups 80(%r13,%rax), %xmm0 + vmovups 96(%r13,%rax), %xmm4 + vmovups 112(%r13,%rax), %xmm5 + vfmadd213pd (%r12,%rax), %xmm3, %xmm12 + vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13 + vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14 + vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15 + vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1 + vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0 + vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4 + vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5 + vmovups %xmm12, 0(%rbp,%rax) + vmovups %xmm13, 16(%rbp,%rax) + vmovups %xmm14, 32(%rbp,%rax) + vmovups %xmm15, 48(%rbp,%rax) + vmovups %xmm1, 64(%rbp,%rax) + vmovups %xmm0, 80(%rbp,%rax) + vmovups %xmm4, 96(%rbp,%rax) + vmovups %xmm5, 112(%rbp,%rax) + subq $-128, %rax + cmpq %rbx, %rax + jne .L19 + # OSACA-END diff --git a/examples/update/update.s.csx.gcc.s b/examples/update/update.s.csx.gcc.s new file mode 100644 index 0000000..085ce83 --- /dev/null +++ b/examples/update/update.s.csx.gcc.s @@ -0,0 +1,28 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +.L19: + vmulpd (%rcx), %ymm3, %ymm12 + vmulpd 32(%rcx), %ymm3, %ymm13 + vmulpd 64(%rcx), %ymm3, %ymm14 + vmulpd 96(%rcx), %ymm3, %ymm15 + vmulpd 128(%rcx), %ymm3, %ymm0 + vmulpd 160(%rcx), %ymm3, %ymm1 + vmulpd 192(%rcx), %ymm3, %ymm7 + vmulpd 224(%rcx), %ymm3, %ymm4 + vmovupd %ymm12, (%rcx) + vmovupd %ymm13, 32(%rcx) + vmovupd %ymm14, 64(%rcx) + vmovupd %ymm15, 96(%rcx) + vmovupd %ymm0, 128(%rcx) + vmovupd %ymm1, 
160(%rcx) + vmovupd %ymm7, 192(%rcx) + vmovupd %ymm4, 224(%rcx) + addq $256, %rcx + cmpq %r15, %rcx + jne .L19 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/update/update.s.csx.icc.s b/examples/update/update.s.csx.icc.s new file mode 100644 index 0000000..bb71a55 --- /dev/null +++ b/examples/update/update.s.csx.icc.s @@ -0,0 +1,17 @@ + movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY +..B1.38: # Preds ..B1.38 ..B1.37 + # Execution count [2.22e+03] + vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5 + vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5 + vmovupd %zmm1, (%r13,%rax,8) #75.5 + vmovupd %zmm2, 64(%r13,%rax,8) #75.5 + addq $16, %rax #75.5 + cmpq %r14, %rax #75.5 + jb ..B1.38 # Prob 82% #75.5 + movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY + .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY diff --git a/examples/update/update.s.tx2.clang.s b/examples/update/update.s.tx2.clang.s new file mode 100644 index 0000000..a090807 --- /dev/null +++ b/examples/update/update.s.tx2.clang.s @@ -0,0 +1,15 @@ + // OSACA-BEGIN +.LBB1_32: // Parent Loop BB1_20 Depth=1 + // Parent Loop BB1_22 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp q0, q1, [x8] + ldp q2, q3, [x8, #-32] + fmul v2.2d, v2.2d, v26.2d + fmul v3.2d, v3.2d, v26.2d + stp q2, q3, [x8, #-32] + fmul v0.2d, v0.2d, v26.2d + fmul v1.2d, v1.2d, v26.2d + stp q0, q1, [x8], #64 + adds x9, x9, #1 // =1 + b.ne .LBB1_32 + // OSACA-END diff --git a/examples/update/update.s.tx2.gcc.s b/examples/update/update.s.tx2.gcc.s new file mode 100644 index 0000000..8cf7d9e --- /dev/null +++ b/examples/update/update.s.tx2.gcc.s @@ -0,0 +1,31 @@ + // OSACA-BEGIN +.L17: + ldr q23, [x16] + mov x17, x16 + add x16, x16, 128 + fmul v24.2d, v23.2d, v2.2d + str q24, [x17], 16 + ldr q25, [x16, -112] + fmul v26.2d, v25.2d, v2.2d + str q26, [x16, -112] + ldr q27, [x17, 16] + fmul v28.2d, v27.2d, v2.2d + str q28, [x17, 16] + ldr q29, [x16, -80] + ldr q30, [x16, -64] + ldr q31, [x16, -48] + ldr q1, [x16, -32] + ldr q0, [x16, -16] + fmul v5.2d, v29.2d, v2.2d + fmul v4.2d, v30.2d, v2.2d + fmul v3.2d, v31.2d, v2.2d + fmul v6.2d, v1.2d, v2.2d + fmul v7.2d, v0.2d, v2.2d + str q5, [x16, -80] + str q4, [x16, -64] + str q3, [x16, -48] + str q6, [x16, -32] + str q7, [x16, -16] + cmp x22, x16 + bne .L17 + // OSACA-END diff --git a/examples/update/update.s.zen.gcc.s b/examples/update/update.s.zen.gcc.s new file mode 100644 index 0000000..176d978 --- /dev/null +++ b/examples/update/update.s.zen.gcc.s @@ -0,0 +1,22 @@ + # OSACA-BEGIN +.L19: + vmulpd (%r10), %xmm3, %xmm11 + subq $-128, %r10 + vmulpd -112(%r10), %xmm3, %xmm12 + vmulpd -96(%r10), %xmm3, %xmm13 + vmulpd -80(%r10), %xmm3, %xmm14 + vmulpd -64(%r10), %xmm3, %xmm15 + vmulpd -48(%r10), %xmm3, %xmm0 + vmovups %xmm11, -128(%r10) + vmulpd -32(%r10), %xmm3, %xmm7 + vmovups %xmm12, -112(%r10) + vmulpd -16(%r10), %xmm3, %xmm1 + vmovups %xmm13, -96(%r10) + vmovups %xmm14, -80(%r10) + vmovups %xmm15, -64(%r10) + vmovups %xmm0, -48(%r10) + vmovups %xmm7, -32(%r10) + vmovups %xmm1, -16(%r10) + cmpq %r10, %r14 + jne .L19 + # OSACA-END
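
For reference, the loop bodies behind the kernels in this part of the patch are the usual streaming benchmarks. The C sketch below is an illustration, not the original benchmark source; the identifiers (a, b, c, d, s, n) are placeholders. It matches the generated code above: STRIAD and TRIAD compile to one fused multiply-add per element (fmla on TX2, vfmadd* on CSX and Zen), the sum reduction to chained fadd/vaddpd accumulations, and UPDATE to an in-place multiply.

```c
#include <stddef.h>

/* STRIAD (Schoenauer triad): three load streams, one store stream. */
void striad(double *a, const double *b, const double *c, const double *d,
            size_t n) {
    for (size_t i = 0; i < n; ++i)
        a[i] = b[i] + c[i] * d[i]; /* one FMA per element */
}

/* Sum reduction: a single serial FP-add dependency chain unless the
 * compiler splits it across several accumulators (above, icc uses four
 * zmm accumulators and clang on TX2 uses eight v registers, while gcc
 * keeps one chain). */
double sum_reduction(const double *a, size_t n) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i)
        s += a[i];
    return s;
}

/* TRIAD (STREAM triad): like STRIAD, but with a loop-invariant scalar. */
void triad(double *a, const double *b, const double *c, double s, size_t n) {
    for (size_t i = 0; i < n; ++i)
        a[i] = b[i] + s * c[i];
}

/* UPDATE: in-place scaling; the load and store streams hit the same array. */
void update(double *a, double s, size_t n) {
    for (size_t i = 0; i < n; ++i)
        a[i] = s * a[i];
}
```

The unrolling factors also differ between compilers: gcc emits eight-way unrolled SIMD loops on all three targets, while clang on TX2 unrolls much further and runs out of vector registers, which is why its STRIAD and TRIAD kernels contain the "Folded Spill"/"Folded Reload" stack accesses visible above.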