added example kernels

This commit is contained in:
JanLJL
2020-02-03 13:19:18 +01:00
parent f5489621fa
commit cadedeba7b
52 changed files with 7214 additions and 0 deletions

43
examples/README.md Normal file
View File

@@ -0,0 +1,43 @@
# Examples
We collected sample kernels for the user to run examples with OSACA.
The assembly files contain only the extracted and already marked kernels of code compiled on Intel Cascade Lake (CSX), AMD Zen, and Marvell ThunderX2 (TX2) systems, but they can be run on any system that supports the ISA and is supported by OSACA.
The compilers used were Intel Parallel Studio 19.0up05 and GNU 9.1.0 for the x86 systems, and ARM HPC Compiler for Linux version 19.2 and GNU 8.2.0 for the ARM-based TX2.
To analyze the kernels with OSACA, run
```
osaca --arch ARCH filepath
```
While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (`*.csx.*.s`) use the byte markers so that they can be analyzed by IACA as well.
To analyze them with IACA, run
```
iaca -arch SKX filepath
```
------------
The kernels will be explained briefly in the following.
### Copy
```c
double * restrict a, * restrict b;
for(long i=0; i < size; ++i){
a[i] = b[i];
}
```
### Vector add
### Vector update
### Sum reduction
### DAXPY
### STREAM triad
### Schönauer triad
### Gauss-Seidel method
### Jacobi 2D

View File

@@ -0,0 +1,36 @@
# Marked loop kernel, AT&T syntax. The region is delimited by IACA byte
# markers: "movl $111, %ebx" opens it and "movl $222, %ebx" closes it, each
# followed by the bytes 100, 103, 144.
# Per iteration of .L19, eight 256-bit packed-double vectors are processed:
#   load   disp(%r14,%rax)            into a ymm register
#   add    disp(%r13,%rax)            to it (vaddpd with a memory source)
#   store  the sum to disp(%r12,%rax)
# %rax is a byte offset shared by all three streams; it advances by 256
# (8 x 32 bytes) and the loop repeats until it equals %rcx.
# NOTE(review): presumably the "Vector add" example; the file name is not
# visible in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
# Loads from the (%r14)-based stream.
vmovupd (%r14,%rax), %ymm3
vmovupd 32(%r14,%rax), %ymm4
vmovupd 64(%r14,%rax), %ymm6
vmovupd 96(%r14,%rax), %ymm9
vmovupd 128(%r14,%rax), %ymm11
vmovupd 160(%r14,%rax), %ymm13
vmovupd 192(%r14,%rax), %ymm15
vmovupd 224(%r14,%rax), %ymm0
# Adds with the (%r13)-based stream as memory operand; each sum gets a fresh destination register.
vaddpd 0(%r13,%rax), %ymm3, %ymm7
vaddpd 32(%r13,%rax), %ymm4, %ymm5
vaddpd 64(%r13,%rax), %ymm6, %ymm8
vaddpd 96(%r13,%rax), %ymm9, %ymm10
vaddpd 128(%r13,%rax), %ymm11, %ymm12
vaddpd 160(%r13,%rax), %ymm13, %ymm14
vaddpd 192(%r13,%rax), %ymm15, %ymm1
vaddpd 224(%r13,%rax), %ymm0, %ymm2
# Stores of the sums to the (%r12)-based stream.
vmovupd %ymm7, (%r12,%rax)
vmovupd %ymm5, 32(%r12,%rax)
vmovupd %ymm8, 64(%r12,%rax)
vmovupd %ymm10, 96(%r12,%rax)
vmovupd %ymm12, 128(%r12,%rax)
vmovupd %ymm14, 160(%r12,%rax)
vmovupd %ymm1, 192(%r12,%rax)
vmovupd %ymm2, 224(%r12,%rax)
# Advance the byte offset and loop until it reaches the bound in %rcx.
addq $256, %rax
cmpq %rax, %rcx
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,19 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of ..B1.40, two 512-bit packed-double vectors are processed:
# load from disp(%rcx,%rax,8), add disp(%r13,%rax,8), store to disp(%r14,%rax,8).
# %rax is an index scaled by 8 in every address; it advances by 16 per
# iteration and the loop continues while %rax is below %r12 (unsigned, jb).
# NOTE(review): presumably the "Vector add" example; the file name is not
# visible in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.40: # Preds ..B1.40 ..B1.39
# Execution count [2.22e+03]
vmovups (%rcx,%rax,8), %zmm1 #78.5
vmovups 64(%rcx,%rax,8), %zmm3 #78.5
vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
vmovupd %zmm2, (%r14,%rax,8) #78.5
vmovupd %zmm4, 64(%r14,%rax,8) #78.5
addq $16, %rax #78.5
cmpq %r12, %rax #78.5
jb ..B1.40 # Prob 82% #78.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,91 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// Each iteration covers byte offsets -256 .. +255 around the pointers x9, x10
// and x11, i.e. 512 bytes per stream, using 128-bit q registers:
//   ldp  pairs from [x9, #off] and [x10, #off]
//   fadd the matching vectors as two doubles each (.2d)
//   stp  the sums to [x11, #off]
// x9, x10 and x11 advance by 512 per iteration. x12 is incremented by 8 with
// "adds", and the loop repeats until that result is zero (b.ne). x8 is also
// advanced by 64 but is not read anywhere inside the marked region.
// NOTE(review): presumably the "Vector add" example; the file name is not
// visible in this excerpt.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
// Offsets -256 and -224.
ldp q0, q1, [x9, #-256]
ldp q4, q5, [x9, #-224]
ldp q2, q3, [x10, #-256]
ldp q6, q7, [x10, #-224]
fadd v2.2d, v2.2d, v0.2d
fadd v3.2d, v3.2d, v1.2d
stp q2, q3, [x11, #-256]
fadd v0.2d, v6.2d, v4.2d
fadd v1.2d, v7.2d, v5.2d
stp q0, q1, [x11, #-224]
// Offsets -192 and -160.
ldp q4, q5, [x9, #-192]
ldp q16, q17, [x9, #-160]
ldp q6, q7, [x10, #-192]
ldp q18, q19, [x10, #-160]
fadd v6.2d, v6.2d, v4.2d
fadd v7.2d, v7.2d, v5.2d
stp q6, q7, [x11, #-192]
fadd v4.2d, v18.2d, v16.2d
fadd v5.2d, v19.2d, v17.2d
stp q4, q5, [x11, #-160]
// Offsets -128 and -96.
ldp q16, q17, [x9, #-128]
ldp q19, q20, [x9, #-96]
ldp q18, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
fadd v16.2d, v18.2d, v16.2d
fadd v18.2d, v21.2d, v17.2d
stp q16, q18, [x11, #-128]
fadd v17.2d, v22.2d, v19.2d
fadd v19.2d, v23.2d, v20.2d
stp q17, q19, [x11, #-96]
// From here on the loads for the next offset are issued before the previous
// offset's sums are stored.
ldp q20, q21, [x9, #-64]
ldp q24, q25, [x10, #-64]
ldp q22, q23, [x9, #-32]
ldp q26, q27, [x10, #-32]
fadd v20.2d, v24.2d, v20.2d
fadd v21.2d, v25.2d, v21.2d
stp q20, q21, [x11, #-64]
ldp q24, q25, [x9]
ldp q28, q29, [x10]
fadd v22.2d, v26.2d, v22.2d
fadd v23.2d, v27.2d, v23.2d
stp q22, q23, [x11, #-32]
ldp q26, q27, [x9, #32]
ldp q30, q31, [x10, #32]
fadd v24.2d, v28.2d, v24.2d
fadd v25.2d, v29.2d, v25.2d
stp q24, q25, [x11]
ldp q28, q29, [x9, #64]
ldp q8, q10, [x10, #64]
fadd v26.2d, v30.2d, v26.2d
fadd v27.2d, v31.2d, v27.2d
stp q26, q27, [x11, #32]
ldp q30, q31, [x9, #96]
ldp q11, q12, [x10, #96]
fadd v28.2d, v8.2d, v28.2d
fadd v29.2d, v10.2d, v29.2d
stp q28, q29, [x11, #64]
ldp q8, q10, [x9, #128]
ldp q13, q14, [x10, #128]
// The #192 pair is loaded ahead of the #160 pair.
ldp q3, q0, [x9, #192]
ldp q1, q6, [x10, #192]
fadd v30.2d, v11.2d, v30.2d
fadd v31.2d, v12.2d, v31.2d
stp q30, q31, [x11, #96]
ldp q11, q12, [x9, #160]
fadd v8.2d, v13.2d, v8.2d
fadd v10.2d, v14.2d, v10.2d
stp q8, q10, [x11, #128]
ldp q13, q14, [x10, #160]
fadd v1.2d, v1.2d, v3.2d
ldp q3, q4, [x9, #224]
fadd v0.2d, v6.2d, v0.2d
stp q1, q0, [x11, #192]
ldp q5, q6, [x10, #224]
fadd v11.2d, v13.2d, v11.2d
fadd v2.2d, v14.2d, v12.2d
stp q11, q2, [x11, #160]
fadd v3.2d, v5.2d, v3.2d
fadd v4.2d, v6.2d, v4.2d
stp q3, q4, [x11, #224]
// Pointer and counter updates; "adds" sets the flags tested by b.ne.
add x8, x8, #64 // =64
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x12, x12, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,45 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// x10 is a byte offset shared by three base pointers. Per iteration, eight
// 128-bit vectors are loaded from [x21, off] and [x20, off], added as two
// doubles each (.2d), and stored to [x19, off], for off = x10 + 0, 16, ..., 112.
// The seven non-zero offsets are materialized into x0, x7, x6, x5, x4, x11 and
// x2 before x10 is advanced by 128, so the stores issued after the increment
// still address the current iteration. The loop repeats until x10 equals x24.
// NOTE(review): presumably the "Vector add" example; the file name is not
// visible in this excerpt.
// OSACA-BEGIN
.L17:
add x0, x10, 16
ldr q29, [x21, x10]
ldr q30, [x20, x10]
add x7, x10, 32
ldr q31, [x21, x0]
ldr q2, [x20, x0]
add x6, x10, 48
add x5, x10, 64
ldr q5, [x21, x7]
ldr q1, [x20, x7]
add x4, x10, 80
add x11, x10, 96
ldr q4, [x21, x6]
ldr q0, [x20, x6]
add x2, x10, 112
fadd v7.2d, v29.2d, v30.2d
ldr q3, [x21, x5]
ldr q9, [x20, x5]
fadd v6.2d, v31.2d, v2.2d
ldr q19, [x21, x4]
ldr q18, [x20, x4]
fadd v20.2d, v5.2d, v1.2d
ldr q21, [x21, x11]
ldr q17, [x20, x11]
fadd v22.2d, v4.2d, v0.2d
ldr q23, [x21, x2]
ldr q16, [x20, x2]
fadd v24.2d, v3.2d, v9.2d
fadd v25.2d, v19.2d, v18.2d
fadd v26.2d, v21.2d, v17.2d
// This store uses x10 itself, so it is issued before the increment.
str q7, [x19, x10]
add x10, x10, 128
fadd v27.2d, v23.2d, v16.2d
// The remaining stores use the offsets computed before x10 was advanced.
str q6, [x19, x0]
str q20, [x19, x7]
str q22, [x19, x6]
str q24, [x19, x5]
str q25, [x19, x4]
str q26, [x19, x11]
str q27, [x19, x2]
cmp x24, x10
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# Marked loop kernel, AT&T syntax, delimited by the OSACA comment markers.
# Per iteration of .L19, eight 128-bit vectors are processed:
#   load   disp(%r13,%rax)            into an xmm register
#   add    disp(%r12,%rax)            to it as packed doubles (vaddpd)
#   store  the sum to disp(%rbp,%rax)
# %rax is a byte offset shared by all three streams; it advances by 128 per
# iteration (written as subtracting -128) until it equals %rbx.
# NOTE(review): presumably the "Vector add" example; the file name is not
# visible in this excerpt.
# OSACA-BEGIN
.L19:
vmovups 0(%r13,%rax), %xmm0
vmovups 16(%r13,%rax), %xmm3
vmovups 32(%r13,%rax), %xmm4
vmovups 48(%r13,%rax), %xmm6
vmovups 64(%r13,%rax), %xmm9
vmovups 80(%r13,%rax), %xmm11
vmovups 96(%r13,%rax), %xmm13
vmovups 112(%r13,%rax), %xmm15
vaddpd (%r12,%rax), %xmm0, %xmm7
vaddpd 16(%r12,%rax), %xmm3, %xmm2
vaddpd 32(%r12,%rax), %xmm4, %xmm5
vaddpd 48(%r12,%rax), %xmm6, %xmm8
vaddpd 64(%r12,%rax), %xmm9, %xmm10
vaddpd 80(%r12,%rax), %xmm11, %xmm12
vaddpd 96(%r12,%rax), %xmm13, %xmm14
vaddpd 112(%r12,%rax), %xmm15, %xmm1
vmovups %xmm7, 0(%rbp,%rax)
vmovups %xmm2, 16(%rbp,%rax)
vmovups %xmm5, 32(%rbp,%rax)
vmovups %xmm8, 48(%rbp,%rax)
vmovups %xmm10, 64(%rbp,%rax)
vmovups %xmm12, 80(%rbp,%rax)
vmovups %xmm14, 96(%rbp,%rax)
vmovups %xmm1, 112(%rbp,%rax)
# Subtracting -128 adds 128 to the byte offset.
subq $-128, %rax
cmpq %rbx, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,28 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of .L19, eight 256-bit vectors are loaded from
# disp(%r12,%rcx) and stored unchanged to disp(%r13,%rcx).
# %rcx is the shared byte offset; it advances by 256 until it equals %r10.
# NOTE(review): presumably the "Copy" example (a[i] = b[i]); the file name is
# not visible in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
# Loads from the source stream.
vmovupd (%r12,%rcx), %ymm10
vmovupd 32(%r12,%rcx), %ymm11
vmovupd 64(%r12,%rcx), %ymm12
vmovupd 96(%r12,%rcx), %ymm13
vmovupd 128(%r12,%rcx), %ymm14
vmovupd 160(%r12,%rcx), %ymm15
vmovupd 192(%r12,%rcx), %ymm0
vmovupd 224(%r12,%rcx), %ymm1
# Stores to the destination stream.
vmovupd %ymm10, 0(%r13,%rcx)
vmovupd %ymm11, 32(%r13,%rcx)
vmovupd %ymm12, 64(%r13,%rcx)
vmovupd %ymm13, 96(%r13,%rcx)
vmovupd %ymm14, 128(%r13,%rcx)
vmovupd %ymm15, 160(%r13,%rcx)
vmovupd %ymm0, 192(%r13,%rcx)
vmovupd %ymm1, 224(%r13,%rcx)
addq $256, %rcx
cmpq %rcx, %r10
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,15 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of ..B1.39, one 512-bit vector is loaded from (%r14,%rax,8)
# and stored unchanged to (%r13,%rax,8). %rax is an index scaled by 8; it
# advances by 8 and the loop continues while %rax is below %r12 (unsigned, jb).
# NOTE(review): presumably the "Copy" example (a[i] = b[i]); the file name is
# not visible in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.39: # Preds ..B1.39 ..B1.38
# Execution count [2.22e+03]
vmovups (%r14,%rax,8), %zmm1 #79.5
vmovupd %zmm1, (%r13,%rax,8) #79.5
addq $8, %rax #79.5
cmpq %r12, %rax #79.5
jb ..B1.39 # Prob 82% #79.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,42 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// Each iteration copies 512 bytes: pairs of 128-bit q registers are loaded
// with ldp from [x9, #off] and stored with stp to [x10, #off], for offsets
// -256 .. +224 in 32-byte steps. Only q0-q3 are used; they are reloaded for
// every 64-byte group. x9 and x10 advance by 512 per iteration.
// x11 is incremented by 8 with "adds" and the loop repeats until that result
// is zero (b.ne). x8 is advanced by 64 but not read inside the marked region.
// NOTE(review): presumably the "Copy" example (a[i] = b[i]); the file name is
// not visible in this excerpt.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x9, #-256]
ldp q2, q3, [x9, #-224]
stp q0, q1, [x10, #-256]
stp q2, q3, [x10, #-224]
add x8, x8, #64 // =64
// The groups alternate between the negative and the non-negative half of the offset range.
ldp q0, q1, [x9]
ldp q2, q3, [x9, #32]
stp q0, q1, [x10]
stp q2, q3, [x10, #32]
ldp q0, q1, [x9, #-192]
ldp q2, q3, [x9, #-160]
stp q0, q1, [x10, #-192]
stp q2, q3, [x10, #-160]
ldp q0, q1, [x9, #64]
ldp q2, q3, [x9, #96]
stp q0, q1, [x10, #64]
stp q2, q3, [x10, #96]
ldp q0, q1, [x9, #-128]
ldp q2, q3, [x9, #-96]
stp q0, q1, [x10, #-128]
stp q2, q3, [x10, #-96]
ldp q0, q1, [x9, #128]
ldp q2, q3, [x9, #160]
stp q0, q1, [x10, #128]
stp q2, q3, [x10, #160]
ldp q0, q1, [x9, #-64]
ldp q2, q3, [x9, #-32]
stp q0, q1, [x10, #-64]
stp q2, q3, [x10, #-32]
ldp q0, q1, [x9, #192]
ldp q2, q3, [x9, #224]
// x9 is advanced once its last load of the iteration has been issued.
add x9, x9, #512 // =512
stp q0, q1, [x10, #192]
stp q2, q3, [x10, #224]
add x10, x10, #512 // =512
adds x11, x11, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,29 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// x15 is a byte offset shared by two base pointers. Per iteration, eight
// 128-bit vectors are loaded from [x19, off] and stored unchanged to
// [x20, off], for off = x15 + 0, 16, ..., 112. The non-zero offsets are held
// in x16, x30, x17, x18, x1, x3 and x2, computed before x15 is advanced by
// 128. The loop repeats until x15 equals x23.
// NOTE(review): presumably the "Copy" example (a[i] = b[i]); the file name is
// not visible in this excerpt.
// OSACA-BEGIN
.L17:
add x16, x15, 16
ldr q9, [x19, x15]
add x30, x15, 32
add x17, x15, 48
ldr q16, [x19, x16]
ldr q18, [x19, x30]
add x18, x15, 64
add x1, x15, 80
ldr q17, [x19, x17]
ldr q19, [x19, x18]
add x3, x15, 96
add x2, x15, 112
ldr q20, [x19, x1]
ldr q21, [x19, x3]
// This store uses x15 itself, so it is issued before the increment.
str q9, [x20, x15]
ldr q22, [x19, x2]
add x15, x15, 128
str q16, [x20, x16]
str q18, [x20, x30]
str q17, [x20, x17]
str q19, [x20, x18]
str q20, [x20, x1]
str q21, [x20, x3]
str q22, [x20, x2]
cmp x23, x15
bne .L17
// OSACA-END

View File

@@ -0,0 +1,22 @@
# Marked loop kernel, AT&T syntax, delimited by the OSACA comment markers.
# Per iteration of .L19, eight 128-bit vectors are loaded from
# disp(%rbp,%r10) and stored unchanged to disp(%r12,%r10).
# %r10 is the shared byte offset; it advances by 128 per iteration (written
# as subtracting -128) until it equals %r15.
# NOTE(review): presumably the "Copy" example (a[i] = b[i]); the file name is
# not visible in this excerpt.
# OSACA-BEGIN
.L19:
vmovups 0(%rbp,%r10), %xmm9
vmovups 16(%rbp,%r10), %xmm10
vmovups 32(%rbp,%r10), %xmm11
vmovups 48(%rbp,%r10), %xmm12
vmovups 64(%rbp,%r10), %xmm13
vmovups 80(%rbp,%r10), %xmm14
vmovups 96(%rbp,%r10), %xmm15
vmovups 112(%rbp,%r10), %xmm0
vmovups %xmm9, (%r12,%r10)
vmovups %xmm10, 16(%r12,%r10)
vmovups %xmm11, 32(%r12,%r10)
vmovups %xmm12, 48(%r12,%r10)
vmovups %xmm13, 64(%r12,%r10)
vmovups %xmm14, 80(%r12,%r10)
vmovups %xmm15, 96(%r12,%r10)
vmovups %xmm0, 112(%r12,%r10)
# Subtracting -128 adds 128 to the byte offset.
subq $-128, %r10
cmpq %r10, %r15
jne .L19
# OSACA-END

View File

@@ -0,0 +1,36 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of .L19, eight 256-bit packed-double vectors are processed:
#   load         x = disp(%r13,%rsi)
#   vfmadd213pd  x = %ymm6 * x + disp(%r12,%rsi)
#   store        x to disp(%r12,%rsi)
# so the (%r12)-based stream is updated in place. %ymm6 is only read inside
# the marked region. %rsi is the shared byte offset; it advances by 256 until
# it equals %r10.
# NOTE(review): presumably the "DAXPY" example; the file name is not visible
# in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
# Loads from the (%r13)-based stream.
vmovupd 0(%r13,%rsi), %ymm14
vmovupd 32(%r13,%rsi), %ymm15
vmovupd 64(%r13,%rsi), %ymm1
vmovupd 96(%r13,%rsi), %ymm0
vmovupd 128(%r13,%rsi), %ymm3
vmovupd 160(%r13,%rsi), %ymm4
vmovupd 192(%r13,%rsi), %ymm5
vmovupd 224(%r13,%rsi), %ymm7
# Form 213: destination = second operand * destination + memory operand.
vfmadd213pd (%r12,%rsi), %ymm6, %ymm14
vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15
vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1
vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0
vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3
vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4
vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5
vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7
# Results are written back to the addresses the FMA read from.
vmovupd %ymm14, (%r12,%rsi)
vmovupd %ymm15, 32(%r12,%rsi)
vmovupd %ymm1, 64(%r12,%rsi)
vmovupd %ymm0, 96(%r12,%rsi)
vmovupd %ymm3, 128(%r12,%rsi)
vmovupd %ymm4, 160(%r12,%rsi)
vmovupd %ymm5, 192(%r12,%rsi)
vmovupd %ymm7, 224(%r12,%rsi)
addq $256, %rsi
cmpq %rsi, %r10
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,16 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of ..B1.39, one 512-bit packed-double vector is processed:
#   load         x = (%r13,%rax,8)
#   vfmadd213pd  x = %zmm2 * x + (%r14,%rax,8)
#   store        x to (%r14,%rax,8)
# %rax is an index scaled by 8; it advances by 8 and the loop continues while
# %rax is below %rbx (unsigned, jb).
# NOTE(review): the start marker writes %ebx, and a 32-bit write also clears
# the upper half of %rbx, while this loop uses %rbx as its bound. If this
# listing were assembled and executed as-is, the marker would overwrite the
# bound. Confirm the file is only used as analyzer input.
# NOTE(review): presumably the "DAXPY" example; the file name is not visible
# in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.39: # Preds ..B1.39 ..B1.38
# Execution count [2.22e+03]
vmovups (%r13,%rax,8), %zmm1 #77.5
vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5
vmovupd %zmm1, (%r14,%rax,8) #77.5
addq $8, %rax #77.5
cmpq %rbx, %rax #77.5
jb ..B1.39 # Prob 82% #77.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,90 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// Each iteration covers byte offsets -256 .. +255 around x9 and x10 (512
// bytes per stream) using 128-bit q registers:
//   ldp   accumulators from [x9, #off] and multiplicands from [x10, #off]
//   fmla  acc.2d += mult.2d * v31.2d   (v31 is only read in this region)
//   stp   the accumulators back to [x9, #off]
// so the x9-based stream is updated in place. x9 and x10 advance by 512 per
// iteration. x11 is incremented by 8 with "adds" and the loop repeats until
// that result is zero (b.ne). x8 is advanced by 64 but not read inside the
// marked region.
// NOTE(review): presumably the "DAXPY" example; the file name is not visible
// in this excerpt.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
// Offsets -256 and -224.
ldp q1, q2, [x9, #-256]
ldp q3, q0, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
fmla v1.2d, v4.2d, v31.2d
fmla v2.2d, v5.2d, v31.2d
stp q1, q2, [x9, #-256]
fmla v3.2d, v6.2d, v31.2d
fmla v0.2d, v7.2d, v31.2d
stp q3, q0, [x9, #-224]
// Offsets -192 and -160.
ldp q5, q6, [x9, #-192]
ldp q7, q4, [x9, #-160]
ldp q16, q17, [x10, #-192]
ldp q18, q19, [x10, #-160]
fmla v5.2d, v16.2d, v31.2d
fmla v6.2d, v17.2d, v31.2d
stp q5, q6, [x9, #-192]
fmla v7.2d, v18.2d, v31.2d
fmla v4.2d, v19.2d, v31.2d
stp q7, q4, [x9, #-160]
// Offsets -128 and -96, interleaved with the loads for offset -64.
ldp q19, q18, [x9, #-128]
ldp q16, q17, [x9, #-96]
ldp q20, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
fmla v18.2d, v21.2d, v31.2d
fmla v16.2d, v22.2d, v31.2d
ldp q21, q22, [x9, #-64]
ldp q24, q25, [x10, #-64]
fmla v19.2d, v20.2d, v31.2d
stp q19, q18, [x9, #-128]
fmla v17.2d, v23.2d, v31.2d
stp q16, q17, [x9, #-96]
ldp q23, q20, [x9, #-32]
ldp q26, q27, [x10, #-32]
fmla v21.2d, v24.2d, v31.2d
fmla v22.2d, v25.2d, v31.2d
stp q21, q22, [x9, #-64]
ldp q24, q25, [x9]
ldp q28, q29, [x10]
fmla v23.2d, v26.2d, v31.2d
fmla v20.2d, v27.2d, v31.2d
stp q23, q20, [x9, #-32]
ldp q26, q27, [x9, #32]
fmla v24.2d, v28.2d, v31.2d
fmla v25.2d, v29.2d, v31.2d
stp q24, q25, [x9]
// q28/q29 are reloaded here; their previous values were consumed by the two fmla above.
ldp q28, q29, [x10, #32]
fmla v26.2d, v28.2d, v31.2d
fmla v27.2d, v29.2d, v31.2d
stp q26, q27, [x9, #32]
ldp q24, q25, [x9, #64]
ldp q28, q29, [x10, #64]
ldp q26, q27, [x9, #96]
fmla v24.2d, v28.2d, v31.2d
fmla v25.2d, v29.2d, v31.2d
stp q24, q25, [x9, #64]
ldp q28, q29, [x10, #96]
fmla v26.2d, v28.2d, v31.2d
fmla v27.2d, v29.2d, v31.2d
stp q26, q27, [x9, #96]
ldp q24, q25, [x9, #128]
ldp q26, q27, [x10, #128]
fmla v24.2d, v26.2d, v31.2d
fmla v25.2d, v27.2d, v31.2d
stp q24, q25, [x9, #128]
ldp q26, q27, [x9, #160]
ldp q1, q2, [x10, #160]
fmla v26.2d, v1.2d, v31.2d
fmla v27.2d, v2.2d, v31.2d
stp q26, q27, [x9, #160]
ldp q0, q1, [x9, #192]
ldp q2, q3, [x10, #192]
fmla v0.2d, v2.2d, v31.2d
fmla v1.2d, v3.2d, v31.2d
stp q0, q1, [x9, #192]
ldp q2, q3, [x9, #224]
ldp q4, q5, [x10, #224]
fmla v2.2d, v4.2d, v31.2d
fmla v3.2d, v5.2d, v31.2d
stp q2, q3, [x9, #224]
// Pointer and counter updates; "adds" sets the flags tested by b.ne.
add x8, x8, #64 // =64
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x11, x11, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,41 @@
// Marked loop kernel (AArch64), delimited by the OSACA comment markers.
// Per iteration, eight 128-bit vectors (128 bytes) of the x10-based stream
// are updated in place:
//   acc  = [x10-based address]
//   acc.2d += v3.2d * mult.2d, with mult loaded from the x3-based stream
//   acc is stored back to the address it was loaded from
// v3 is only read inside the marked region. x10 and x3 are advanced by 128
// early in the body, so most later accesses use negative displacements from
// the new values. x5 and x6 are temporary copies of the old x3 and x10 that
// are stepped by 16 through post-indexed accesses. The loop repeats until
// x10 equals x23.
// NOTE(review): presumably the "DAXPY" example; the file name is not visible
// in this excerpt.
// OSACA-BEGIN
.L17:
mov x5, x3
ldr q23, [x10]
// Post-indexed load: reads [x5], then x5 += 16.
ldr q24, [x5], 16
mov x6, x10
ldr q25, [x3, 16]
ldr q26, [x3, 48]
add x10, x10, 128
add x3, x3, 128
// From here on x3 and x10 already point at the next iteration's data.
ldr q27, [x3, -64]
ldr q28, [x3, -48]
ldr q29, [x3, -32]
ldr q30, [x3, -16]
fmla v23.2d, v3.2d, v24.2d
// x5 was advanced by 16 above, so this reads old x3 + 32.
ldr q31, [x5, 16]
// Post-indexed store: writes [x6] (old x10), then x6 += 16.
str q23, [x6], 16
ldr q0, [x10, -112]
fmla v0.2d, v3.2d, v25.2d
str q0, [x10, -112]
// x6 was advanced by 16 above, so these access old x10 + 32.
ldr q2, [x6, 16]
fmla v2.2d, v3.2d, v31.2d
str q2, [x6, 16]
ldr q5, [x10, -80]
ldr q4, [x10, -64]
ldr q6, [x10, -48]
ldr q1, [x10, -32]
ldr q7, [x10, -16]
fmla v5.2d, v3.2d, v26.2d
fmla v4.2d, v3.2d, v27.2d
fmla v6.2d, v3.2d, v28.2d
fmla v1.2d, v3.2d, v29.2d
fmla v7.2d, v3.2d, v30.2d
str q5, [x10, -80]
str q4, [x10, -64]
str q6, [x10, -48]
str q1, [x10, -32]
str q7, [x10, -16]
cmp x23, x10
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# Marked loop kernel, AT&T syntax, delimited by the OSACA comment markers.
# Per iteration of .L19, eight 128-bit packed-double vectors are processed:
#   load         x = disp(%r12,%rax)
#   vfmadd213pd  x = %xmm3 * x + disp(%rbp,%rax)
#   store        x to disp(%rbp,%rax)
# so the (%rbp)-based stream is updated in place. %xmm3 is only read inside
# the marked region. %rax is the shared byte offset; it advances by 128 per
# iteration (written as subtracting -128) until it equals %r15.
# NOTE(review): presumably the "DAXPY" example; the file name is not visible
# in this excerpt.
# OSACA-BEGIN
.L19:
vmovups (%r12,%rax), %xmm12
vmovups 16(%r12,%rax), %xmm13
vmovups 32(%r12,%rax), %xmm14
vmovups 48(%r12,%rax), %xmm15
vmovups 64(%r12,%rax), %xmm1
vmovups 80(%r12,%rax), %xmm0
vmovups 96(%r12,%rax), %xmm4
vmovups 112(%r12,%rax), %xmm5
# Form 213: destination = second operand * destination + memory operand.
vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12
vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13
vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14
vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15
vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1
vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0
vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4
vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5
vmovups %xmm12, 0(%rbp,%rax)
vmovups %xmm13, 16(%rbp,%rax)
vmovups %xmm14, 32(%rbp,%rax)
vmovups %xmm15, 48(%rbp,%rax)
vmovups %xmm1, 64(%rbp,%rax)
vmovups %xmm0, 80(%rbp,%rax)
vmovups %xmm4, 96(%rbp,%rax)
vmovups %xmm5, 112(%rbp,%rax)
# Subtracting -128 adds 128 to the byte offset.
subq $-128, %rax
cmpq %r15, %rax
jne .L19
# OSACA-END

1144
examples/gs/gs.s.csx.gcc.s Normal file

File diff suppressed because it is too large Load Diff

1123
examples/gs/gs.s.csx.icc.s Normal file

File diff suppressed because it is too large Load Diff

1194
examples/gs/gs.s.tx2.clang.s Normal file

File diff suppressed because it is too large Load Diff

737
examples/gs/gs.s.tx2.gcc.s Normal file
View File

@@ -0,0 +1,737 @@
// Compiler-generated AArch64 assembly for gs.f90 (see the .file directive).
// MAIN__ reads two 4-byte integers through libgfortran, allocates one buffer
// with malloc, initializes it, and then repeatedly times sweeps of the loop
// that is delimited by the OSACA comment markers further down.
.arch armv8.1-a+crypto+crc
.file "gs.f90"
.text
.align 2
.p2align 4,,15
.type MAIN__, %function
MAIN__:
.LFB0:
.cfi_startproc
// Reserve a 720-byte frame; callee-saved registers are spilled into it below.
sub sp, sp, #720
.cfi_def_cfa_offset 720
// Build the parameter block at sp+192: x0 = 128 | (5 << 32), the address of
// .LC0 ("gs.f90"), and w1 = 12 at sp+208.
// NOTE(review): presumably flags/unit, file name and source line as expected
// by libgfortran. Confirm against the libgfortran I/O ABI.
mov x0, 128
mov w1, 12
stp x29, x30, [sp]
.cfi_offset 29, -720
.cfi_offset 30, -712
mov x29, sp
movk x0, 0x5, lsl 32
stp x19, x20, [sp, 16]
.cfi_offset 19, -704
.cfi_offset 20, -696
adrp x19, .LC0
add x19, x19, :lo12:.LC0
stp x21, x22, [sp, 32]
stp x0, x19, [sp, 192]
add x0, sp, 192
stp x23, x24, [sp, 48]
stp x25, x26, [sp, 64]
stp x27, x28, [sp, 80]
str w1, [sp, 208]
.cfi_offset 21, -688
.cfi_offset 22, -680
.cfi_offset 23, -672
.cfi_offset 24, -664
.cfi_offset 25, -656
.cfi_offset 26, -648
.cfi_offset 27, -640
.cfi_offset 28, -632
// Read two integers of 4 bytes each (w2 = 4) into sp+144 and sp+148.
bl _gfortran_st_read
mov w2, 4
add x1, sp, 144
add x0, sp, 192
bl _gfortran_transfer_integer
mov w2, 4
add x1, sp, 148
add x0, sp, 192
bl _gfortran_transfer_integer
add x0, sp, 192
bl _gfortran_st_read_done
// w24 = first value read, w23 = second value read.
ldp w24, w23, [sp, 144]
mov x3, -1
mov x5, 4611686018427387904
mov x2, 2305843009213693951
sxtw x25, w24
sxtw x20, w23
// x21 = max(first, -1) + 1 and x6 = max(second, -1) + 1.
cmp x25, 0
csel x21, x25, x3, ge
cmp x20, 0
csel x4, x20, x3, ge
add x21, x21, 1
add x6, x4, 1
// x26 = x21 * x6, x27 = 2 * x26, x7 = 16 * x26 (the byte count passed to malloc).
mul x26, x6, x21
cmp x26, x5
lsl x27, x26, 1
lsl x7, x26, 4
// w9 becomes non-zero when x26 equals 2^62 or x27 exceeds 2^61 - 1; that
// case is reported through the runtime-error path at .L159.
cset w8, eq
cmp x27, x2
cinc w9, w8, gt
// The requested size is x7 only if both inputs are non-negative, else 0.
cmp x25, 0
ccmp x20, 0, 1, ge
csel x10, x7, xzr, ge
cbnz w9, .L159
// malloc is called with at least 1 byte.
cmp x10, 0
mov x28, 1
csel x0, x10, x28, ne
bl malloc
stp d8, d9, [sp, 96]
.cfi_offset 73, -616
.cfi_offset 72, -624
// A null pointer from malloc is reported through .L160.
cbz x0, .L160
// Inputs <= 1 take the out-of-line paths .L5 / .L6, which skip the zero-fill loop.
cmp w23, 1
ble .L5
cmp w24, 1
ble .L6
// Set up the zero-fill loop at .L10:
//   x4  = x27 - x26, the element offset added to every second store
//   x22 = x21 * 8, the row stride in bytes
//   x5  = x0 + (x21 + (w24 - 2)) * 8 + 16, the end pointer for the first pass
//   x17 = ~(w24 - 2) * 8, a negative byte distance from x5 back to the start
sub w12, w24, #2
sub x4, x27, x26
lsl x22, x21, 3
mov w8, w28
add x13, x21, x12
mvn x14, x12
add x10, x4, x21
mov x6, x12
add x15, x0, x13, lsl 3
lsl x17, x14, 3
mov x9, x21
add x5, x15, 16
// Zero-fill loop. Each outer pass (.L10 .. .L155) walks x1 from x5 + x17 up
// to x5 in 8-byte steps and stores zero to [x1] and to [x1 + x18 * 8].
// x18 = x10 - x9; both registers advance by x21 per pass, so the distance
// between the two stores is the same in every pass.
// The inner walk is unrolled by eight (.L7); the element count modulo 8 is
// handled first by jumping into the ladder .L109 .. .L104.
// The outer loop counts w8 up to w23 and moves x5 forward by x22 per pass.
.L10:
add x1, x17, x5
sub x18, x10, x9
// x2 = number of 8-byte elements between x1 and x5; x7 = that count modulo 8.
sub x16, x5, x1
sub x30, x16, #8
lsr x3, x30, 3
add x2, x3, 1
ands x7, x2, 7
beq .L7
cmp x7, 1
beq .L104
cmp x7, 2
beq .L105
cmp x7, 3
beq .L106
cmp x7, 4
beq .L107
cmp x7, 5
beq .L108
cmp x7, 6
beq .L109
// Remainder 7 falls through the whole ladder.
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L109:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L108:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L107:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L106:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L105:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
.L104:
str xzr, [x1]
str xzr, [x1, x18, lsl 3]
add x1, x1, 8
cmp x1, x5
beq .L155
// Main body: eight element pairs per trip.
.L7:
str xzr, [x1]
add x28, x1, 8
add x16, x1, 16
add x15, x1, 24
str xzr, [x1, x18, lsl 3]
add x14, x1, 32
add x13, x1, 40
add x12, x1, 48
str xzr, [x1, 8]
add x11, x1, 56
add x1, x1, 64
str xzr, [x28, x18, lsl 3]
str xzr, [x1, -48]
str xzr, [x16, x18, lsl 3]
str xzr, [x1, -40]
str xzr, [x15, x18, lsl 3]
str xzr, [x1, -32]
str xzr, [x14, x18, lsl 3]
str xzr, [x1, -24]
str xzr, [x13, x18, lsl 3]
str xzr, [x1, -16]
str xzr, [x12, x18, lsl 3]
str xzr, [x1, -8]
str xzr, [x11, x18, lsl 3]
cmp x1, x5
bne .L7
// Next pass: bump the counter and move all per-pass values forward by one stride.
.L155:
add w8, w8, 1
add x10, x10, x21
add x9, x9, x21
add x5, x5, x22
cmp w23, w8
bne .L10
// Initialization with the constants 1.0 (d0) and 0.0 (xzr).
// With x20 = x21 * x20 and x7 = x0 + (x4 + x20) * 8, every step stores
//   1.0 to [x1 + x17 * 8]   where x17 = x26 - x27
//   1.0 to [x1]
//   0.0 to [x0 + x2 * 8]
//   0.0 to [x1 + x3]        where x3 = -(x20 * 8)
// and then advances x1 by 8 and the index x2 by 1. Index 0 is written
// before the loop. The walk is unrolled by eight (.L13), with w24 modulo 8
// handled first through the ladder .L124 .. .L119. It continues while
// w24 >= w2.
// NOTE(review): this looks like filling the first and last rows of both
// halves of the buffer. Confirm against gs.f90.
.L9:
mul x20, x21, x20
fmov d0, 1.0e+0
sub x17, x26, x27
and w18, w24, 7
mov x2, 1
add x30, x4, x20
neg x3, x20, lsl 3
add x7, x0, x30, lsl 3
// Stores for index 0.
str d0, [x7, x17, lsl 3]
add x1, x7, 8
str d0, [x7]
str xzr, [x0]
str xzr, [x7, x3]
cmp w24, 1
blt .L151
// Remainder dispatch on w24 modulo 8.
cbz w18, .L13
cmp w18, 1
beq .L119
cmp w18, 2
beq .L120
cmp w18, 3
beq .L121
cmp w18, 4
beq .L122
cmp w18, 5
beq .L123
cmp w18, 6
beq .L124
// Remainder 7: handle index 1 here, then fall through the ladder.
str d0, [x1, x17, lsl 3]
mov x2, 2
str d0, [x1]
str xzr, [x0, 8]
str xzr, [x1, x3]
add x1, x1, 8
.L124:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
.L123:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
.L122:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
.L121:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
.L120:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
.L119:
str d0, [x1, x17, lsl 3]
str d0, [x1]
str xzr, [x0, x2, lsl 3]
add x2, x2, 1
str xzr, [x1, x3]
add x1, x1, 8
cmp w24, w2
blt .L151
// Main body: eight indices per trip.
.L13:
str d0, [x1, x17, lsl 3]
add x28, x1, 8
add x15, x2, 1
add x16, x1, 16
str d0, [x1]
add x13, x2, 2
add x14, x1, 24
add x12, x2, 3
str xzr, [x0, x2, lsl 3]
add x9, x1, 32
add x4, x2, 4
add x8, x1, 40
str xzr, [x1, x3]
add x11, x2, 5
add x5, x1, 48
add x10, x2, 6
str d0, [x28, x17, lsl 3]
add x20, x1, 56
add x18, x2, 7
add x2, x2, 8
str d0, [x1, 8]
add x1, x1, 64
str xzr, [x0, x15, lsl 3]
str xzr, [x28, x3]
str d0, [x16, x17, lsl 3]
str d0, [x1, -48]
str xzr, [x0, x13, lsl 3]
str xzr, [x16, x3]
str d0, [x14, x17, lsl 3]
str d0, [x1, -40]
str xzr, [x0, x12, lsl 3]
str xzr, [x14, x3]
str d0, [x9, x17, lsl 3]
str d0, [x1, -32]
str xzr, [x0, x4, lsl 3]
str xzr, [x9, x3]
str d0, [x8, x17, lsl 3]
str d0, [x1, -24]
str xzr, [x0, x11, lsl 3]
str xzr, [x8, x3]
str d0, [x5, x17, lsl 3]
str d0, [x1, -16]
str xzr, [x0, x10, lsl 3]
str xzr, [x5, x3]
str d0, [x20, x17, lsl 3]
str d0, [x1, -8]
str xzr, [x0, x18, lsl 3]
str xzr, [x20, x3]
cmp w24, w2
bge .L13
// w11 = max(w24, 0) + 1; it is converted to double at .L12.
.L151:
cmp w24, 0
csel w17, w24, wzr, ge
add w11, w17, 1
// Initialization with the value d3 = (double)w11 / (double)w24.
// Starting at x0 and stepping by the row stride x22, every step stores d3 to
//   [x2], [x2 + x30 * 8], [x2 + x26 * 8] and [x2 + x27 * 8]
// where x30 = x27 - x26 on entry. x26 and x27 are then rewritten so that
// x26 equals the entry value of x25 (the sign-extended first input) and
// x27 equals that value plus x30.
// The first step is done before the loop. The remaining steps count w3 from
// 1 while w23 >= w3, unrolled by eight (.L15), with w23 modulo 8 handled
// first through the ladder .L118 .. .L113.
// The whole section is skipped when bit 31 of w23 is set (w23 negative).
// NOTE(review): this looks like filling the first and last columns of both
// halves of the buffer. Confirm against gs.f90.
.L8:
tbnz w23, #31, .L11
.L12:
scvtf d2, w11
scvtf d1, w24
sub x30, x27, x26
sub x25, x25, x26
add x26, x25, x26
add x27, x25, x27
mov w3, 1
and w7, w23, 7
add x2, x0, x22
fdiv d3, d2, d1
// Stores for the first step (address x0).
str d3, [x0]
str d3, [x0, x30, lsl 3]
str d3, [x0, x26, lsl 3]
str d3, [x0, x27, lsl 3]
cmp w23, w3
blt .L11
// Remainder dispatch on w23 modulo 8.
cbz w7, .L15
cmp w7, 1
beq .L113
cmp w7, 2
beq .L114
cmp w7, 3
beq .L115
cmp w7, 4
beq .L116
cmp w7, 5
beq .L117
cmp w7, 6
beq .L118
// Remainder 7 falls through the whole ladder.
str d3, [x2]
mov w3, 2
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L118:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L117:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L116:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L115:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L114:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
.L113:
str d3, [x2]
add w3, w3, 1
str d3, [x2, x30, lsl 3]
str d3, [x2, x26, lsl 3]
str d3, [x2, x27, lsl 3]
add x2, x2, x22
cmp w23, w3
blt .L11
// Main body: eight strides per trip. x1, x28, x15, x14, x16, x13 and x12
// hold the seven addresses that follow x2.
.L15:
str d3, [x2]
add x1, x2, x22
add w3, w3, 8
str d3, [x2, x30, lsl 3]
add x28, x1, x22
str d3, [x2, x26, lsl 3]
add x15, x28, x22
str d3, [x2, x27, lsl 3]
add x14, x15, x22
str d3, [x1]
add x16, x14, x22
str d3, [x1, x30, lsl 3]
add x13, x16, x22
str d3, [x1, x26, lsl 3]
add x12, x13, x22
str d3, [x1, x27, lsl 3]
add x2, x12, x22
str d3, [x28]
str d3, [x28, x30, lsl 3]
str d3, [x28, x26, lsl 3]
str d3, [x28, x27, lsl 3]
str d3, [x15]
str d3, [x15, x30, lsl 3]
str d3, [x15, x26, lsl 3]
str d3, [x15, x27, lsl 3]
str d3, [x14]
str d3, [x14, x30, lsl 3]
str d3, [x14, x26, lsl 3]
str d3, [x14, x27, lsl 3]
str d3, [x16]
str d3, [x16, x30, lsl 3]
str d3, [x16, x26, lsl 3]
str d3, [x16, x27, lsl 3]
str d3, [x13]
str d3, [x13, x30, lsl 3]
str d3, [x13, x26, lsl 3]
str d3, [x13, x27, lsl 3]
str d3, [x12]
str d3, [x12, x30, lsl 3]
str d3, [x12, x26, lsl 3]
str d3, [x12, x27, lsl 3]
cmp w23, w3
bge .L15
// Set-up for the timed sweeps.
//   d9  = 0.25, the factor applied to every four-term sum
//   d8  = the double stored at .LC6, the threshold for the elapsed time
//   x27 = x0 + x22 + 8, the first element updated in the first row pass
//   x25 = x0 + (x21 + zero-extended w6) * 8 + 16, the end pointer for that pass
//   x28 = 2 * x21
//   w20 = 51711 | (0x3b9a << 16) = 999999999, the upper limit for the sweep count
//   w26 = 10; it is doubled at .L14 before each timed run
.L11:
add x6, x21, x6, uxtw
adrp x4, .LC6
add x9, x22, 8
fmov d9, 2.5e-1
ldr d8, [x4, #:lo12:.LC6]
add x27, x0, x9
mov w20, 51711
add x0, x0, x6, lsl 3
lsl x28, x21, 1
mov w26, 10
movk w20, 0x3b9a, lsl 16
add x25, x0, 16
// One timed run: call timing_ with sp+176 and sp+160, then perform w26 sweeps.
.L14:
add x0, sp, 176
add x1, sp, 160
lsl w26, w26, 1
bl timing_
mov w0, 0
.p2align 4
// One sweep (w0 is the sweep counter). The sweep body is skipped when
// either input is <= 1.
.L18:
cmp w23, 1
ble .L21
cmp w24, 1
ble .L21
mov x11, 0
mov w10, 1
mov x7, x25
mov x9, x28
mov x8, x21
mov x6, x27
.p2align 4
// One row pass. x6 = first element to update, x7 = end pointer.
//   x18 = x11 - x8 and x30 = x9 - x8 are element offsets from the current
//   element to its two neighbours in the adjacent rows.
//   d30 = [x6 - 8] is the value just before the first element; afterwards
//   d30 always carries the most recently computed result.
// The element count modulo 4 is peeled here (.L112 / .L111) before the
// four-way unrolled kernel at .L20.
.L22:
sub x5, x7, x6
add w10, w10, 1
mov x15, x6
sub x18, x11, x8
sub x17, x5, #8
sub x30, x9, x8
ldr d30, [x6, -8]
lsr x3, x17, 3
add x2, x3, 1
ands x1, x2, 3
beq .L20
cmp x1, 1
beq .L111
cmp x1, 2
beq .L112
// Peeled update, remainder 3:
// new = ([p + x18*8] + [p + 8] + d30 + [p + x30*8]) * 0.25
ldr d4, [x6, x18, lsl 3]
ldr d6, [x6, 8]
ldr d5, [x6, x30, lsl 3]
fadd d7, d4, d6
fadd d16, d7, d30
fadd d17, d16, d5
fmul d30, d17, d9
// Post-indexed store: writes [x15], then x15 += 8.
str d30, [x15], 8
.L112:
ldr d18, [x15, x18, lsl 3]
ldr d20, [x15, 8]
ldr d19, [x15, x30, lsl 3]
fadd d21, d18, d20
fadd d22, d21, d30
fadd d23, d22, d19
fmul d30, d23, d9
str d30, [x15], 8
.L111:
ldr d24, [x15, x18, lsl 3]
ldr d26, [x15, 8]
ldr d25, [x15, x30, lsl 3]
fadd d27, d24, d26
fadd d28, d27, d30
fadd d29, d28, d25
fmul d30, d29, d9
str d30, [x15], 8
// The row is finished if the peeled updates already reached the end pointer.
cmp x7, x15
beq .L154
// Marked inner kernel, unrolled four times. With p = x15 on entry, each of
// the four updates computes, for k = 0, 8, 16, 24:
//   new = ([p + k + x18*8] + [p + k + 8] + prev + [p + k + x30*8]) * d9
// and stores it to [p + k]. "prev" is the result of the preceding update,
// held in a register (d30 -> d5 -> d20 -> d27 -> d30), so the four updates
// form one serial dependency chain that also carries into the next trip.
// x15 advances by 32 per trip; the loop ends when it equals x7.
// OSACA-BEGIN
.L20:
// Update 0 (stored at p).
ldr d31, [x15, x18, lsl 3]
ldr d0, [x15, 8]
mov x14, x15
add x16, x15, 24
ldr d2, [x15, x30, lsl 3]
add x15, x15, 32
fadd d1, d31, d0
fadd d3, d1, d30
fadd d4, d3, d2
fmul d5, d4, d9
// Post-indexed store: writes [p], then x14 = p + 8.
str d5, [x14], 8
// Update 1 (stored at p + 8, addressed as x15 - 24).
ldr d6, [x14, x18, lsl 3]
ldr d16, [x14, 8]
add x13, x14, 8
ldr d7, [x14, x30, lsl 3]
fadd d17, d6, d16
fadd d18, d17, d5
fadd d19, d18, d7
fmul d20, d19, d9
str d20, [x15, -24]
// Update 2 (stored at p + 16, addressed as x14 + 8).
ldr d21, [x13, x18, lsl 3]
ldr d23, [x14, 16]
ldr d22, [x13, x30, lsl 3]
fadd d24, d21, d23
fadd d25, d24, d20
fadd d26, d25, d22
fmul d27, d26, d9
str d27, [x14, 8]
// Update 3 (stored at p + 24, addressed as x15 - 8). It reads [x15], the
// element following it, which has not been updated yet in this pass.
ldr d30, [x15]
ldr d28, [x16, x18, lsl 3]
ldr d29, [x16, x30, lsl 3]
fadd d31, d28, d30
fadd d2, d31, d27
fadd d0, d2, d29
fmul d30, d0, d9
str d30, [x15, -8]
cmp x7, x15
bne .L20
// OSACA-END
// End of a row pass: move the start/end pointers forward by the byte stride
// x22 and the element offsets by x21, then repeat until w10 equals w23.
.L154:
add x6, x6, x22
add x11, x11, x21
add x8, x8, x21
add x9, x9, x21
add x7, x7, x22
cmp w23, w10
bne .L22
// End of a sweep: run another one until w0 + 1 equals w26.
.L21:
add w4, w0, 1
cmp w26, w4
beq .L17
mov w0, w4
b .L18
// After w26 sweeps: store w0 + 2 to sp+124 and sp+140, then call timing_
// again with sp+168 and sp+152.
.L17:
add w12, w0, 2
add x1, sp, 152
add x0, sp, 168
str w12, [sp, 124]
str w12, [sp, 140]
bl timing_
// d4 = [sp+168] - [sp+176], the difference between the values written by
// the second and the first timing_ call.
ldp d3, d1, [sp, 168]
ldr w5, [sp, 124]
fsub d4, d3, d1
// Go back to .L14, which doubles w26, when d4 < d8 and w26 <= w20 (999999999).
fcmpe d4, d8
ccmp w26, w20, 0, lt
ble .L14
// If the saved count (w0 + 2) is greater than w26, the value at sp+140 is
// replaced with w26.
cmp w5, w26
ble .L23
str w26, [sp, 140]
// Report. The parameter block at sp+192 is rebuilt with x21 = 128 | (6 << 32),
// the .LC0 pointer in x19, and w22 = 72 at sp+208.
.L23:
mov x21, 128
add x0, sp, 192
mov w22, 72
movk x21, 0x6, lsl 32
str w22, [sp, 208]
// Both inputs are decremented by one for the rate computation below.
sub w24, w24, #1
sub w23, w23, #1
stp x21, x19, [sp, 192]
bl _gfortran_st_write
adrp x19, .LANCHOR0
adrp x27, .LC7
add x28, x19, :lo12:.LANCHOR0
// Write 14 characters starting at .LANCHOR0 ("# Iterations: ").
mov x2, 14
add x0, sp, 192
mov x1, x28
bl _gfortran_transfer_character_write
// Write the 4-byte integer stored at sp+140.
mov w2, 4
add x1, sp, 140
add x0, sp, 192
bl _gfortran_transfer_integer_write
// Write 14 characters starting at .LANCHOR0 + 16 (" Performance: ").
add x1, x28, 16
mov x2, 14
add x0, sp, 192
bl _gfortran_transfer_character_write
// Rate = (w24 * w23) * ([sp+140] * constant at .LC7) / ([sp+168] - [sp+176]),
// using the decremented w24 and w23. The result is stored at sp+184 and
// written as an 8-byte real.
ldr w25, [sp, 140]
scvtf d9, w24
scvtf d8, w23
ldr d5, [x27, #:lo12:.LC7]
ldp d18, d19, [sp, 168]
mov w2, 8
add x1, sp, 184
add x0, sp, 192
scvtf d7, w25
fsub d20, d18, d19
fmul d6, d9, d8
fmul d16, d7, d5
fmul d17, d6, d16
fdiv d21, d17, d20
str d21, [sp, 184]
bl _gfortran_transfer_real_write
// Write 6 characters starting at .LANCHOR0 + 32 (" MLUPs").
add x1, x28, 32
mov x2, 6
add x0, sp, 192
bl _gfortran_transfer_character_write
add x0, sp, 192
bl _gfortran_st_write_done
// NOTE(review): no instruction of this path follows the call below, so
// _gfortran_stop_string presumably does not return. Confirm.
mov w2, 0
mov x1, 0
mov x0, 0
bl _gfortran_stop_string
// Out-of-line paths of MAIN__.
// .L5: reached when w23 <= 1. If w24 is negative, continue at .L25;
// otherwise fall into .L157.
.L5:
tbnz w24, #31, .L25
// .L157: recompute x4, x22 and w6 as in the main path and join at .L9,
// skipping the zero-fill loop.
.L157:
sub x4, x27, x26
lsl x22, x21, 3
sub w6, w24, #2
b .L9
// .L6: reached when w24 <= 1. A non-negative w24 goes to .L157; otherwise
// w11 is set to 0 and execution joins at .L12.
.L6:
tbz w24, #31, .L157
mov w11, 0
lsl x22, x21, 3
sub w6, w24, #2
b .L12
// .L159: the size check failed; report the message at .LC1.
// NOTE(review): no instruction of this path follows the call, so
// _gfortran_runtime_error presumably does not return. Confirm.
.L159:
.cfi_restore 72
.cfi_restore 73
adrp x26, .LC1
stp d8, d9, [sp, 96]
.cfi_offset 73, -616
.cfi_offset 72, -624
add x0, x26, :lo12:.LC1
bl _gfortran_runtime_error
// .L25: w24 negative with w23 <= 1; w11 = 0, then join at .L8.
.L25:
mov w11, 0
lsl x22, x21, 3
sub w6, w24, #2
b .L8
// .L160: malloc returned a null pointer; report the message at .LC2.
// NOTE(review): presumably _gfortran_os_error does not return. Confirm.
.L160:
adrp x20, .LC2
add x0, x20, :lo12:.LC2
bl _gfortran_os_error
.cfi_endproc
.LFE0:
.size MAIN__, .-MAIN__
// Program entry point. It forwards its incoming argument registers to
// _gfortran_set_args, passes the 7-word options table located at
// .LANCHOR0 + 40 to _gfortran_set_options, and then calls MAIN__.
// NOTE(review): there is no epilogue or ret after the call to MAIN__, so
// control is presumably not expected to come back here. Confirm.
.section .text.startup,"ax",@progbits
.align 2
.p2align 4,,15
.global main
.type main, %function
main:
.LFB1:
.cfi_startproc
// Save the frame pointer and link register in a 16-byte frame.
stp x29, x30, [sp, -16]!
.cfi_def_cfa_offset 16
.cfi_offset 29, -16
.cfi_offset 30, -8
mov x29, sp
bl _gfortran_set_args
// w0 = 7 (number of option words), x1 = .LANCHOR0 + 40 (address of the table).
adrp x1, .LANCHOR0
mov w0, 7
add x2, x1, :lo12:.LANCHOR0
add x1, x2, 40
bl _gfortran_set_options
bl MAIN__
.cfi_endproc
.LFE1:
.size main, .-main
// Read-only data. .LANCHOR0 marks the start of this section; MAIN__ and main
// address the items below as .LANCHOR0 + 0, + 16, + 32 and + 40.
.section .rodata
.align 3
.set .LANCHOR0,. + 0
// Offset 0: 14 characters plus 2 bytes of padding.
.LC3:
.ascii "# Iterations: "
.zero 2
// Offset 16: 14 characters plus 2 bytes of padding.
.LC4:
.ascii " Performance: "
.zero 2
// Offset 32: 6 characters plus 2 bytes of padding.
.LC5:
.ascii " MLUPs"
.zero 2
// Offset 40: seven 32-bit words handed to _gfortran_set_options by main.
.type options.8.2753, %object
.size options.8.2753, 28
options.8.2753:
.word 68
.word 8191
.word 0
.word 1
.word 1
.word 0
.word 31
// 8-byte constants, each emitted as two 32-bit words.
// Assuming little-endian layout (low word first):
//   .LC6 = 0x3FC999999999999A, i.e. 0.2  (time threshold loaded into d8)
//   .LC7 = 0x3EB0C6F7A0B5ED8D, i.e. 1e-6 (scale factor in the reported rate)
.section .rodata.cst8,"aM",@progbits,8
.align 3
.LC6:
.word 2576980378
.word 1070176665
.LC7:
.word 2696277389
.word 1051772663
// Strings: the source file name and the two messages used by the error paths.
.section .rodata.str1.8,"aMS",@progbits,1
.align 3
.LC0:
.string "gs.f90"
.zero 1
.LC1:
.string "Integer overflow when calculating the amount of memory to allocate"
.zero 5
.LC2:
.string "Allocation would exceed memory limit"
.ident "GCC: (ARM-build-8) 8.2.0"
.section .note.GNU-stack,"",@progbits

1073
examples/gs/gs.s.zen.gcc.s Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
# Marked loop kernel, AT&T syntax, delimited by IACA byte markers
# (movl $111 / movl $222 into %ebx, each followed by bytes 100, 103, 144).
# Per iteration of .L21, four 256-bit packed-double results are produced.
# For each displacement d in {0, 32, 64, 96}:
#   s1 = d(%r8,%rax)  + d(%r9,%rax)
#   s2 = d(%rsi,%rax) + d(%rdi,%rax)
#   d(%rdx,%rax) = (s1 + s2) * %ymm8
# %ymm8 is only read inside the marked region. %rax is the shared byte
# offset; it advances by 128 per iteration (written as subtracting -128)
# until it equals %r15. The four groups are interleaved by the compiler.
# NOTE(review): presumably the "Jacobi 2D" example (four-neighbour sum times
# a constant); the file name is not visible in this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L21:
# Group at displacement 0.
vmovupd (%r8,%rax), %ymm11
vmovupd (%rsi,%rax), %ymm13
vaddpd (%r9,%rax), %ymm11, %ymm12
vaddpd (%rdi,%rax), %ymm13, %ymm14
# Loads for displacement 32 are issued while group 0 is still being combined.
vmovupd 32(%r8,%rax), %ymm1
vmovupd 32(%rsi,%rax), %ymm2
vaddpd %ymm14, %ymm12, %ymm15
vaddpd 32(%r9,%rax), %ymm1, %ymm5
vaddpd 32(%rdi,%rax), %ymm2, %ymm7
vmulpd %ymm8, %ymm15, %ymm0
vmovupd 64(%r8,%rax), %ymm10
vaddpd %ymm7, %ymm5, %ymm6
vmovupd 64(%rsi,%rax), %ymm12
vmovupd 96(%rsi,%rax), %ymm5
# Result for displacement 0.
vmovupd %ymm0, (%rdx,%rax)
vmovupd 96(%r8,%rax), %ymm0
vaddpd 64(%r9,%rax), %ymm10, %ymm11
vaddpd 64(%rdi,%rax), %ymm12, %ymm13
vaddpd 96(%r9,%rax), %ymm0, %ymm1
vaddpd 96(%rdi,%rax), %ymm5, %ymm2
vaddpd %ymm13, %ymm11, %ymm14
vmulpd %ymm8, %ymm6, %ymm9
vaddpd %ymm2, %ymm1, %ymm7
vmulpd %ymm8, %ymm14, %ymm15
vmulpd %ymm8, %ymm7, %ymm6
# Results for displacements 32, 64 and 96.
vmovupd %ymm9, 32(%rdx,%rax)
vmovupd %ymm15, 64(%rdx,%rax)
vmovupd %ymm6, 96(%rdx,%rax)
# Subtracting -128 adds 128 to the byte offset.
subq $-128, %rax
cmpq %rax, %r15
jne .L21
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,46 @@
# Marked loop kernel (x86-64, AT&T syntax, 128-bit SSE, two doubles per vector).
# The loop is rotated: .L28 handles byte offsets 16, 32 and 48 relative to the
# current %rax, then %rax += 64 and .L21 handles offset 0 of the new %rax, so
# one pass through .L28 + .L21 produces four 16-byte results.
# Each result is ((%r8 vector + previous %r8 vector) + (%rdi vector + %rsi
# vector)) * %xmm2, stored to the %rcx stream. Every vector loaded from %r8 is
# therefore used for two consecutive results. %xmm2 is only read in the loop.
# NOTE(review): %xmm15 is read by the first addpd before anything in this
# excerpt writes it, so the real loop entry is presumably a jump to .L21 from
# code outside the marked region -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L28:
# Offset 16: %xmm15 still holds the %r8 vector loaded at .L21 (offset 0).
movupd 16(%r8,%rax), %xmm11
movupd 16(%rdi,%rax), %xmm12
movupd 16(%rsi,%rax), %xmm13
addpd %xmm11, %xmm15
addpd %xmm13, %xmm12
movupd 32(%rdi,%rax), %xmm14
movupd 32(%rsi,%rax), %xmm0
addpd %xmm15, %xmm12
# Offset 32: %xmm11 (the offset-16 %r8 vector) is reused as the partner.
movupd 32(%r8,%rax), %xmm15
addpd %xmm0, %xmm14
addpd %xmm15, %xmm11
movupd 48(%rdi,%rax), %xmm1
movupd 48(%rsi,%rax), %xmm7
addpd %xmm11, %xmm14
addpd %xmm7, %xmm1
mulpd %xmm2, %xmm12
mulpd %xmm2, %xmm14
movups %xmm12, 16(%rcx,%rax)
movups %xmm14, 32(%rcx,%rax)
# Offset 48: %xmm15 (the offset-32 %r8 vector) is reused; %xmm14 keeps the
# offset-48 vector for the .L21 part below.
movupd 48(%r8,%rax), %xmm14
addpd %xmm14, %xmm15
addpd %xmm15, %xmm1
mulpd %xmm2, %xmm1
movups %xmm1, 48(%rcx,%rax)
addq $64, %rax
.L21:
# Offset 0 of the advanced %rax: %xmm14 carries the last %r8 vector of .L28.
movupd (%r8,%rax), %xmm15
movupd (%rdi,%rax), %xmm0
movupd (%rsi,%rax), %xmm1
addpd %xmm15, %xmm14
addpd %xmm1, %xmm0
# %r10 = %rax + 16 is only used for the exit test; lea leaves the flags alone.
leaq 16(%rax), %r10
addpd %xmm0, %xmm14
mulpd %xmm2, %xmm14
movups %xmm14, (%rcx,%rax)
# Repeat until %rax + 16 equals %r14.
cmpq %r10, %r14
jne .L28
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,37 @@
# Marked loop kernel (x86-64, AT&T syntax, 256-bit AVX, four doubles per vector).
# %rcx is an element index scaled by 8 in every address. Per 32-byte group the
# loop adds four vectors -- 10016(%r8), 16(%r12), 20032(%r10) and 10032(%r8),
# the last being the same %r8 stream shifted by 16 bytes -- multiplies the sum
# by %ymm0 and stores it to 10016(%r9). Four such groups (offsets +0, +32,
# +64, +96) are processed per iteration. %ymm0 is only read inside the loop.
# NOTE(review): presumably the 2D Jacobi example listed in the README; the file
# name is not visible here -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.47: # Preds ..B1.47 ..B1.46
# Execution count [1.15e+04]
# Groups 0-2 are computed in lock step, one addition stage at a time.
vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5
vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5
vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5
vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5
vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5
vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5
vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5
vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5
vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5
vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5
vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5
vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5
# Group 3 starts here and reuses %ymm1..%ymm5 once groups 0-2 have consumed them.
vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5
vmulpd %ymm4, %ymm0, %ymm5 #94.5
vmulpd %ymm9, %ymm0, %ymm10 #94.5
vmulpd %ymm14, %ymm0, %ymm15 #94.5
vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5
vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5
vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5
vmovupd %ymm15, 10080(%r9,%rcx,8) #94.5
vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5
vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5
vmulpd %ymm4, %ymm0, %ymm5 #94.5
vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5
# 16 elements (4 vectors x 4 doubles) per iteration.
addq $16, %rcx #94.5
# Unsigned test: repeat while %rcx < %r14.
cmpq %r14, %rcx #94.5
jb ..B1.47 # Prob 82% #94.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,69 @@
# Marked loop kernel (x86-64, AT&T syntax, 512-bit AVX-512, eight doubles per
# vector). %rcx is an element index scaled by 8 in every address. The body is
# split over several fall-through basic blocks; only ..B1.47 is a branch target
# inside this excerpt.
# Per 64-byte group: (10016(%r8) + 16(%r12) + 20032(%r10) + 10032(%r8)) * %zmm4
# is stored to 10016(%r9); the groups are at offsets +0, +64, +128 and +192.
# %r8 is recomputed as %r12 + %r11 before each pair of %r8 loads; neither %r12
# nor %r11 is written inside the loop. %zmm4 is only read inside the loop.
# NOTE(review): presumably the 2D Jacobi example listed in the README; the file
# name is not visible here -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.47: # Preds ..B1.63 ..B1.46
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.48: # Preds ..B1.47
# Execution count [1.73e+04]
# Group 0: the two %r8 loads are 16 bytes apart (shifted copies of one stream).
vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.51: # Preds ..B1.48
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.52: # Preds ..B1.51
# Execution count [1.73e+04]
# Group 1 (+64 bytes).
vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.55: # Preds ..B1.52
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.56: # Preds ..B1.55
# Execution count [1.73e+04]
# Group 2 (+128 bytes).
vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.59: # Preds ..B1.56
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.60: # Preds ..B1.59
# Execution count [1.73e+04]
# Group 3 (+192 bytes).
vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.63: # Preds ..B1.60
# Execution count [1.15e+04]
vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5
# 32 elements (4 vectors x 8 doubles) per iteration.
addq $32, %rcx #94.5
# Unsigned test: repeat while %rcx < %r14.
cmpq %r14, %rcx #94.5
jb ..B1.47 # Prob 82% #94.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,40 @@
# Marked loop kernel (x86-64, AT&T syntax, 128-bit SSE, two doubles per vector).
# %rcx is an element index scaled by 8 in every address. Four 16-byte results
# are produced per iteration in %xmm0, %xmm2, %xmm4 and %xmm6; each is the sum
# of two neighbouring %r8 vectors, one %r12 vector and one %r10 vector,
# multiplied by %xmm7, and is stored to the %r9 stream at the end of the body.
# A vector loaded from %r8 is added to the previous accumulator and then
# becomes the next accumulator itself, so each is loaded only once.
# %xmm7 is only read inside the loop.
# NOTE(review): presumably the 2D Jacobi example listed in the README; the file
# name is not visible here -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.42: # Preds ..B1.42 ..B1.41
# Execution count [1.15e+04]
# Result 0 accumulates in %xmm0.
movups 10016(%r8,%rcx,8), %xmm0 #94.5
addpd 16(%r12,%rcx,8), %xmm0 #94.5
addpd 20032(%r10,%rcx,8), %xmm0 #94.5
movups 10032(%r8,%rcx,8), %xmm2 #94.5
movups 32(%r12,%rcx,8), %xmm1 #94.5
# %xmm2 first completes result 0, then starts accumulating result 1.
addpd %xmm2, %xmm0 #94.5
addpd %xmm1, %xmm2 #94.5
mulpd %xmm7, %xmm0 #94.5
addpd 20048(%r10,%rcx,8), %xmm2 #94.5
movups 10048(%r8,%rcx,8), %xmm4 #94.5
movups 48(%r12,%rcx,8), %xmm3 #94.5
# Same hand-over from %xmm4 (result 2) ...
addpd %xmm4, %xmm2 #94.5
addpd %xmm3, %xmm4 #94.5
mulpd %xmm7, %xmm2 #94.5
addpd 20064(%r10,%rcx,8), %xmm4 #94.5
movups 10064(%r8,%rcx,8), %xmm6 #94.5
movups 64(%r12,%rcx,8), %xmm5 #94.5
# ... and from %xmm6 (result 3).
addpd %xmm6, %xmm4 #94.5
addpd %xmm5, %xmm6 #94.5
mulpd %xmm7, %xmm4 #94.5
addpd 20080(%r10,%rcx,8), %xmm6 #94.5
addpd 10080(%r8,%rcx,8), %xmm6 #94.5
mulpd %xmm7, %xmm6 #94.5
# All four stores are issued together after the arithmetic.
movups %xmm0, 10016(%r9,%rcx,8) #94.5
movups %xmm2, 10032(%r9,%rcx,8) #94.5
movups %xmm4, 10048(%r9,%rcx,8) #94.5
movups %xmm6, 10064(%r9,%rcx,8) #94.5
# 8 elements (4 vectors x 2 doubles) per iteration.
addq $8, %rcx #94.5
# Unsigned test: repeat while %rcx < %r14.
cmpq %r14, %rcx #94.5
jb ..B1.42 # Prob 82% #94.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,131 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers.
// x16 is the running byte offset. x0 = x5 + x16 and x1 = x28 + x16 are read on
// a 16-byte grid (offsets 16, 32, ...); x18 = x21 + x16 is read at offsets
// 8, 24, ..., i.e. shifted by 8 bytes. Later x18 is re-pointed at x25 + x16 and
// used as the store pointer.
// Each stored vector is (x0 vector + x18 vector + x1 vector + next x18 vector)
// multiplied by v28; 16 vectors (256 bytes) are stored per iteration.
// Several x18 vectors (e.g. v1, v2, v3) feed two fadd instructions each.
// v28 is only read inside the loop.
// NOTE(review): presumably the 2D Jacobi example listed in the README; the file
// name is not visible here -- confirm.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_16 Depth=1
// Parent Loop BB1_19 Depth=2
// Parent Loop BB1_24 Depth=3
// => This Inner Loop Header: Depth=4
add x0, x5, x16
add x18, x21, x16
ldp q4, q5, [x0, #16]
ldp q6, q7, [x0, #48]
ldur q0, [x18, #8]
ldur q1, [x18, #24]
ldur q2, [x18, #40]
ldur q3, [x18, #56]
add x1, x28, x16
// x15 is advanced here but not read anywhere else inside the loop.
add x15, x15, #32 // =32
fadd v0.2d, v4.2d, v0.2d
fadd v4.2d, v5.2d, v1.2d
fadd v5.2d, v6.2d, v2.2d
fadd v6.2d, v7.2d, v3.2d
ldp q7, q16, [x1, #16]
fadd v1.2d, v7.2d, v1.2d
ldp q17, q18, [x1, #48]
// [x18, #72] is loaded here into q19 and again below into q4; no store
// separates the two loads.
ldur q19, [x18, #72]
fadd v0.2d, v0.2d, v1.2d
fadd v1.2d, v16.2d, v2.2d
fadd v2.2d, v17.2d, v3.2d
fadd v3.2d, v18.2d, v19.2d
ldp q16, q17, [x0, #80]
ldp q18, q19, [x0, #112]
fadd v1.2d, v4.2d, v1.2d
fadd v2.2d, v5.2d, v2.2d
fadd v3.2d, v6.2d, v3.2d
ldur q4, [x18, #72]
ldur q5, [x18, #88]
ldur q6, [x18, #104]
ldur q7, [x18, #120]
fadd v4.2d, v16.2d, v4.2d
fadd v16.2d, v17.2d, v5.2d
fadd v17.2d, v18.2d, v6.2d
fadd v18.2d, v19.2d, v7.2d
ldp q19, q20, [x1, #80]
fadd v5.2d, v19.2d, v5.2d
ldp q21, q22, [x1, #112]
// Same pattern: [x18, #136] is loaded into q23 here and into q16 below.
ldur q23, [x18, #136]
fadd v4.2d, v4.2d, v5.2d
fadd v5.2d, v20.2d, v6.2d
fadd v6.2d, v21.2d, v7.2d
fadd v7.2d, v22.2d, v23.2d
ldp q20, q21, [x0, #144]
ldp q22, q23, [x0, #176]
fadd v5.2d, v16.2d, v5.2d
fadd v6.2d, v17.2d, v6.2d
fadd v7.2d, v18.2d, v7.2d
ldur q16, [x18, #136]
ldur q17, [x18, #152]
ldur q18, [x18, #168]
ldur q19, [x18, #184]
fadd v16.2d, v20.2d, v16.2d
fadd v20.2d, v21.2d, v17.2d
fadd v21.2d, v22.2d, v18.2d
fadd v22.2d, v23.2d, v19.2d
ldp q23, q24, [x1, #144]
fadd v17.2d, v23.2d, v17.2d
ldp q25, q26, [x1, #176]
fadd v16.2d, v16.2d, v17.2d
fadd v17.2d, v24.2d, v18.2d
fadd v18.2d, v25.2d, v19.2d
ldp q24, q25, [x0, #208]
// [x18, #200] is loaded into q23 here and into q20 three lines below.
ldur q23, [x18, #200]
fadd v17.2d, v20.2d, v17.2d
fadd v18.2d, v21.2d, v18.2d
ldur q20, [x18, #200]
ldur q21, [x18, #216]
fadd v19.2d, v26.2d, v23.2d
fadd v20.2d, v24.2d, v20.2d
fadd v24.2d, v25.2d, v21.2d
ldp q25, q26, [x1, #208]
fadd v21.2d, v25.2d, v21.2d
fadd v20.2d, v20.2d, v21.2d
ldp q21, q25, [x0, #240]
fadd v19.2d, v22.2d, v19.2d
ldur q22, [x18, #232]
fadd v21.2d, v21.2d, v22.2d
fadd v22.2d, v26.2d, v22.2d
fadd v22.2d, v24.2d, v22.2d
ldp q24, q26, [x1, #240]
ldur q23, [x18, #248]
fadd v25.2d, v25.2d, v23.2d
fadd v23.2d, v24.2d, v23.2d
// x18 += 264 so the following ldr fetches the 17th shifted vector
// (8 + 16 * 16 = 264 bytes past x21 + x16).
add x18, x18, #264 // =264
fmul v0.2d, v0.2d, v28.2d
fmul v1.2d, v1.2d, v28.2d
fmul v2.2d, v2.2d, v28.2d
fmul v5.2d, v5.2d, v28.2d
fadd v21.2d, v21.2d, v23.2d
ldr q23, [x18]
// From here on x18 is the store pointer x25 + x16.
add x18, x25, x16
stur q0, [x18, #8]
stur q1, [x18, #24]
fmul v3.2d, v3.2d, v28.2d
stur q2, [x18, #40]
fadd v23.2d, v26.2d, v23.2d
stur q5, [x18, #88]
fmul v4.2d, v4.2d, v28.2d
stur q3, [x18, #56]
fmul v6.2d, v6.2d, v28.2d
stur q4, [x18, #72]
fmul v0.2d, v7.2d, v28.2d
stur q6, [x18, #104]
fmul v1.2d, v16.2d, v28.2d
stur q0, [x18, #120]
fmul v2.2d, v17.2d, v28.2d
stur q1, [x18, #136]
fmul v4.2d, v19.2d, v28.2d
stur q2, [x18, #152]
fadd v5.2d, v25.2d, v23.2d
stur q4, [x18, #184]
fmul v3.2d, v18.2d, v28.2d
stur q3, [x18, #168]
fmul v6.2d, v20.2d, v28.2d
stur q6, [x18, #200]
fmul v0.2d, v22.2d, v28.2d
stur q0, [x18, #216]
fmul v1.2d, v21.2d, v28.2d
stur q1, [x18, #232]
// Byte offset advances by the 256 bytes written this iteration.
add x16, x16, #256 // =256
fmul v2.2d, v5.2d, v28.2d
stur q2, [x18, #248]
// adds sets the flags; the loop repeats until x17 reaches zero.
adds x17, x17, #4 // =4
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,43 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers.
// x0 is the running byte offset; x5, x7 and x6 are x0 + 16, + 32 and + 48.
// Per 16-byte group the result is ((x14 vector + previous x14 vector) +
// (x25 vector + x22 vector)) * v21, stored to the x28 stream.
// v30 carries the last x14 vector of one iteration into the next one.
// v21 is only read inside the loop.
// NOTE(review): presumably the 2D Jacobi example listed in the README; the file
// name is not visible here -- confirm.
// OSACA-BEGIN
.L93:
add x5, x0, 16
ldr q2, [x14, x0]
ldr q5, [x25, x0]
add x7, x0, 32
ldr q13, [x22, x0]
ldr q4, [x25, x5]
add x6, x0, 48
// The loop bound is reloaded from the stack slot [sp, 144] on every iteration.
ldr x9, [sp, 144]
ldr q19, [x22, x5]
ldr q7, [x14, x5]
ldr q6, [x14, x7]
ldr q3, [x25, x7]
ldr q18, [x22, x7]
// Group 0 pairs the first x14 vector with v30 from the previous iteration.
fadd v17.2d, v2.2d, v30.2d
ldr q16, [x14, x6]
ldr q20, [x25, x6]
fadd v23.2d, v5.2d, v13.2d
ldr q22, [x22, x6]
fadd v24.2d, v4.2d, v19.2d
fadd v25.2d, v7.2d, v2.2d
fadd v27.2d, v6.2d, v7.2d
fadd v26.2d, v3.2d, v18.2d
fadd v28.2d, v16.2d, v6.2d
// Keep the last x14 vector (offset +48) for the next iteration.
mov v30.16b, v16.16b
fadd v29.2d, v20.2d, v22.2d
fadd v31.2d, v23.2d, v17.2d
fadd v0.2d, v24.2d, v25.2d
fadd v2.2d, v26.2d, v27.2d
fadd v1.2d, v29.2d, v28.2d
fmul v5.2d, v31.2d, v21.2d
fmul v13.2d, v0.2d, v21.2d
fmul v4.2d, v2.2d, v21.2d
fmul v19.2d, v1.2d, v21.2d
str q5, [x28, x0]
// x0 advances by 64 here; the remaining stores use the offsets x5/x7/x6 that
// were computed from the old x0.
add x0, x0, 64
str q13, [x28, x5]
str q4, [x28, x7]
str q19, [x28, x6]
// Repeat until x0 equals the bound in x9.
cmp x9, x0
bne .L93
// OSACA-END

View File

@@ -0,0 +1,36 @@
# Marked loop kernel (x86-64, AT&T syntax, 128-bit VEX-encoded AVX, two doubles
# per vector), bracketed by OSACA comment markers.
# The loop is rotated: .L28 handles byte offsets 16 (addressed through
# %rcx = %rax + 16), 32 and 48, then %rax += 64 and .L21 handles offset 0 of
# the new %rax, so one pass produces four 16-byte results.
# Each result is ((%r10 vector + previous %r10 vector) + (%rdi vector +
# %r8 vector)) * %xmm2, stored to the %rsi stream. %xmm2 is only read here.
# NOTE(review): %xmm8 and %rcx are read in .L28 before anything in this excerpt
# writes them, so the real loop entry is presumably a jump to .L21 from code
# outside the marked region -- confirm.
# OSACA-BEGIN
.L28:
vmovups (%r10,%rcx), %xmm5
vmovups 32(%r10,%rax), %xmm13
vmovups (%rdi,%rcx), %xmm1
vmovups 32(%rdi,%rax), %xmm14
vmovups 48(%rdi,%rax), %xmm9
vaddpd (%r8,%rcx), %xmm1, %xmm10
vaddpd 32(%r8,%rax), %xmm14, %xmm15
vaddpd 48(%r8,%rax), %xmm9, %xmm1
# %xmm8 still holds the %r10 vector loaded at .L21 (offset 0).
vaddpd %xmm5, %xmm8, %xmm8
vaddpd %xmm13, %xmm5, %xmm6
# %xmm5 now holds the offset-48 %r10 vector; it is read again at .L21.
vmovups 48(%r10,%rax), %xmm5
vaddpd %xmm8, %xmm10, %xmm11
vaddpd %xmm6, %xmm15, %xmm0
vmulpd %xmm2, %xmm11, %xmm12
vaddpd %xmm5, %xmm13, %xmm4
vmulpd %xmm2, %xmm0, %xmm7
vaddpd %xmm4, %xmm1, %xmm10
vmovups %xmm12, (%rsi,%rcx)
vmovups %xmm7, 32(%rsi,%rax)
vmulpd %xmm2, %xmm10, %xmm8
vmovups %xmm8, 48(%rsi,%rax)
addq $64, %rax
.L21:
vmovups (%r10,%rax), %xmm8
# %rcx = %rax + 16 serves both as the exit-test value and as the offset-16
# index for the next pass through .L28; lea leaves the flags alone.
leaq 16(%rax), %rcx
vmovups (%rdi,%rax), %xmm9
vaddpd (%r8,%rax), %xmm9, %xmm10
vaddpd %xmm8, %xmm5, %xmm11
vaddpd %xmm11, %xmm10, %xmm12
vmulpd %xmm2, %xmm12, %xmm13
vmovups %xmm13, (%rsi,%rax)
# Repeat until %rax + 16 equals %r14.
cmpq %rcx, %r14
jne .L28
# OSACA-END

View File

@@ -0,0 +1,44 @@
# Marked loop kernel (x86-64, AT&T syntax, 256-bit AVX2 + FMA, four doubles per
# vector), unrolled eight times (256 bytes per iteration).
# Per 32-byte group: out(%r12) = in(%r15) * in(%r14) + in(%r13), all streams
# indexed by the byte offset in %rax.
# vfmadd132pd m, b, a computes a = a * m + b.
# NOTE(review): three input streams and one output stream match the
# "Schoenauer triad" entry of the README; the file name is not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
# Load phase: multiplicand from %r15 and addend from %r13 for all eight groups.
vmovupd (%r15,%rax), %ymm5
vmovupd 0(%r13,%rax), %ymm6
vmovupd 32(%r15,%rax), %ymm8
vmovupd 32(%r13,%rax), %ymm7
vmovupd 64(%r15,%rax), %ymm9
vmovupd 64(%r13,%rax), %ymm10
vmovupd 96(%r15,%rax), %ymm11
vmovupd 96(%r13,%rax), %ymm12
vmovupd 128(%r15,%rax), %ymm13
vmovupd 128(%r13,%rax), %ymm14
vmovupd 160(%r15,%rax), %ymm15
vmovupd 160(%r13,%rax), %ymm2
vmovupd 192(%r15,%rax), %ymm0
vmovupd 192(%r13,%rax), %ymm1
vmovupd 224(%r15,%rax), %ymm3
vmovupd 224(%r13,%rax), %ymm4
# Compute phase: the %r14 operand is read directly from memory by the FMA.
vfmadd132pd (%r14,%rax), %ymm6, %ymm5
vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8
vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9
vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11
vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13
vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15
vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0
vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3
# Store phase.
vmovupd %ymm5, (%r12,%rax)
vmovupd %ymm8, 32(%r12,%rax)
vmovupd %ymm9, 64(%r12,%rax)
vmovupd %ymm11, 96(%r12,%rax)
vmovupd %ymm13, 128(%r12,%rax)
vmovupd %ymm15, 160(%r12,%rax)
vmovupd %ymm0, 192(%r12,%rax)
vmovupd %ymm3, 224(%r12,%rax)
addq $256, %rax
# Repeat until the byte offset in %rax equals %r8.
cmpq %rax, %r8
jne .L19
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,21 @@
# Marked loop kernel (x86-64, AT&T syntax, 512-bit AVX-512, eight doubles per
# vector), unrolled twice. %rax is an element index scaled by 8.
# Per 64-byte group: out(%r13) = in(%r14) * in(%rcx) + in(%r8).
# vfmadd213pd m, b, a computes a = b * a + m.
# NOTE(review): three input streams and one output stream match the
# "Schoenauer triad" entry of the README; the file name is not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.41: # Preds ..B1.41 ..B1.40
# Execution count [2.22e+03]
vmovups (%rcx,%rax,8), %zmm2 #80.5
vmovups 64(%rcx,%rax,8), %zmm4 #80.5
vmovups (%r14,%rax,8), %zmm1 #80.5
vmovups 64(%r14,%rax,8), %zmm3 #80.5
vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5
vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5
vmovupd %zmm2, (%r13,%rax,8) #80.5
vmovupd %zmm4, 64(%r13,%rax,8) #80.5
# 16 elements (2 vectors x 8 doubles) per iteration.
addq $16, %rax #80.5
# Unsigned test: repeat while %rax < %r12.
cmpq %r12, %rax #80.5
jb ..B1.41 # Prob 82% #80.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,112 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers.
// Per 16-byte vector: out(x12) = in(x9) + in(x11) * in(x10); fmla vd, vn, vm
// computes vd += vn * vm, with vd preloaded from the x9 stream.
// 32 vectors (512 bytes) are processed per iteration, addressed from -256 to
// +255 around each pointer; all four pointers advance by 512.
// The first two results are spilled to the stack at [sp, #96] / [sp, #112] and
// only written to the output near the end of the body, after q0/q1 have been
// reused for later loads.
// NOTE(review): three input streams and one output stream match the
// "Schoenauer triad" entry of the README; the file name is not visible -- confirm.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x9, #-256]
ldp q2, q3, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
ldp q16, q17, [x11, #-256]
ldp q18, q19, [x11, #-224]
fmla v0.2d, v16.2d, v4.2d
fmla v1.2d, v17.2d, v5.2d
// Spill results 1 and 0 (q1 at sp+96, q0 at sp+112) for the delayed store.
stp q1, q0, [sp, #96] // 32-byte Folded Spill
fmla v2.2d, v18.2d, v6.2d
fmla v3.2d, v19.2d, v7.2d
ldp q4, q5, [x9, #-192]
ldp q6, q7, [x9, #-160]
ldp q16, q17, [x10, #-192]
ldp q18, q19, [x10, #-160]
ldp q20, q21, [x11, #-192]
ldp q22, q23, [x11, #-160]
fmla v4.2d, v20.2d, v16.2d
// Stores are paired across group boundaries: result 3 with result 4, etc.
stp q3, q4, [x12, #-208]
fmla v5.2d, v21.2d, v17.2d
fmla v6.2d, v22.2d, v18.2d
stp q5, q6, [x12, #-176]
fmla v7.2d, v23.2d, v19.2d
ldp q16, q18, [x9, #-128]
ldp q17, q19, [x9, #-96]
ldp q20, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
ldp q24, q25, [x11, #-128]
ldp q26, q27, [x11, #-96]
fmla v16.2d, v24.2d, v20.2d
stp q7, q16, [x12, #-144]
fmla v18.2d, v25.2d, v21.2d
fmla v17.2d, v26.2d, v22.2d
stp q18, q17, [x12, #-112]
fmla v19.2d, v27.2d, v23.2d
ldp q22, q23, [x9, #-64]
ldp q20, q21, [x9, #-32]
ldp q24, q25, [x10, #-64]
ldp q26, q27, [x10, #-32]
ldp q28, q29, [x11, #-64]
ldp q30, q31, [x11, #-32]
fmla v22.2d, v28.2d, v24.2d
stp q19, q22, [x12, #-80]
fmla v23.2d, v29.2d, v25.2d
fmla v20.2d, v30.2d, v26.2d
stp q23, q20, [x12, #-48]
fmla v21.2d, v31.2d, v27.2d
stur q21, [x12, #-16]
// Second half: non-negative offsets, results stored in naturally aligned pairs.
ldp q24, q25, [x9]
ldp q26, q27, [x9, #32]
ldp q28, q29, [x10]
ldp q30, q31, [x10, #32]
ldp q8, q10, [x11]
ldp q11, q12, [x11, #32]
fmla v24.2d, v8.2d, v28.2d
fmla v25.2d, v10.2d, v29.2d
stp q24, q25, [x12]
fmla v26.2d, v11.2d, v30.2d
fmla v27.2d, v12.2d, v31.2d
stp q26, q27, [x12, #32]
ldp q28, q29, [x9, #64]
ldp q30, q31, [x9, #96]
ldp q8, q10, [x10, #64]
ldp q11, q12, [x10, #96]
ldp q13, q14, [x11, #64]
ldp q15, q9, [x11, #96]
fmla v28.2d, v13.2d, v8.2d
fmla v29.2d, v14.2d, v10.2d
stp q28, q29, [x12, #64]
fmla v30.2d, v15.2d, v11.2d
fmla v31.2d, v9.2d, v12.2d
stp q30, q31, [x12, #96]
ldp q8, q9, [x9, #128]
ldp q12, q13, [x10, #128]
ldp q14, q15, [x11, #128]
ldp q10, q11, [x9, #160]
fmla v8.2d, v14.2d, v12.2d
ldp q12, q14, [x10, #160]
fmla v9.2d, v15.2d, v13.2d
stp q8, q9, [x12, #128]
ldp q13, q15, [x11, #160]
fmla v10.2d, v13.2d, v12.2d
fmla v11.2d, v15.2d, v14.2d
stp q10, q11, [x12, #160]
ldp q12, q13, [x9, #192]
ldp q14, q15, [x10, #192]
// q0/q1 are reused as load destinations here, which is why results 0 and 1
// were spilled above.
ldp q0, q1, [x11, #192]
fmla v12.2d, v0.2d, v14.2d
// Reload result 0 from sp+112 and store it at the start of the output window.
ldr q0, [sp, #112] // 16-byte Folded Reload
stur q0, [x12, #-256]
// Reload result 1 from sp+96 and store it together with result 2 (q2).
ldr q0, [sp, #96] // 16-byte Folded Reload
stp q0, q2, [x12, #-240]
ldp q0, q2, [x9, #224]
ldp q3, q4, [x10, #224]
ldp q5, q6, [x11, #224]
fmla v13.2d, v1.2d, v15.2d
stp q12, q13, [x12, #192]
fmla v0.2d, v5.2d, v3.2d
fmla v2.2d, v6.2d, v4.2d
stp q0, q2, [x12, #224]
// x8 is advanced here but not read anywhere else inside the loop.
add x8, x8, #64 // =64
add x12, x12, #512 // =512
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
// adds sets the flags; the loop repeats until x13 reaches zero.
adds x13, x13, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,53 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers. Unrolled eight times (128 bytes per iteration).
// x11 is the running byte offset; x12, x7, x6, x5, x8, x0 and x13 are
// x11 + 16 ... x11 + 112 and serve as register offsets for the other groups.
// Per group: out(x19) = in(x20) + in(x22) * in(x21); fmla vd, vn, vm computes
// vd += vn * vm, with vd preloaded from the x20 stream.
// NOTE(review): three input streams and one output stream match the
// "Schoenauer triad" entry of the README; the file name is not visible -- confirm.
// OSACA-BEGIN
.L17:
add x12, x11, 16
ldr q29, [x22, x11]
ldr q30, [x20, x11]
add x7, x11, 32
ldr q31, [x21, x11]
ldr q7, [x22, x12]
add x6, x11, 48
add x5, x11, 64
ldr q6, [x20, x12]
ldr q2, [x21, x12]
add x8, x11, 80
add x0, x11, 96
ldr q9, [x22, x7]
ldr q5, [x20, x7]
add x13, x11, 112
ldr q1, [x21, x7]
ldr q16, [x22, x6]
ldr q4, [x20, x6]
ldr q0, [x21, x6]
fmla v30.2d, v29.2d, v31.2d
ldr q23, [x22, x5]
ldr q3, [x20, x5]
fmla v6.2d, v7.2d, v2.2d
ldr q22, [x21, x5]
ldr q21, [x22, x8]
ldr q24, [x20, x8]
ldr q20, [x21, x8]
fmla v5.2d, v9.2d, v1.2d
ldr q19, [x22, x0]
ldr q25, [x20, x0]
fmla v4.2d, v16.2d, v0.2d
ldr q18, [x21, x0]
ldr q17, [x22, x13]
ldr q26, [x20, x13]
ldr q27, [x21, x13]
fmla v3.2d, v23.2d, v22.2d
fmla v24.2d, v21.2d, v20.2d
str q30, [x19, x11]
// x11 advances by 128 here; the remaining stores use the offsets that were
// computed from the old x11.
add x11, x11, 128
str q6, [x19, x12]
fmla v25.2d, v19.2d, v18.2d
str q5, [x19, x7]
fmla v26.2d, v17.2d, v27.2d
str q4, [x19, x6]
str q3, [x19, x5]
str q24, [x19, x8]
str q25, [x19, x0]
str q26, [x19, x13]
// Repeat until x11 equals the bound in x25.
cmp x25, x11
bne .L17
// OSACA-END

View File

@@ -0,0 +1,38 @@
# Marked loop kernel (x86-64, AT&T syntax, 128-bit VEX-encoded FMA, two doubles
# per vector), bracketed by OSACA comment markers. Unrolled eight times
# (128 bytes per iteration).
# Per 16-byte group: out(%rbp) = in(%r14) * in(%r13) + in(%r12), all streams
# indexed by the byte offset in %rax.
# vfmadd132pd m, b, a computes a = a * m + b.
# NOTE(review): three input streams and one output stream match the
# "Schoenauer triad" entry of the README; the file name is not visible -- confirm.
# OSACA-BEGIN
.L19:
# Load phase: multiplicand from %r14 and addend from %r12 for all eight groups.
vmovups (%r14,%rax), %xmm0
vmovups (%r12,%rax), %xmm5
vmovups 16(%r14,%rax), %xmm3
vmovups 16(%r12,%rax), %xmm6
vmovups 32(%r14,%rax), %xmm4
vmovups 32(%r12,%rax), %xmm7
vmovups 48(%r14,%rax), %xmm8
vmovups 48(%r12,%rax), %xmm9
vmovups 64(%r14,%rax), %xmm10
vmovups 64(%r12,%rax), %xmm11
vmovups 80(%r14,%rax), %xmm12
vmovups 80(%r12,%rax), %xmm13
vmovups 96(%r14,%rax), %xmm14
vmovups 96(%r12,%rax), %xmm15
vmovups 112(%r14,%rax), %xmm2
vmovups 112(%r12,%rax), %xmm1
# Compute phase: the %r13 operand is read directly from memory by the FMA.
vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0
vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3
vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4
vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8
vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10
vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12
vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14
vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2
# Store phase.
vmovups %xmm0, 0(%rbp,%rax)
vmovups %xmm3, 16(%rbp,%rax)
vmovups %xmm4, 32(%rbp,%rax)
vmovups %xmm8, 48(%rbp,%rax)
vmovups %xmm10, 64(%rbp,%rax)
vmovups %xmm12, 80(%rbp,%rax)
vmovups %xmm14, 96(%rbp,%rax)
vmovups %xmm2, 112(%rbp,%rax)
# %rax += 128; -128 fits a sign-extended 8-bit immediate, +128 would not.
subq $-128, %rax
# Repeat until the byte offset in %rax equals %rcx.
cmpq %rcx, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,46 @@
# Marked loop kernel (x86-64, AT&T syntax): sum of doubles accumulated strictly
# one element at a time. The region carries both the IACA byte markers and
# LLVM-MCA comment markers.
# Four 32-byte vectors are loaded from %rcx per iteration; each of their 16
# lanes is isolated (low lane directly, vunpckhpd for the odd lane,
# vextractf64x2 for the upper 128 bits) and added with a scalar vaddsd.
# The 16 vaddsd instructions form one serial dependency chain that starts and
# ends in %xmm0.
# vextractf64x2 is an AVX-512 (DQ) instruction.
# NOTE(review): matches the "Sum reduction" entry of the README; the file name
# is not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# LLVM-MCA-BEGIN
.L19:
vmovupd (%rcx), %ymm4
vmovupd 32(%rcx), %ymm13
# Chain link 1: running sum (%xmm0) + lane 0 of the first vector.
vaddsd %xmm4, %xmm0, %xmm6
vunpckhpd %xmm4, %xmm4, %xmm3
vextractf64x2 $0x1, %ymm4, %xmm8
vaddsd %xmm6, %xmm3, %xmm7
vunpckhpd %xmm8, %xmm8, %xmm11
vunpckhpd %xmm13, %xmm13, %xmm1
vaddsd %xmm7, %xmm8, %xmm10
vextractf64x2 $0x1, %ymm13, %xmm2
vunpckhpd %xmm2, %xmm2, %xmm3
vaddsd %xmm11, %xmm10, %xmm12
vmovupd 64(%rcx), %ymm8
vmovupd 96(%rcx), %ymm5
vaddsd %xmm13, %xmm12, %xmm0
vunpckhpd %xmm8, %xmm8, %xmm12
vextractf64x2 $0x1, %ymm8, %xmm14
vaddsd %xmm0, %xmm1, %xmm4
vunpckhpd %xmm14, %xmm14, %xmm0
vextractf64x2 $0x1, %ymm5, %xmm9
vaddsd %xmm4, %xmm2, %xmm6
# %rcx += 128 (all four loads are already issued); -128 fits a sign-extended
# 8-bit immediate, +128 would not.
subq $-128, %rcx
vaddsd %xmm3, %xmm6, %xmm7
vaddsd %xmm8, %xmm7, %xmm11
vunpckhpd %xmm5, %xmm5, %xmm7
vaddsd %xmm11, %xmm12, %xmm13
vunpckhpd %xmm9, %xmm9, %xmm12
vaddsd %xmm13, %xmm14, %xmm1
vaddsd %xmm0, %xmm1, %xmm4
vaddsd %xmm5, %xmm4, %xmm3
vaddsd %xmm3, %xmm7, %xmm8
vaddsd %xmm8, %xmm9, %xmm11
# Last chain link: the running sum is back in %xmm0 for the next iteration.
vaddsd %xmm12, %xmm11, %xmm0
# Repeat until %rcx equals %r15.
cmpq %rcx, %r15
jne .L19
# LLVM-MCA-END
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,20 @@
# Marked loop kernel (x86-64, AT&T syntax, 256-bit AVX): vectorised sum of
# doubles. Eight 32-byte vectors (256 bytes) are added per iteration.
# All eight vaddpd instructions form a single dependency chain that starts and
# ends in %ymm3 (the running vector sum).
# NOTE(review): matches the "Sum reduction" entry of the README; the file name
# is not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vaddpd (%rcx), %ymm3, %ymm4
# The pointer is advanced right after the first load, so the remaining seven
# vectors are addressed with negative displacements (-224 ... -32).
addq $256, %rcx
vaddpd -224(%rcx), %ymm4, %ymm5
vaddpd -192(%rcx), %ymm5, %ymm6
vaddpd -160(%rcx), %ymm6, %ymm8
vaddpd -128(%rcx), %ymm8, %ymm9
vaddpd -96(%rcx), %ymm9, %ymm10
vaddpd -64(%rcx), %ymm10, %ymm11
vaddpd -32(%rcx), %ymm11, %ymm3
# Repeat until %rcx equals %r15.
cmpq %rcx, %r15
jne .L19
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,17 @@
# Marked loop kernel (x86-64, AT&T syntax, 512-bit AVX-512): vectorised sum of
# doubles with four independent accumulators (%zmm4, %zmm3, %zmm2, %zmm1), each
# updated once per iteration from its own 64-byte slice of the %r13 stream.
# %rax is an element index scaled by 8.
# NOTE(review): matches the "Sum reduction" entry of the README; the file name
# is not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.38: # Preds ..B1.38 ..B1.37
# Execution count [2.22e+03]
vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5
vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5
vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5
vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5
# 32 elements (4 vectors x 8 doubles) per iteration.
addq $32, %rax #76.5
# Unsigned test: repeat while %rax < %r14.
cmpq %r14, %rax #76.5
jb ..B1.38 # Prob 82% #76.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,57 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers: vectorised sum of doubles.
// 512 bytes (16 ldp pairs, offsets -256 ... +224 around x9) are consumed per
// iteration. v0-v3 are the running accumulators; v4-v7 and v16-v19 hold loaded
// data and partial sums. Loaded vectors are frequently added to each other
// first and only then folded into an accumulator.
// NOTE(review): matches the "Sum reduction" entry of the README; the file name
// is not visible -- confirm.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q4, q5, [x9, #-256]
fadd v0.2d, v4.2d, v0.2d
fadd v1.2d, v5.2d, v1.2d
ldp q4, q5, [x9, #-192]
ldp q16, q17, [x9, #-128]
// Two loaded vectors are combined before touching the accumulator v0.
fadd v4.2d, v4.2d, v16.2d
ldp q6, q7, [x9, #-224]
fadd v2.2d, v6.2d, v2.2d
fadd v3.2d, v7.2d, v3.2d
fadd v0.2d, v0.2d, v4.2d
fadd v4.2d, v5.2d, v17.2d
ldp q6, q7, [x9, #-160]
ldp q18, q19, [x9, #-96]
ldp q16, q17, [x9]
// x8 is advanced here but not read anywhere else inside the loop.
add x8, x8, #64 // =64
fadd v1.2d, v1.2d, v4.2d
fadd v4.2d, v6.2d, v18.2d
fadd v2.2d, v2.2d, v4.2d
fadd v4.2d, v7.2d, v19.2d
ldp q6, q7, [x9, #-32]
ldp q18, q19, [x9, #32]
fadd v6.2d, v6.2d, v18.2d
fadd v7.2d, v7.2d, v19.2d
fadd v3.2d, v3.2d, v4.2d
ldp q4, q5, [x9, #-64]
fadd v4.2d, v4.2d, v16.2d
fadd v5.2d, v5.2d, v17.2d
ldp q16, q17, [x9, #64]
fadd v4.2d, v4.2d, v16.2d
fadd v5.2d, v5.2d, v17.2d
ldp q16, q17, [x9, #128]
fadd v0.2d, v0.2d, v16.2d
fadd v1.2d, v1.2d, v17.2d
ldp q16, q17, [x9, #192]
ldp q18, q19, [x9, #96]
fadd v6.2d, v6.2d, v18.2d
fadd v7.2d, v7.2d, v19.2d
fadd v4.2d, v4.2d, v16.2d
ldp q18, q19, [x9, #160]
fadd v2.2d, v2.2d, v18.2d
fadd v3.2d, v3.2d, v19.2d
fadd v0.2d, v0.2d, v4.2d
fadd v4.2d, v5.2d, v17.2d
ldp q18, q19, [x9, #224]
// All loads of this iteration are issued; advance the input pointer.
add x9, x9, #512 // =512
fadd v1.2d, v1.2d, v4.2d
fadd v4.2d, v6.2d, v18.2d
fadd v2.2d, v2.2d, v4.2d
fadd v4.2d, v7.2d, v19.2d
fadd v3.2d, v3.2d, v4.2d
// adds sets the flags; the loop repeats until x10 reaches zero.
adds x10, x10, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,47 @@
// Marked loop kernel (AArch64), bracketed by OSACA comment markers: sum of
// doubles accumulated strictly one element at a time.
// Eight 16-byte vectors (128 bytes) are loaded per iteration from x16; dup
// splits each into its two lanes, and 16 scalar fadd instructions form one
// serial dependency chain that starts and ends in d8.
// NOTE(review): matches the "Sum reduction" entry of the README; the file name
// is not visible -- confirm.
// OSACA-BEGIN
.L17:
mov x17, x16
// Post-indexed load: reads [x17], then x17 += 16.
ldr q4, [x17], 16
ldr q5, [x16, 16]
// x16 is advanced early; the next five loads use negative offsets that
// correspond to old x16 + 48 ... + 112.
add x16, x16, 128
ldr q3, [x16, -80]
ldr q2, [x16, -64]
ldr q0, [x16, -48]
ldr q1, [x16, -32]
ldr q7, [x16, -16]
dup d16, v4.d[0]
dup d6, v4.d[1]
// q4 is reused: this load fetches old x16 + 32 (x17 is already old x16 + 16).
ldr q4, [x17, 16]
dup d22, v5.d[0]
dup d5, v5.d[1]
dup d20, v3.d[0]
dup d3, v3.d[1]
dup d19, v2.d[0]
dup d2, v2.d[1]
dup d21, v4.d[0]
dup d4, v4.d[1]
// First chain link consumes the running sum in d8 ...
fadd d10, d8, d16
dup d18, v0.d[0]
dup d0, v0.d[1]
// ... so d8 can be reused as a temporary until the final fadd below.
dup d8, v1.d[0]
dup d1, v1.d[1]
dup d17, v7.d[0]
dup d7, v7.d[1]
fadd d23, d6, d10
fadd d24, d23, d22
fadd d25, d5, d24
fadd d26, d25, d21
fadd d27, d4, d26
fadd d28, d27, d20
fadd d29, d3, d28
fadd d30, d29, d19
fadd d31, d2, d30
fadd d16, d31, d18
fadd d6, d0, d16
fadd d22, d6, d8
fadd d5, d1, d22
fadd d20, d5, d17
// Last chain link: the running sum is back in d8 for the next iteration.
fadd d8, d7, d20
// Repeat until x16 equals the bound in x22.
cmp x22, x16
bne .L17
// OSACA-END

View File

@@ -0,0 +1,23 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers: vectorised sum of doubles.
// Eight vectors (128 bytes) are loaded per iteration from x16 and added in a
// single dependency chain that starts and ends in v1 (the running vector sum).
// NOTE(review): matches the "Sum reduction" entry of the README; the file name
// is not visible -- confirm.
// OSACA-BEGIN
.L17:
mov x17, x16
// Post-indexed load: reads [x17], then x17 += 16.
ldr q10, [x17], 16
ldr q16, [x16, 16]
// x16 is advanced early; the next five loads use negative offsets that
// correspond to old x16 + 48 ... + 112.
add x16, x16, 128
ldr q17, [x16, -80]
ldr q18, [x16, -64]
ldr q19, [x16, -48]
ldr q20, [x16, -32]
ldr q21, [x16, -16]
fadd v22.2d, v1.2d, v10.2d
// Fetches old x16 + 32 (x17 is already old x16 + 16).
ldr q23, [x17, 16]
fadd v24.2d, v22.2d, v16.2d
fadd v25.2d, v24.2d, v23.2d
fadd v26.2d, v25.2d, v17.2d
fadd v27.2d, v26.2d, v18.2d
fadd v28.2d, v27.2d, v19.2d
fadd v29.2d, v28.2d, v20.2d
fadd v1.2d, v29.2d, v21.2d
// Repeat until x16 equals the bound in x22.
cmp x22, x16
bne .L17
// OSACA-END

View File

@@ -0,0 +1,38 @@
# Marked loop kernel (x86-64, AT&T syntax, scalar VEX-encoded AVX), bracketed by
# OSACA comment markers: sum of doubles accumulated one element at a time.
# Sixteen 8-byte scalars (128 bytes) are loaded per iteration from %r10 and
# added by 16 vaddsd instructions that form one serial dependency chain
# starting and ending in %xmm7.
# NOTE(review): matches the "Sum reduction" entry of the README; the file name
# is not visible -- confirm.
# OSACA-BEGIN
.L19:
vmovsd (%r10), %xmm8
vmovsd 8(%r10), %xmm10
# %r10 += 128 after the first two loads; the remaining loads use negative
# displacements. -128 fits a sign-extended 8-bit immediate, +128 would not.
subq $-128, %r10
vmovsd -112(%r10), %xmm12
vmovsd -104(%r10), %xmm14
vmovsd -96(%r10), %xmm1
vmovsd -88(%r10), %xmm2
vmovsd -80(%r10), %xmm3
vmovsd -72(%r10), %xmm6
# First chain link: running sum (%xmm7) + element 0. From here on each load
# register is refilled as soon as its previous value has been consumed.
vaddsd %xmm8, %xmm7, %xmm9
vmovsd -64(%r10), %xmm8
vaddsd %xmm9, %xmm10, %xmm11
vmovsd -56(%r10), %xmm10
vaddsd %xmm12, %xmm11, %xmm13
vmovsd -48(%r10), %xmm12
vaddsd %xmm13, %xmm14, %xmm15
vmovsd -40(%r10), %xmm14
vaddsd %xmm1, %xmm15, %xmm4
vmovsd -32(%r10), %xmm1
vaddsd %xmm4, %xmm2, %xmm0
vmovsd -24(%r10), %xmm2
vaddsd %xmm3, %xmm0, %xmm5
vmovsd -16(%r10), %xmm3
vaddsd %xmm5, %xmm6, %xmm7
vmovsd -8(%r10), %xmm6
vaddsd %xmm8, %xmm7, %xmm9
vaddsd %xmm9, %xmm10, %xmm11
vaddsd %xmm12, %xmm11, %xmm13
vaddsd %xmm13, %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm4
vaddsd %xmm4, %xmm2, %xmm0
vaddsd %xmm3, %xmm0, %xmm5
# Last chain link: the running sum is back in %xmm7 for the next iteration.
vaddsd %xmm5, %xmm6, %xmm7
# Repeat until %r10 equals %r14.
cmpq %r10, %r14
jne .L19
# OSACA-END

View File

@@ -0,0 +1,14 @@
# Marked loop kernel (x86-64, AT&T syntax, 128-bit VEX-encoded AVX, two doubles
# per vector), bracketed by OSACA comment markers: vectorised sum of doubles.
# Eight 16-byte vectors (128 bytes) are added per iteration in a single
# dependency chain that starts and ends in %xmm3 (the running vector sum).
# NOTE(review): matches the "Sum reduction" entry of the README; the file name
# is not visible -- confirm.
# OSACA-BEGIN
.L19:
vaddpd (%r10), %xmm3, %xmm1
# %r10 += 128 right after the first load, so the remaining seven vectors use
# negative displacements. -128 fits a sign-extended 8-bit immediate.
subq $-128, %r10
vaddpd -112(%r10), %xmm1, %xmm4
vaddpd -96(%r10), %xmm4, %xmm5
vaddpd -80(%r10), %xmm5, %xmm6
vaddpd -64(%r10), %xmm6, %xmm8
vaddpd -48(%r10), %xmm8, %xmm9
vaddpd -32(%r10), %xmm9, %xmm10
vaddpd -16(%r10), %xmm10, %xmm3
# Repeat until %r10 equals %r14.
cmpq %r10, %r14
jne .L19
# OSACA-END

View File

@@ -0,0 +1,36 @@
# Marked loop kernel (x86-64, AT&T syntax, 256-bit AVX2 + FMA, four doubles per
# vector), unrolled eight times (256 bytes per iteration).
# Per 32-byte group: out(%r12) = %ymm6 * in(%r14) + in(%r13), all streams
# indexed by the byte offset in %rsi. %ymm6 is only read inside the loop.
# vfmadd213pd m, b, a computes a = b * a + m.
# NOTE(review): two input streams, one loop-invariant multiplier and a separate
# output stream match the "STREAM triad" entry of the README; the file name is
# not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
# Load phase: the stream that gets multiplied by %ymm6.
vmovupd (%r14,%rsi), %ymm14
vmovupd 32(%r14,%rsi), %ymm15
vmovupd 64(%r14,%rsi), %ymm1
vmovupd 96(%r14,%rsi), %ymm0
vmovupd 128(%r14,%rsi), %ymm3
vmovupd 160(%r14,%rsi), %ymm4
vmovupd 192(%r14,%rsi), %ymm5
vmovupd 224(%r14,%rsi), %ymm7
# Compute phase: the %r13 addend is read directly from memory by the FMA.
vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14
vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15
vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1
vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0
vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3
vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4
vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5
vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7
# Store phase.
vmovupd %ymm14, (%r12,%rsi)
vmovupd %ymm15, 32(%r12,%rsi)
vmovupd %ymm1, 64(%r12,%rsi)
vmovupd %ymm0, 96(%r12,%rsi)
vmovupd %ymm3, 128(%r12,%rsi)
vmovupd %ymm4, 160(%r12,%rsi)
vmovupd %ymm5, 192(%r12,%rsi)
vmovupd %ymm7, 224(%r12,%rsi)
addq $256, %rsi
# Repeat until the byte offset in %rsi equals %rcx.
cmpq %rsi, %rcx
jne .L19
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,16 @@
# Marked loop kernel (x86-64, AT&T syntax, 512-bit AVX-512, eight doubles per
# vector), not unrolled. %rax is an element index scaled by 8.
# Per iteration: out(%r14) = %zmm2 * in(%r13) + in(%rcx).
# %zmm2 is only read inside the loop.
# vfmadd213pd m, b, a computes a = b * a + m.
# NOTE(review): two input streams, one loop-invariant multiplier and a separate
# output stream match the "STREAM triad" entry of the README; the file name is
# not visible -- confirm.
# IACA start marker: movl $111,%ebx followed by the bytes 100, 103, 144.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.40: # Preds ..B1.40 ..B1.39
# Execution count [2.22e+03]
vmovups (%r13,%rax,8), %zmm1 #78.5
vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5
vmovupd %zmm1, (%r14,%rax,8) #78.5
# 8 elements (one 64-byte vector) per iteration.
addq $8, %rax #78.5
# Unsigned test: repeat while %rax < %r12.
cmpq %r12, %rax #78.5
jb ..B1.40 # Prob 82% #78.5
# IACA end marker: movl $222,%ebx followed by the same three bytes.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,118 @@
// Marked loop kernel (AArch64, 128-bit NEON, two doubles per vector), bracketed
// by OSACA comment markers.
// Per 16-byte vector: out(x11) = in(x9) + in(x10) * s; fmla vd, vn, vm computes
// vd += vn * vm, with vd preloaded from the x9 stream.
// 32 vectors (512 bytes) are processed per iteration; x9, x10 and x11 all
// advance by 512.
// The multiplier s lives in the stack slot [sp, #80]. v16 holds it on loop
// entry (first four fmla), is then overwritten with data, and s is reloaded
// from the stack into various registers many times ("Folded Reload"). The
// reload into q16 near the end of the body restores v16 for the next iteration.
// NOTE(review): two input streams, one loop-invariant multiplier and a separate
// output stream match the "STREAM triad" entry of the README; the file name is
// not visible -- confirm.
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q2, q3, [x9, #-256]
ldp q0, q1, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
// v16 still holds the multiplier here.
fmla v2.2d, v4.2d, v16.2d
fmla v3.2d, v5.2d, v16.2d
stp q2, q3, [x11, #-256]
fmla v0.2d, v6.2d, v16.2d
fmla v1.2d, v7.2d, v16.2d
stp q0, q1, [x11, #-224]
ldp q6, q7, [x9, #-192]
// q16 is overwritten with input data; from here the multiplier comes from
// the stack slot.
ldp q16, q17, [x10, #-192]
ldr q20, [sp, #80] // 16-byte Folded Reload
fmla v6.2d, v16.2d, v20.2d
ldr q16, [sp, #80] // 16-byte Folded Reload
ldp q4, q5, [x9, #-160]
ldp q18, q19, [x10, #-160]
fmla v7.2d, v17.2d, v16.2d
stp q6, q7, [x11, #-192]
ldr q16, [sp, #80] // 16-byte Folded Reload
fmla v4.2d, v18.2d, v16.2d
ldr q16, [sp, #80] // 16-byte Folded Reload
fmla v5.2d, v19.2d, v16.2d
stp q4, q5, [x11, #-160]
ldp q17, q19, [x9, #-128]
ldp q20, q21, [x10, #-128]
ldr q24, [sp, #80] // 16-byte Folded Reload
fmla v17.2d, v20.2d, v24.2d
ldr q20, [sp, #80] // 16-byte Folded Reload
ldp q16, q18, [x9, #-96]
ldp q22, q23, [x10, #-96]
fmla v19.2d, v21.2d, v20.2d
stp q17, q19, [x11, #-128]
ldr q20, [sp, #80] // 16-byte Folded Reload
fmla v16.2d, v22.2d, v20.2d
ldr q20, [sp, #80] // 16-byte Folded Reload
ldp q24, q25, [x10, #-64]
fmla v18.2d, v23.2d, v20.2d
stp q16, q18, [x11, #-96]
ldp q20, q22, [x9, #-64]
ldr q28, [sp, #80] // 16-byte Folded Reload
fmla v20.2d, v24.2d, v28.2d
ldr q24, [sp, #80] // 16-byte Folded Reload
ldp q21, q23, [x9, #-32]
ldp q26, q27, [x10, #-32]
fmla v22.2d, v25.2d, v24.2d
stp q20, q22, [x11, #-64]
ldr q24, [sp, #80] // 16-byte Folded Reload
fmla v21.2d, v26.2d, v24.2d
ldr q24, [sp, #80] // 16-byte Folded Reload
ldp q28, q29, [x10]
ldr q8, [sp, #80] // 16-byte Folded Reload
ldp q30, q31, [x10, #32]
ldr q9, [sp, #80] // 16-byte Folded Reload
fmla v23.2d, v27.2d, v24.2d
stp q21, q23, [x11, #-32]
// Second half: non-negative offsets.
ldp q24, q25, [x9]
fmla v24.2d, v28.2d, v8.2d
ldr q28, [sp, #80] // 16-byte Folded Reload
ldp q26, q27, [x9, #32]
ldp q8, q10, [x10, #64]
ldp q11, q12, [x10, #96]
fmla v25.2d, v29.2d, v28.2d
stp q24, q25, [x11]
ldr q28, [sp, #80] // 16-byte Folded Reload
fmla v26.2d, v30.2d, v28.2d
ldr q28, [sp, #80] // 16-byte Folded Reload
ldp q13, q14, [x10, #128]
ldr q2, [sp, #80] // 16-byte Folded Reload
ldp q1, q3, [x10, #192]
fmla v27.2d, v31.2d, v28.2d
stp q26, q27, [x11, #32]
ldp q28, q29, [x9, #64]
fmla v28.2d, v8.2d, v9.2d
ldr q8, [sp, #80] // 16-byte Folded Reload
ldp q30, q31, [x9, #96]
ldr q9, [sp, #80] // 16-byte Folded Reload
ldr q6, [sp, #80] // 16-byte Folded Reload
ldr q5, [sp, #80] // 16-byte Folded Reload
fmla v29.2d, v10.2d, v8.2d
stp q28, q29, [x11, #64]
ldr q8, [sp, #80] // 16-byte Folded Reload
fmla v30.2d, v11.2d, v8.2d
ldr q8, [sp, #80] // 16-byte Folded Reload
// This reload puts the multiplier back into v16; it is used by the last two
// fmla below and by the first four fmla of the next iteration.
ldr q16, [sp, #80] // 16-byte Folded Reload
// x8 is advanced here but not read anywhere else inside the loop.
add x8, x8, #64 // =64
fmla v31.2d, v12.2d, v8.2d
stp q30, q31, [x11, #96]
ldp q8, q10, [x9, #128]
fmla v8.2d, v13.2d, v9.2d
ldr q9, [sp, #80] // 16-byte Folded Reload
ldp q11, q12, [x9, #160]
fmla v10.2d, v14.2d, v9.2d
stp q8, q10, [x11, #128]
ldp q13, q14, [x10, #160]
fmla v12.2d, v14.2d, v2.2d
ldp q2, q0, [x9, #192]
ldr q9, [sp, #80] // 16-byte Folded Reload
fmla v2.2d, v1.2d, v6.2d
ldp q1, q4, [x9, #224]
fmla v0.2d, v3.2d, v5.2d
stp q2, q0, [x11, #192]
ldp q3, q5, [x10, #224]
fmla v11.2d, v13.2d, v9.2d
stp q11, q12, [x11, #160]
fmla v1.2d, v3.2d, v16.2d
fmla v4.2d, v5.2d, v16.2d
stp q1, q4, [x11, #224]
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
// adds sets the flags; the loop repeats until x12 reaches zero.
adds x12, x12, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,45 @@
// AArch64 kernel, unrolled 8x: 128 bytes (8 vectors of 2 doubles) per iteration.
//   [x19 + off] = [x20 + off] + v3 * [x21 + off]    for off = x10 + {0, 16, ..., 112}
// x10 is the running byte offset; the loop repeats until x10 == x24.
// v3 is only read inside the loop, i.e. it is a loop-invariant multiplier.
// NOTE(review): presumably a[i] = b[i] + s * c[i] with s broadcast in v3 -- the
// C source and the register setup are outside this excerpt; confirm there.
// OSACA-BEGIN
.L17:
add x0, x10, 16                 // x0  = byte offset of vector 1
ldr q23, [x20, x10]             // vector 0 of the first input (also the fmla accumulator)
ldr q24, [x21, x10]             // vector 0 of the second input
add x7, x10, 32                 // x7  = byte offset of vector 2
ldr q25, [x20, x0]
ldr q26, [x21, x0]
add x6, x10, 48                 // x6  = byte offset of vector 3
add x5, x10, 64                 // x5  = byte offset of vector 4
ldr q27, [x20, x7]
ldr q28, [x21, x7]
add x4, x10, 80                 // x4  = byte offset of vector 5
add x11, x10, 96                // x11 = byte offset of vector 6
ldr q29, [x20, x6]
ldr q30, [x21, x6]
add x2, x10, 112                // x2  = byte offset of vector 7
fmla v23.2d, v3.2d, v24.2d      // v23 += v3 * v24 (per 64-bit lane)
ldr q31, [x20, x5]
ldr q4, [x21, x5]
fmla v25.2d, v3.2d, v26.2d
ldr q2, [x20, x4]
ldr q5, [x21, x4]
fmla v27.2d, v3.2d, v28.2d
ldr q1, [x20, x11]
ldr q6, [x21, x11]
fmla v29.2d, v3.2d, v30.2d
ldr q0, [x20, x2]
ldr q7, [x21, x2]
fmla v31.2d, v3.2d, v4.2d
fmla v2.2d, v3.2d, v5.2d
fmla v1.2d, v3.2d, v6.2d
str q23, [x19, x10]             // last use of the old x10 as an offset
add x10, x10, 128               // advance to the next chunk; the stores below use the
                                // offsets precomputed in x0, x7, x6, x5, x4, x11, x2
fmla v0.2d, v3.2d, v7.2d
str q25, [x19, x0]
str q27, [x19, x7]
str q29, [x19, x6]
str q31, [x19, x5]
str q2, [x19, x4]
str q1, [x19, x11]
str q0, [x19, x2]
cmp x24, x10                    // loop until the offset reaches the bound held in x24
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# x86-64 (AT&T syntax) kernel, unrolled 8x: 128 bytes (8 xmm vectors of 2 doubles)
# per iteration.
#   off(%rbp,%rax) = %xmm3 * off(%r13,%rax) + off(%r12,%rax)    for off = 0, 16, ..., 112
# %rax is the running byte offset; the loop repeats until %rax == %rbx.
# %xmm3 is only read inside the loop, i.e. it is a loop-invariant multiplier.
# NOTE(review): presumably a[i] = b[i] + s * c[i] with s broadcast in %xmm3 -- the
# C source and the register setup are outside this excerpt; confirm there.
# OSACA-BEGIN
.L19:
vmovups 0(%r13,%rax), %xmm12            # load the 8 vectors that get multiplied by %xmm3
vmovups 16(%r13,%rax), %xmm13
vmovups 32(%r13,%rax), %xmm14
vmovups 48(%r13,%rax), %xmm15
vmovups 64(%r13,%rax), %xmm1
vmovups 80(%r13,%rax), %xmm0
vmovups 96(%r13,%rax), %xmm4
vmovups 112(%r13,%rax), %xmm5
vfmadd213pd (%r12,%rax), %xmm3, %xmm12  # 213 form: dest = %xmm3 * dest + mem
vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13
vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14
vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15
vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1
vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0
vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4
vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5
vmovups %xmm12, 0(%rbp,%rax)            # write the 8 results to the output stream
vmovups %xmm13, 16(%rbp,%rax)
vmovups %xmm14, 32(%rbp,%rax)
vmovups %xmm15, 48(%rbp,%rax)
vmovups %xmm1, 64(%rbp,%rax)
vmovups %xmm0, 80(%rbp,%rax)
vmovups %xmm4, 96(%rbp,%rax)
vmovups %xmm5, 112(%rbp,%rax)
subq $-128, %rax                        # %rax += 128; -128 fits a sign-extended 8-bit
                                        # immediate, +128 would not
cmpq %rbx, %rax                         # loop until the offset reaches the bound in %rbx
jne .L19
# OSACA-END

View File

@@ -0,0 +1,28 @@
# x86-64 (AT&T syntax) kernel, unrolled 8x: in-place scaling of 256 bytes
# (8 ymm vectors of 4 doubles) per iteration.
#   off(%rcx) = %ymm3 * off(%rcx)    for off = 0, 32, ..., 224
# %rcx is used as a moving pointer; the loop repeats until %rcx == %r15.
# %ymm3 is only read inside the loop, i.e. it is a loop-invariant multiplier.
# The movl/.byte sequences around the loop are the IACA byte markers:
# "movl $111, %ebx" opens the analysed region, "movl $222, %ebx" closes it.
# They must stay exactly as emitted.
# NOTE(review): presumably a[i] = s * a[i] with s broadcast in %ymm3 -- confirm
# against the C source, which is outside this excerpt.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmulpd (%rcx), %ymm3, %ymm12            # dest = %ymm3 * mem (load folded into the multiply)
vmulpd 32(%rcx), %ymm3, %ymm13
vmulpd 64(%rcx), %ymm3, %ymm14
vmulpd 96(%rcx), %ymm3, %ymm15
vmulpd 128(%rcx), %ymm3, %ymm0
vmulpd 160(%rcx), %ymm3, %ymm1
vmulpd 192(%rcx), %ymm3, %ymm7
vmulpd 224(%rcx), %ymm3, %ymm4
vmovupd %ymm12, (%rcx)                  # write each product back to the address it was read from
vmovupd %ymm13, 32(%rcx)
vmovupd %ymm14, 64(%rcx)
vmovupd %ymm15, 96(%rcx)
vmovupd %ymm0, 128(%rcx)
vmovupd %ymm1, 160(%rcx)
vmovupd %ymm7, 192(%rcx)
vmovupd %ymm4, 224(%rcx)
addq $256, %rcx                         # advance the pointer by one unrolled chunk
cmpq %r15, %rcx                         # loop until the pointer reaches the end address in %r15
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,17 @@
# x86-64 (AT&T syntax) kernel, unrolled 2x: in-place scaling of 128 bytes
# (2 zmm vectors of 8 doubles) per iteration.
#   off(%r13,%rax,8) = %zmm3 * off(%r13,%rax,8)    for off = 0, 64
# %rax is an index scaled by 8 in the addressing mode and advances by 16 per
# iteration; the loop continues while %rax < %r14 (unsigned compare, jb).
# %zmm3 is only read inside the loop, i.e. it is a loop-invariant multiplier.
# The movl/.byte sequences around the loop are the IACA byte markers:
# "movl $111, %ebx" opens the analysed region, "movl $222, %ebx" closes it.
# They must stay exactly as emitted.
# NOTE(review): the trailing "#75.5" annotations look like compiler-emitted
# source positions (line.column) -- confirm against the original C file.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.38: # Preds ..B1.38 ..B1.37
# Execution count [2.22e+03]
vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5
vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5
vmovupd %zmm1, (%r13,%rax,8) #75.5
vmovupd %zmm2, 64(%r13,%rax,8) #75.5
addq $16, %rax #75.5
cmpq %r14, %rax #75.5
jb ..B1.38 # Prob 82% #75.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,15 @@
// AArch64 kernel, unrolled 4x: in-place scaling of 64 bytes (4 vectors of
// 2 doubles) per iteration.
//   [x8 - 32 .. x8 + 31] *= v26    (per 64-bit lane)
// x8 points 32 bytes into the current 64-byte chunk and is advanced by the
// post-indexed stp. x9 is incremented by 1 per iteration and the loop exits
// when it becomes zero (adds sets the flags tested by b.ne).
// v26 is only read inside the loop, i.e. it is a loop-invariant multiplier.
// NOTE(review): presumably a[i] = s * a[i] with s broadcast in v26 and x9
// preset to a negative trip count -- confirm against the surrounding code.
// OSACA-BEGIN
.LBB1_32: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x8]                // upper half of the chunk: [x8], [x8 + 16]
ldp q2, q3, [x8, #-32]          // lower half of the chunk: [x8 - 32], [x8 - 16]
fmul v2.2d, v2.2d, v26.2d
fmul v3.2d, v3.2d, v26.2d
stp q2, q3, [x8, #-32]          // write the lower half back in place
fmul v0.2d, v0.2d, v26.2d
fmul v1.2d, v1.2d, v26.2d
stp q0, q1, [x8], #64           // write the upper half back, then x8 += 64 (post-index)
adds x9, x9, #1 // =1
b.ne .LBB1_32
// OSACA-END

View File

@@ -0,0 +1,31 @@
// AArch64 kernel, unrolled 8x: in-place scaling of 128 bytes (8 vectors of
// 2 doubles) per iteration.
//   [base + off] = [base + off] * v2    for off = 0, 16, ..., 112
// where base is the value of x16 on loop entry. x16 is bumped by 128 right
// after the first load, so most accesses use negative offsets from the new
// x16; vectors 0 and 2 are addressed through the copy in x17 instead.
// The loop repeats until x16 == x22.
// v2 is only read inside the loop, i.e. it is a loop-invariant multiplier.
// NOTE(review): presumably a[i] = s * a[i] with s broadcast in v2 -- confirm
// against the C source, which is outside this excerpt.
// OSACA-BEGIN
.L17:
ldr q23, [x16]                  // vector 0, read before x16 is advanced
mov x17, x16                    // x17 = base of this iteration
add x16, x16, 128               // x16 = base + 128 (start of the next chunk)
fmul v24.2d, v23.2d, v2.2d
str q24, [x17], 16              // store vector 0 at base, then x17 = base + 16 (post-index)
ldr q25, [x16, -112]            // vector 1 at base + 16
fmul v26.2d, v25.2d, v2.2d
str q26, [x16, -112]
ldr q27, [x17, 16]              // vector 2 at base + 32 (x17 is base + 16 here)
fmul v28.2d, v27.2d, v2.2d
str q28, [x17, 16]
ldr q29, [x16, -80]             // vectors 3..7 at base + 48 .. base + 112
ldr q30, [x16, -64]
ldr q31, [x16, -48]
ldr q1, [x16, -32]
ldr q0, [x16, -16]
fmul v5.2d, v29.2d, v2.2d
fmul v4.2d, v30.2d, v2.2d
fmul v3.2d, v31.2d, v2.2d
fmul v6.2d, v1.2d, v2.2d
fmul v7.2d, v0.2d, v2.2d
str q5, [x16, -80]              // write vectors 3..7 back to where they were read
str q4, [x16, -64]
str q3, [x16, -48]
str q6, [x16, -32]
str q7, [x16, -16]
cmp x22, x16                    // loop until the pointer reaches the end address in x22
bne .L17
// OSACA-END

View File

@@ -0,0 +1,22 @@
# x86-64 (AT&T syntax) kernel, unrolled 8x: in-place scaling of 128 bytes
# (8 xmm vectors of 2 doubles) per iteration.
#   base + off = %xmm3 * (base + off)    for off = 0, 16, ..., 112
# where base is the value of %r10 on loop entry. %r10 is bumped by 128 right
# after the first multiply, so every later access uses a negative offset from
# the new %r10. The loop repeats until %r10 == %r14.
# %xmm3 is only read inside the loop, i.e. it is a loop-invariant multiplier.
# NOTE(review): presumably a[i] = s * a[i] with s broadcast in %xmm3 -- confirm
# against the C source, which is outside this excerpt.
# OSACA-BEGIN
.L19:
vmulpd (%r10), %xmm3, %xmm11            # vector 0, read before %r10 is advanced
subq $-128, %r10                        # %r10 += 128; -128 fits a sign-extended 8-bit
                                        # immediate, +128 would not
vmulpd -112(%r10), %xmm3, %xmm12        # vectors 1..5 at base + 16 .. base + 80
vmulpd -96(%r10), %xmm3, %xmm13
vmulpd -80(%r10), %xmm3, %xmm14
vmulpd -64(%r10), %xmm3, %xmm15
vmulpd -48(%r10), %xmm3, %xmm0
vmovups %xmm11, -128(%r10)              # store vector 0 back at base
vmulpd -32(%r10), %xmm3, %xmm7
vmovups %xmm12, -112(%r10)
vmulpd -16(%r10), %xmm3, %xmm1
vmovups %xmm13, -96(%r10)
vmovups %xmm14, -80(%r10)
vmovups %xmm15, -64(%r10)
vmovups %xmm0, -48(%r10)
vmovups %xmm7, -32(%r10)
vmovups %xmm1, -16(%r10)
cmpq %r10, %r14                         # loop until the pointer reaches the end address in %r14
jne .L19
# OSACA-END