mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-15 16:40:05 +01:00
added example kernels
This commit is contained in:
43
examples/README.md
Normal file
43
examples/README.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Examples
|
||||
We collected sample kernels for the user to run examples with OSACA.
|
||||
The assembly files contain only the extracted and already marked kernel of code compiled for Intel Cascade Lake (CSX), AMD Zen, and Marvell ThunderX2 (TX2), but they can be run on any system that supports the ISA and is supported by OSACA.
|
||||
The compilers used were Intel Parallel Studio 19.0up05 and GNU 9.1.0 for the x86 systems, and ARM HPC Compiler for Linux version 19.2 and GNU 8.2.0 for the ARM-based TX2.
|
||||
|
||||
To analyze the kernels with OSACA, run
|
||||
```
|
||||
osaca --arch ARCH filepath
|
||||
```
|
||||
While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (`*.csx.*.s`) use the byte markers so that they can be analyzed by IACA as well.
|
||||
To analyze these kernels with IACA, run
|
||||
```
|
||||
iaca -arch SKX filepath
|
||||
```
|
||||
|
||||
------------
|
||||
The kernels will be explained briefly in the following.
|
||||
|
||||
### Copy
|
||||
```c
|
||||
double * restrict a, * restrict b;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Vector add
|
||||
|
||||
### Vector update
|
||||
|
||||
### Sum reduction
|
||||
|
||||
### DAXPY
|
||||
|
||||
### STREAM triad
|
||||
|
||||
### Schönauer triad
|
||||
|
||||
### Gauss-Seidel method
|
||||
|
||||
### Jacobi 2D
|
||||
|
||||
36
examples/add/add.s.csx.gcc.s
Normal file
36
examples/add/add.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# Vector-add kernel (add.s.csx.gcc.s), AT&T syntax, 256-bit ymm registers.
# Each iteration performs 8 loads via %r14, 8 vaddpd with a memory operand
# via %r13, and 8 stores via %r12. %rax is the byte offset shared by all
# three address streams and advances by 256 per iteration.
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
# Load eight 32-byte vectors from the %r14 stream.
vmovupd (%r14,%rax), %ymm3
|
||||
vmovupd 32(%r14,%rax), %ymm4
|
||||
vmovupd 64(%r14,%rax), %ymm6
|
||||
vmovupd 96(%r14,%rax), %ymm9
|
||||
vmovupd 128(%r14,%rax), %ymm11
|
||||
vmovupd 160(%r14,%rax), %ymm13
|
||||
vmovupd 192(%r14,%rax), %ymm15
|
||||
vmovupd 224(%r14,%rax), %ymm0
|
||||
# Add the matching vectors of the %r13 stream (memory operand) into fresh
# destination registers.
vaddpd 0(%r13,%rax), %ymm3, %ymm7
|
||||
vaddpd 32(%r13,%rax), %ymm4, %ymm5
|
||||
vaddpd 64(%r13,%rax), %ymm6, %ymm8
|
||||
vaddpd 96(%r13,%rax), %ymm9, %ymm10
|
||||
vaddpd 128(%r13,%rax), %ymm11, %ymm12
|
||||
vaddpd 160(%r13,%rax), %ymm13, %ymm14
|
||||
vaddpd 192(%r13,%rax), %ymm15, %ymm1
|
||||
vaddpd 224(%r13,%rax), %ymm0, %ymm2
|
||||
# Store the eight sums to the %r12 stream.
vmovupd %ymm7, (%r12,%rax)
|
||||
vmovupd %ymm5, 32(%r12,%rax)
|
||||
vmovupd %ymm8, 64(%r12,%rax)
|
||||
vmovupd %ymm10, 96(%r12,%rax)
|
||||
vmovupd %ymm12, 128(%r12,%rax)
|
||||
vmovupd %ymm14, 160(%r12,%rax)
|
||||
vmovupd %ymm1, 192(%r12,%rax)
|
||||
vmovupd %ymm2, 224(%r12,%rax)
|
||||
# Advance the byte offset and repeat until %rax equals %rcx.
addq $256, %rax
|
||||
cmpq %rax, %rcx
|
||||
jne .L19
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
19
examples/add/add.s.csx.icc.s
Normal file
19
examples/add/add.s.csx.icc.s
Normal file
@@ -0,0 +1,19 @@
|
||||
# Vector-add kernel (add.s.csx.icc.s), AT&T syntax, 512-bit zmm registers.
# %rax is an index that the addressing modes scale by 8; it advances by 16
# per iteration, i.e. two 64-byte vectors per stream per iteration.
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
# Load two vectors from the %rcx stream.
vmovups (%rcx,%rax,8), %zmm1 #78.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm3 #78.5
|
||||
# Add the matching vectors of the %r13 stream (memory operand).
vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
|
||||
# Store both sums to the %r14 stream.
vmovupd %zmm2, (%r14,%rax,8) #78.5
|
||||
vmovupd %zmm4, 64(%r14,%rax,8) #78.5
|
||||
# Advance the index; jb is an unsigned "below" test of %rax against %r12.
addq $16, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
91
examples/add/add.s.tx2.clang.s
Normal file
91
examples/add/add.s.tx2.clang.s
Normal file
@@ -0,0 +1,91 @@
|
||||
// Vector-add kernel (add.s.tx2.clang.s), AArch64, 128-bit q registers moved
// in pairs with ldp/stp. Each iteration reads 512 bytes from each of the x9
// and x10 streams (pair offsets -256 .. +224), adds them lane-wise as .2d
// vectors, and writes 512 bytes to the x11 stream at the same offsets.
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
// Offsets -256 and -224: load both inputs, add, store.
ldp q0, q1, [x9, #-256]
|
||||
ldp q4, q5, [x9, #-224]
|
||||
ldp q2, q3, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fadd v2.2d, v2.2d, v0.2d
|
||||
fadd v3.2d, v3.2d, v1.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fadd v0.2d, v6.2d, v4.2d
|
||||
fadd v1.2d, v7.2d, v5.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
// Offsets -192 and -160.
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-160]
|
||||
ldp q6, q7, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fadd v6.2d, v6.2d, v4.2d
|
||||
fadd v7.2d, v7.2d, v5.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
fadd v4.2d, v18.2d, v16.2d
|
||||
fadd v5.2d, v19.2d, v17.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
// Offsets -128 and -96.
ldp q16, q17, [x9, #-128]
|
||||
ldp q19, q20, [x9, #-96]
|
||||
ldp q18, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fadd v16.2d, v18.2d, v16.2d
|
||||
fadd v18.2d, v21.2d, v17.2d
|
||||
stp q16, q18, [x11, #-128]
|
||||
fadd v17.2d, v22.2d, v19.2d
|
||||
fadd v19.2d, v23.2d, v20.2d
|
||||
stp q17, q19, [x11, #-96]
|
||||
// From here on the loads for the next offsets are interleaved with the
// adds and stores of the previous ones.
ldp q20, q21, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q22, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
stp q20, q21, [x11, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v23.2d, v27.2d, v23.2d
|
||||
stp q22, q23, [x11, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q30, q31, [x10, #32]
|
||||
fadd v24.2d, v28.2d, v24.2d
|
||||
fadd v25.2d, v29.2d, v25.2d
|
||||
stp q24, q25, [x11]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q8, q10, [x10, #64]
|
||||
fadd v26.2d, v30.2d, v26.2d
|
||||
fadd v27.2d, v31.2d, v27.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fadd v28.2d, v8.2d, v28.2d
|
||||
fadd v29.2d, v10.2d, v29.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldp q8, q10, [x9, #128]
|
||||
ldp q13, q14, [x10, #128]
|
||||
// The pairs at offset 192 are loaded ahead of the pairs at offset 160.
ldp q3, q0, [x9, #192]
|
||||
ldp q1, q6, [x10, #192]
|
||||
fadd v30.2d, v11.2d, v30.2d
|
||||
fadd v31.2d, v12.2d, v31.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q11, q12, [x9, #160]
|
||||
fadd v8.2d, v13.2d, v8.2d
|
||||
fadd v10.2d, v14.2d, v10.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fadd v1.2d, v1.2d, v3.2d
|
||||
ldp q3, q4, [x9, #224]
|
||||
fadd v0.2d, v6.2d, v0.2d
|
||||
stp q1, q0, [x11, #192]
|
||||
ldp q5, q6, [x10, #224]
|
||||
fadd v11.2d, v13.2d, v11.2d
|
||||
fadd v2.2d, v14.2d, v12.2d
|
||||
stp q11, q2, [x11, #160]
|
||||
fadd v3.2d, v5.2d, v3.2d
|
||||
fadd v4.2d, v6.2d, v4.2d
|
||||
stp q3, q4, [x11, #224]
|
||||
// Bookkeeping: x8 advances by 64, the three stream pointers by 512 bytes.
add x8, x8, #64 // =64
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
// adds updates x12 and sets the flags; the loop repeats while the new x12
// is non-zero.
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/add/add.s.tx2.gcc.s
Normal file
45
examples/add/add.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// Vector-add kernel (add.s.tx2.gcc.s), AArch64, 128-bit q registers with
// register-offset addressing. x10 is the running byte offset; x0, x7, x6,
// x5, x4, x11 and x2 are set to x10+16 .. x10+112. Loads use the bases x21
// and x20, stores use the base x19. 128 bytes per stream per iteration.
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.L17:
|
||||
// Offset computation is interleaved with the loads that consume it.
add x0, x10, 16
|
||||
ldr q29, [x21, x10]
|
||||
ldr q30, [x20, x10]
|
||||
add x7, x10, 32
|
||||
ldr q31, [x21, x0]
|
||||
ldr q2, [x20, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q5, [x21, x7]
|
||||
ldr q1, [x20, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q4, [x21, x6]
|
||||
ldr q0, [x20, x6]
|
||||
add x2, x10, 112
|
||||
// Lane-wise .2d adds start while the remaining loads are still issued.
fadd v7.2d, v29.2d, v30.2d
|
||||
ldr q3, [x21, x5]
|
||||
ldr q9, [x20, x5]
|
||||
fadd v6.2d, v31.2d, v2.2d
|
||||
ldr q19, [x21, x4]
|
||||
ldr q18, [x20, x4]
|
||||
fadd v20.2d, v5.2d, v1.2d
|
||||
ldr q21, [x21, x11]
|
||||
ldr q17, [x20, x11]
|
||||
fadd v22.2d, v4.2d, v0.2d
|
||||
ldr q23, [x21, x2]
|
||||
ldr q16, [x20, x2]
|
||||
fadd v24.2d, v3.2d, v9.2d
|
||||
fadd v25.2d, v19.2d, v18.2d
|
||||
fadd v26.2d, v21.2d, v17.2d
|
||||
// The first store still uses the old x10; x10 is advanced right after it and
// the remaining stores use the offsets computed earlier in the iteration.
str q7, [x19, x10]
|
||||
add x10, x10, 128
|
||||
fadd v27.2d, v23.2d, v16.2d
|
||||
str q6, [x19, x0]
|
||||
str q20, [x19, x7]
|
||||
str q22, [x19, x6]
|
||||
str q24, [x19, x5]
|
||||
str q25, [x19, x4]
|
||||
str q26, [x19, x11]
|
||||
str q27, [x19, x2]
|
||||
// Repeat until the advanced offset x10 equals x24.
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/add/add.s.zen.gcc.s
Normal file
30
examples/add/add.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# Vector-add kernel (add.s.zen.gcc.s), AT&T syntax, 128-bit xmm registers
# with VEX-encoded (v-prefixed) instructions. Each iteration performs 8 loads
# via %r13, 8 vaddpd with a memory operand via %r12, and 8 stores via %rbp.
# %rax is the shared byte offset and advances by 128 per iteration.
# The kernel is delimited by comment-style OSACA markers.
# OSACA-BEGIN
|
||||
.L19:
|
||||
# Load eight 16-byte vectors from the %r13 stream.
vmovups 0(%r13,%rax), %xmm0
|
||||
vmovups 16(%r13,%rax), %xmm3
|
||||
vmovups 32(%r13,%rax), %xmm4
|
||||
vmovups 48(%r13,%rax), %xmm6
|
||||
vmovups 64(%r13,%rax), %xmm9
|
||||
vmovups 80(%r13,%rax), %xmm11
|
||||
vmovups 96(%r13,%rax), %xmm13
|
||||
vmovups 112(%r13,%rax), %xmm15
|
||||
# Add the matching vectors of the %r12 stream (memory operand).
vaddpd (%r12,%rax), %xmm0, %xmm7
|
||||
vaddpd 16(%r12,%rax), %xmm3, %xmm2
|
||||
vaddpd 32(%r12,%rax), %xmm4, %xmm5
|
||||
vaddpd 48(%r12,%rax), %xmm6, %xmm8
|
||||
vaddpd 64(%r12,%rax), %xmm9, %xmm10
|
||||
vaddpd 80(%r12,%rax), %xmm11, %xmm12
|
||||
vaddpd 96(%r12,%rax), %xmm13, %xmm14
|
||||
vaddpd 112(%r12,%rax), %xmm15, %xmm1
|
||||
# Store the eight sums to the %rbp stream.
vmovups %xmm7, 0(%rbp,%rax)
|
||||
vmovups %xmm2, 16(%rbp,%rax)
|
||||
vmovups %xmm5, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm1, 112(%rbp,%rax)
|
||||
# Subtracting -128 adds 128 to %rax (-128 fits a sign-extended 8-bit
# immediate, +128 does not). Repeat until %rax equals %rbx.
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/copy/copy.s.csx.gcc.s
Normal file
28
examples/copy/copy.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
# Copy kernel (copy.s.csx.gcc.s), AT&T syntax, 256-bit ymm registers.
# Each iteration loads 8 vectors via %r12 and stores them via %r13.
# %rcx is the shared byte offset and advances by 256 per iteration.
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
# Load eight 32-byte vectors from the %r12 stream.
vmovupd (%r12,%rcx), %ymm10
|
||||
vmovupd 32(%r12,%rcx), %ymm11
|
||||
vmovupd 64(%r12,%rcx), %ymm12
|
||||
vmovupd 96(%r12,%rcx), %ymm13
|
||||
vmovupd 128(%r12,%rcx), %ymm14
|
||||
vmovupd 160(%r12,%rcx), %ymm15
|
||||
vmovupd 192(%r12,%rcx), %ymm0
|
||||
vmovupd 224(%r12,%rcx), %ymm1
|
||||
# Store them unchanged to the %r13 stream at the same offsets.
vmovupd %ymm10, 0(%r13,%rcx)
|
||||
vmovupd %ymm11, 32(%r13,%rcx)
|
||||
vmovupd %ymm12, 64(%r13,%rcx)
|
||||
vmovupd %ymm13, 96(%r13,%rcx)
|
||||
vmovupd %ymm14, 128(%r13,%rcx)
|
||||
vmovupd %ymm15, 160(%r13,%rcx)
|
||||
vmovupd %ymm0, 192(%r13,%rcx)
|
||||
vmovupd %ymm1, 224(%r13,%rcx)
|
||||
# Advance the byte offset and repeat until %rcx equals %r10.
addq $256, %rcx
|
||||
cmpq %rcx, %r10
|
||||
jne .L19
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/copy/copy.s.csx.icc.s
Normal file
15
examples/copy/copy.s.csx.icc.s
Normal file
@@ -0,0 +1,15 @@
|
||||
# Copy kernel (copy.s.csx.icc.s), AT&T syntax, 512-bit zmm registers.
# One 64-byte vector is moved per iteration. %rax is an index that the
# addressing modes scale by 8; it advances by 8 per iteration.
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
# Load from the %r14 stream and store to the %r13 stream.
vmovups (%r14,%rax,8), %zmm1 #79.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #79.5
|
||||
# Advance the index; jb is an unsigned "below" test of %rax against %r12.
addq $8, %rax #79.5
|
||||
cmpq %r12, %rax #79.5
|
||||
jb ..B1.39 # Prob 82% #79.5
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
42
examples/copy/copy.s.tx2.clang.s
Normal file
42
examples/copy/copy.s.tx2.clang.s
Normal file
@@ -0,0 +1,42 @@
|
||||
// Copy kernel (copy.s.tx2.clang.s), AArch64, 128-bit q registers moved in
// pairs with ldp/stp. Each iteration copies 512 bytes from the x9 stream to
// the x10 stream (pair offsets -256 .. +224). Only q0-q3 are used, and the
// negative-offset and non-negative-offset halves are processed alternately.
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
stp q0, q1, [x10, #-256]
|
||||
stp q2, q3, [x10, #-224]
|
||||
// x8 advances by 64 per iteration; it is not used for addressing in this loop.
add x8, x8, #64 // =64
|
||||
ldp q0, q1, [x9]
|
||||
ldp q2, q3, [x9, #32]
|
||||
stp q0, q1, [x10]
|
||||
stp q2, q3, [x10, #32]
|
||||
ldp q0, q1, [x9, #-192]
|
||||
ldp q2, q3, [x9, #-160]
|
||||
stp q0, q1, [x10, #-192]
|
||||
stp q2, q3, [x10, #-160]
|
||||
ldp q0, q1, [x9, #64]
|
||||
ldp q2, q3, [x9, #96]
|
||||
stp q0, q1, [x10, #64]
|
||||
stp q2, q3, [x10, #96]
|
||||
ldp q0, q1, [x9, #-128]
|
||||
ldp q2, q3, [x9, #-96]
|
||||
stp q0, q1, [x10, #-128]
|
||||
stp q2, q3, [x10, #-96]
|
||||
ldp q0, q1, [x9, #128]
|
||||
ldp q2, q3, [x9, #160]
|
||||
stp q0, q1, [x10, #128]
|
||||
stp q2, q3, [x10, #160]
|
||||
ldp q0, q1, [x9, #-64]
|
||||
ldp q2, q3, [x9, #-32]
|
||||
stp q0, q1, [x10, #-64]
|
||||
stp q2, q3, [x10, #-32]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
// The source pointer is advanced once its last load has been issued; the
// destination pointer follows after the last store.
add x9, x9, #512 // =512
|
||||
stp q0, q1, [x10, #192]
|
||||
stp q2, q3, [x10, #224]
|
||||
add x10, x10, #512 // =512
|
||||
// adds updates x11 and sets the flags; the loop repeats while the new x11
// is non-zero.
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
29
examples/copy/copy.s.tx2.gcc.s
Normal file
29
examples/copy/copy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copy kernel (copy.s.tx2.gcc.s), AArch64, 128-bit q registers with
// register-offset addressing. x15 is the running byte offset; x16, x30, x17,
// x18, x1, x3 and x2 are set to x15+16 .. x15+112. Loads use the base x19,
// stores use the base x20. 128 bytes are copied per iteration.
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.L17:
|
||||
// Offset computation is interleaved with the loads that consume it.
add x16, x15, 16
|
||||
ldr q9, [x19, x15]
|
||||
add x30, x15, 32
|
||||
add x17, x15, 48
|
||||
ldr q16, [x19, x16]
|
||||
ldr q18, [x19, x30]
|
||||
add x18, x15, 64
|
||||
add x1, x15, 80
|
||||
ldr q17, [x19, x17]
|
||||
ldr q19, [x19, x18]
|
||||
add x3, x15, 96
|
||||
add x2, x15, 112
|
||||
ldr q20, [x19, x1]
|
||||
ldr q21, [x19, x3]
|
||||
// The first store still uses the old x15; x15 is advanced after the last
// load and the remaining stores use the offsets computed above.
str q9, [x20, x15]
|
||||
ldr q22, [x19, x2]
|
||||
add x15, x15, 128
|
||||
str q16, [x20, x16]
|
||||
str q18, [x20, x30]
|
||||
str q17, [x20, x17]
|
||||
str q19, [x20, x18]
|
||||
str q20, [x20, x1]
|
||||
str q21, [x20, x3]
|
||||
str q22, [x20, x2]
|
||||
// Repeat until the advanced offset x15 equals x23.
cmp x23, x15
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/copy/copy.s.zen.gcc.s
Normal file
22
examples/copy/copy.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# Copy kernel (copy.s.zen.gcc.s), AT&T syntax, 128-bit xmm registers with
# VEX-encoded (v-prefixed) moves. Each iteration loads 8 vectors via %rbp and
# stores them via %r12. %r10 is the shared byte offset and advances by 128.
# The kernel is delimited by comment-style OSACA markers.
# OSACA-BEGIN
|
||||
.L19:
|
||||
# Load eight 16-byte vectors from the %rbp stream.
vmovups 0(%rbp,%r10), %xmm9
|
||||
vmovups 16(%rbp,%r10), %xmm10
|
||||
vmovups 32(%rbp,%r10), %xmm11
|
||||
vmovups 48(%rbp,%r10), %xmm12
|
||||
vmovups 64(%rbp,%r10), %xmm13
|
||||
vmovups 80(%rbp,%r10), %xmm14
|
||||
vmovups 96(%rbp,%r10), %xmm15
|
||||
vmovups 112(%rbp,%r10), %xmm0
|
||||
# Store them unchanged to the %r12 stream at the same offsets.
vmovups %xmm9, (%r12,%r10)
|
||||
vmovups %xmm10, 16(%r12,%r10)
|
||||
vmovups %xmm11, 32(%r12,%r10)
|
||||
vmovups %xmm12, 48(%r12,%r10)
|
||||
vmovups %xmm13, 64(%r12,%r10)
|
||||
vmovups %xmm14, 80(%r12,%r10)
|
||||
vmovups %xmm15, 96(%r12,%r10)
|
||||
vmovups %xmm0, 112(%r12,%r10)
|
||||
# Subtracting -128 adds 128 to %r10 (-128 fits a sign-extended 8-bit
# immediate, +128 does not). Repeat until %r10 equals %r15.
subq $-128, %r10
|
||||
cmpq %r10, %r15
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# DAXPY kernel (daxpy.s.csx.gcc.s), AT&T syntax, 256-bit ymm registers.
# Each iteration loads 8 vectors via %r13, applies vfmadd213pd with %ymm6 and
# a memory operand via %r12, and stores the results back to the same %r12
# locations. %rsi is the shared byte offset and advances by 256.
# %ymm6 is never written inside the loop (presumably the broadcast scalar
# factor - confirm against the surrounding code, which is not shown here).
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
# Load eight 32-byte vectors from the %r13 stream.
vmovupd 0(%r13,%rsi), %ymm14
|
||||
vmovupd 32(%r13,%rsi), %ymm15
|
||||
vmovupd 64(%r13,%rsi), %ymm1
|
||||
vmovupd 96(%r13,%rsi), %ymm0
|
||||
vmovupd 128(%r13,%rsi), %ymm3
|
||||
vmovupd 160(%r13,%rsi), %ymm4
|
||||
vmovupd 192(%r13,%rsi), %ymm5
|
||||
vmovupd 224(%r13,%rsi), %ymm7
|
||||
# vfmadd213pd mem, %ymm6, dst computes dst = %ymm6 * dst + mem, so each
# loaded vector is scaled by %ymm6 and added to the %r12 stream value.
vfmadd213pd (%r12,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7
|
||||
# Write the results back over the %r12 stream (in-place update).
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
# Advance the byte offset and repeat until %rsi equals %r10.
addq $256, %rsi
|
||||
cmpq %rsi, %r10
|
||||
jne .L19
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
# DAXPY kernel (daxpy.s.csx.icc.s), AT&T syntax, 512-bit zmm registers.
# One 64-byte vector is processed per iteration. %rax is an index that the
# addressing modes scale by 8; it advances by 8 per iteration.
# %zmm2 is never written inside the loop (presumably the broadcast scalar
# factor - confirm against the surrounding code, which is not shown here).
# IACA start marker: movl $111, %ebx followed by the bytes 100, 103, 144.
# NOTE(review): the marker writes %ebx, and this loop reads %rbx as its bound
# (cmpq %rbx, %rax). That is harmless for static analysis but would change
# the trip count if the marked code were actually executed.
movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
# Load one vector from the %r13 stream.
vmovups (%r13,%rax,8), %zmm1 #77.5
|
||||
# vfmadd213pd mem, %zmm2, %zmm1 computes %zmm1 = %zmm2 * %zmm1 + mem.
vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5
|
||||
# Write the result back over the %r14 stream (in-place update).
vmovupd %zmm1, (%r14,%rax,8) #77.5
|
||||
# Advance the index; jb is an unsigned "below" test of %rax against %rbx.
addq $8, %rax #77.5
|
||||
cmpq %rbx, %rax #77.5
|
||||
jb ..B1.39 # Prob 82% #77.5
|
||||
# IACA end marker: movl $222, %ebx followed by the same three bytes.
movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     100          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     103          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte     144          # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
@@ -0,0 +1,90 @@
|
||||
// DAXPY kernel (daxpy.s.tx2.clang.s), AArch64, 128-bit q registers moved in
// pairs with ldp/stp. Each iteration updates 512 bytes of the x9 stream in
// place (pair offsets -256 .. +224): values are loaded from x9 and x10,
// combined with fmla, and stored back to x9.
// fmla vD.2d, vN.2d, v31.2d computes vD += vN * v31 per lane. v31 is never
// written inside the loop (presumably the broadcast scalar factor - confirm
// against the surrounding code, which is not shown here).
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
// Offsets -256 and -224: load both streams, accumulate, store back to x9.
ldp q1, q2, [x9, #-256]
|
||||
ldp q3, q0, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v1.2d, v4.2d, v31.2d
|
||||
fmla v2.2d, v5.2d, v31.2d
|
||||
stp q1, q2, [x9, #-256]
|
||||
fmla v3.2d, v6.2d, v31.2d
|
||||
fmla v0.2d, v7.2d, v31.2d
|
||||
stp q3, q0, [x9, #-224]
|
||||
// Offsets -192 and -160.
ldp q5, q6, [x9, #-192]
|
||||
ldp q7, q4, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v5.2d, v16.2d, v31.2d
|
||||
fmla v6.2d, v17.2d, v31.2d
|
||||
stp q5, q6, [x9, #-192]
|
||||
fmla v7.2d, v18.2d, v31.2d
|
||||
fmla v4.2d, v19.2d, v31.2d
|
||||
stp q7, q4, [x9, #-160]
|
||||
// Offsets -128 and -96; the loads for offset -64 are issued before these
// results are stored.
ldp q19, q18, [x9, #-128]
|
||||
ldp q16, q17, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v18.2d, v21.2d, v31.2d
|
||||
fmla v16.2d, v22.2d, v31.2d
|
||||
ldp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v19.2d, v20.2d, v31.2d
|
||||
stp q19, q18, [x9, #-128]
|
||||
fmla v17.2d, v23.2d, v31.2d
|
||||
stp q16, q17, [x9, #-96]
|
||||
ldp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v21.2d, v24.2d, v31.2d
|
||||
fmla v22.2d, v25.2d, v31.2d
|
||||
stp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fmla v23.2d, v26.2d, v31.2d
|
||||
fmla v20.2d, v27.2d, v31.2d
|
||||
stp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9]
|
||||
ldp q28, q29, [x10, #32]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #32]
|
||||
ldp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #64]
|
||||
ldp q26, q27, [x9, #96]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #96]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #96]
|
||||
ldp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x10, #128]
|
||||
fmla v24.2d, v26.2d, v31.2d
|
||||
fmla v25.2d, v27.2d, v31.2d
|
||||
stp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x9, #160]
|
||||
ldp q1, q2, [x10, #160]
|
||||
fmla v26.2d, v1.2d, v31.2d
|
||||
fmla v27.2d, v2.2d, v31.2d
|
||||
stp q26, q27, [x9, #160]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x10, #192]
|
||||
fmla v0.2d, v2.2d, v31.2d
|
||||
fmla v1.2d, v3.2d, v31.2d
|
||||
stp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
ldp q4, q5, [x10, #224]
|
||||
fmla v2.2d, v4.2d, v31.2d
|
||||
fmla v3.2d, v5.2d, v31.2d
|
||||
stp q2, q3, [x9, #224]
|
||||
// Bookkeeping: x8 advances by 64, both stream pointers by 512 bytes.
add x8, x8, #64 // =64
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
// adds updates x11 and sets the flags; the loop repeats while the new x11
// is non-zero.
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,41 @@
|
||||
// DAXPY kernel (daxpy.s.tx2.gcc.s), AArch64, 128-bit q registers.
// x10 points into the stream that is updated in place, x3 into the stream
// that is only read; both advance by 128 bytes per iteration. x5 and x6 are
// per-iteration copies of the old x3 and x10 that are stepped with
// post-indexed accesses.
// fmla vD.2d, v3.2d, vM.2d computes vD += v3 * vM per lane. v3 is never
// written inside the loop (presumably the broadcast scalar factor - confirm
// against the surrounding code, which is not shown here).
// The kernel is delimited by comment-style OSACA markers.
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x5, x3
|
||||
ldr q23, [x10]
|
||||
// Post-indexed load: reads from x5, then x5 += 16.
ldr q24, [x5], 16
|
||||
mov x6, x10
|
||||
ldr q25, [x3, 16]
|
||||
ldr q26, [x3, 48]
|
||||
// Both pointers are advanced early; the accesses below therefore address
// this iteration's data with negative offsets from x10 and x3.
add x10, x10, 128
|
||||
add x3, x3, 128
|
||||
ldr q27, [x3, -64]
|
||||
ldr q28, [x3, -48]
|
||||
ldr q29, [x3, -32]
|
||||
ldr q30, [x3, -16]
|
||||
fmla v23.2d, v3.2d, v24.2d
|
||||
// x5 is the old x3 plus 16 here, so this reads the old x3 plus 32.
ldr q31, [x5, 16]
|
||||
// Post-indexed store: writes to x6 (the old x10), then x6 += 16.
str q23, [x6], 16
|
||||
ldr q0, [x10, -112]
|
||||
fmla v0.2d, v3.2d, v25.2d
|
||||
str q0, [x10, -112]
|
||||
// x6 is the old x10 plus 16 here, so this addresses the old x10 plus 32.
ldr q2, [x6, 16]
|
||||
fmla v2.2d, v3.2d, v31.2d
|
||||
str q2, [x6, 16]
|
||||
// Remaining five vectors: grouped loads, then fmla, then stores.
ldr q5, [x10, -80]
|
||||
ldr q4, [x10, -64]
|
||||
ldr q6, [x10, -48]
|
||||
ldr q1, [x10, -32]
|
||||
ldr q7, [x10, -16]
|
||||
fmla v5.2d, v3.2d, v26.2d
|
||||
fmla v4.2d, v3.2d, v27.2d
|
||||
fmla v6.2d, v3.2d, v28.2d
|
||||
fmla v1.2d, v3.2d, v29.2d
|
||||
fmla v7.2d, v3.2d, v30.2d
|
||||
str q5, [x10, -80]
|
||||
str q4, [x10, -64]
|
||||
str q6, [x10, -48]
|
||||
str q1, [x10, -32]
|
||||
str q7, [x10, -16]
|
||||
// Repeat until the advanced pointer x10 equals x23.
cmp x23, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# DAXPY kernel (daxpy.s.zen.gcc.s), AT&T syntax, 128-bit xmm registers.
# Each iteration loads 8 vectors via %r12, applies vfmadd213pd with %xmm3 and
# a memory operand via %rbp, and stores the results back to the same %rbp
# locations. %rax is the shared byte offset and advances by 128.
# %xmm3 is never written inside the loop (presumably the broadcast scalar
# factor - confirm against the surrounding code, which is not shown here).
# The kernel is delimited by comment-style OSACA markers.
# OSACA-BEGIN
|
||||
.L19:
|
||||
# Load eight 16-byte vectors from the %r12 stream.
vmovups (%r12,%rax), %xmm12
|
||||
vmovups 16(%r12,%rax), %xmm13
|
||||
vmovups 32(%r12,%rax), %xmm14
|
||||
vmovups 48(%r12,%rax), %xmm15
|
||||
vmovups 64(%r12,%rax), %xmm1
|
||||
vmovups 80(%r12,%rax), %xmm0
|
||||
vmovups 96(%r12,%rax), %xmm4
|
||||
vmovups 112(%r12,%rax), %xmm5
|
||||
# vfmadd213pd mem, %xmm3, dst computes dst = %xmm3 * dst + mem, so each
# loaded vector is scaled by %xmm3 and added to the %rbp stream value.
vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5
|
||||
# Write the results back over the %rbp stream (in-place update).
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
# Subtracting -128 adds 128 to %rax (-128 fits a sign-extended 8-bit
# immediate, +128 does not). Repeat until %rax equals %r15.
subq $-128, %rax
|
||||
cmpq %r15, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
1144
examples/gs/gs.s.csx.gcc.s
Normal file
1144
examples/gs/gs.s.csx.gcc.s
Normal file
File diff suppressed because it is too large
Load Diff
1123
examples/gs/gs.s.csx.icc.s
Normal file
1123
examples/gs/gs.s.csx.icc.s
Normal file
File diff suppressed because it is too large
Load Diff
1194
examples/gs/gs.s.tx2.clang.s
Normal file
1194
examples/gs/gs.s.tx2.clang.s
Normal file
File diff suppressed because it is too large
Load Diff
737
examples/gs/gs.s.tx2.gcc.s
Normal file
737
examples/gs/gs.s.tx2.gcc.s
Normal file
@@ -0,0 +1,737 @@
|
||||
.arch armv8.1-a+crypto+crc
|
||||
.file "gs.f90"
|
||||
.text
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.type MAIN__, %function
|
||||
MAIN__:
|
||||
.LFB0:
|
||||
.cfi_startproc
|
||||
sub sp, sp, #720
|
||||
.cfi_def_cfa_offset 720
|
||||
mov x0, 128
|
||||
mov w1, 12
|
||||
stp x29, x30, [sp]
|
||||
.cfi_offset 29, -720
|
||||
.cfi_offset 30, -712
|
||||
mov x29, sp
|
||||
movk x0, 0x5, lsl 32
|
||||
stp x19, x20, [sp, 16]
|
||||
.cfi_offset 19, -704
|
||||
.cfi_offset 20, -696
|
||||
adrp x19, .LC0
|
||||
add x19, x19, :lo12:.LC0
|
||||
stp x21, x22, [sp, 32]
|
||||
stp x0, x19, [sp, 192]
|
||||
add x0, sp, 192
|
||||
stp x23, x24, [sp, 48]
|
||||
stp x25, x26, [sp, 64]
|
||||
stp x27, x28, [sp, 80]
|
||||
str w1, [sp, 208]
|
||||
.cfi_offset 21, -688
|
||||
.cfi_offset 22, -680
|
||||
.cfi_offset 23, -672
|
||||
.cfi_offset 24, -664
|
||||
.cfi_offset 25, -656
|
||||
.cfi_offset 26, -648
|
||||
.cfi_offset 27, -640
|
||||
.cfi_offset 28, -632
|
||||
bl _gfortran_st_read
|
||||
mov w2, 4
|
||||
add x1, sp, 144
|
||||
add x0, sp, 192
|
||||
bl _gfortran_transfer_integer
|
||||
mov w2, 4
|
||||
add x1, sp, 148
|
||||
add x0, sp, 192
|
||||
bl _gfortran_transfer_integer
|
||||
add x0, sp, 192
|
||||
bl _gfortran_st_read_done
|
||||
ldp w24, w23, [sp, 144]
|
||||
mov x3, -1
|
||||
mov x5, 4611686018427387904
|
||||
mov x2, 2305843009213693951
|
||||
sxtw x25, w24
|
||||
sxtw x20, w23
|
||||
cmp x25, 0
|
||||
csel x21, x25, x3, ge
|
||||
cmp x20, 0
|
||||
csel x4, x20, x3, ge
|
||||
add x21, x21, 1
|
||||
add x6, x4, 1
|
||||
mul x26, x6, x21
|
||||
cmp x26, x5
|
||||
lsl x27, x26, 1
|
||||
lsl x7, x26, 4
|
||||
cset w8, eq
|
||||
cmp x27, x2
|
||||
cinc w9, w8, gt
|
||||
cmp x25, 0
|
||||
ccmp x20, 0, 1, ge
|
||||
csel x10, x7, xzr, ge
|
||||
cbnz w9, .L159
|
||||
cmp x10, 0
|
||||
mov x28, 1
|
||||
csel x0, x10, x28, ne
|
||||
bl malloc
|
||||
stp d8, d9, [sp, 96]
|
||||
.cfi_offset 73, -616
|
||||
.cfi_offset 72, -624
|
||||
cbz x0, .L160
|
||||
cmp w23, 1
|
||||
ble .L5
|
||||
cmp w24, 1
|
||||
ble .L6
|
||||
sub w12, w24, #2
|
||||
sub x4, x27, x26
|
||||
lsl x22, x21, 3
|
||||
mov w8, w28
|
||||
add x13, x21, x12
|
||||
mvn x14, x12
|
||||
add x10, x4, x21
|
||||
mov x6, x12
|
||||
add x15, x0, x13, lsl 3
|
||||
lsl x17, x14, 3
|
||||
mov x9, x21
|
||||
add x5, x15, 16
|
||||
.L10:
|
||||
add x1, x17, x5
|
||||
sub x18, x10, x9
|
||||
sub x16, x5, x1
|
||||
sub x30, x16, #8
|
||||
lsr x3, x30, 3
|
||||
add x2, x3, 1
|
||||
ands x7, x2, 7
|
||||
beq .L7
|
||||
cmp x7, 1
|
||||
beq .L104
|
||||
cmp x7, 2
|
||||
beq .L105
|
||||
cmp x7, 3
|
||||
beq .L106
|
||||
cmp x7, 4
|
||||
beq .L107
|
||||
cmp x7, 5
|
||||
beq .L108
|
||||
cmp x7, 6
|
||||
beq .L109
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L109:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L108:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L107:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L106:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L105:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
.L104:
|
||||
str xzr, [x1]
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x1, x1, 8
|
||||
cmp x1, x5
|
||||
beq .L155
|
||||
.L7:
|
||||
str xzr, [x1]
|
||||
add x28, x1, 8
|
||||
add x16, x1, 16
|
||||
add x15, x1, 24
|
||||
str xzr, [x1, x18, lsl 3]
|
||||
add x14, x1, 32
|
||||
add x13, x1, 40
|
||||
add x12, x1, 48
|
||||
str xzr, [x1, 8]
|
||||
add x11, x1, 56
|
||||
add x1, x1, 64
|
||||
str xzr, [x28, x18, lsl 3]
|
||||
str xzr, [x1, -48]
|
||||
str xzr, [x16, x18, lsl 3]
|
||||
str xzr, [x1, -40]
|
||||
str xzr, [x15, x18, lsl 3]
|
||||
str xzr, [x1, -32]
|
||||
str xzr, [x14, x18, lsl 3]
|
||||
str xzr, [x1, -24]
|
||||
str xzr, [x13, x18, lsl 3]
|
||||
str xzr, [x1, -16]
|
||||
str xzr, [x12, x18, lsl 3]
|
||||
str xzr, [x1, -8]
|
||||
str xzr, [x11, x18, lsl 3]
|
||||
cmp x1, x5
|
||||
bne .L7
|
||||
.L155:
|
||||
add w8, w8, 1
|
||||
add x10, x10, x21
|
||||
add x9, x9, x21
|
||||
add x5, x5, x22
|
||||
cmp w23, w8
|
||||
bne .L10
|
||||
.L9:
|
||||
mul x20, x21, x20
|
||||
fmov d0, 1.0e+0
|
||||
sub x17, x26, x27
|
||||
and w18, w24, 7
|
||||
mov x2, 1
|
||||
add x30, x4, x20
|
||||
neg x3, x20, lsl 3
|
||||
add x7, x0, x30, lsl 3
|
||||
str d0, [x7, x17, lsl 3]
|
||||
add x1, x7, 8
|
||||
str d0, [x7]
|
||||
str xzr, [x0]
|
||||
str xzr, [x7, x3]
|
||||
cmp w24, 1
|
||||
blt .L151
|
||||
cbz w18, .L13
|
||||
cmp w18, 1
|
||||
beq .L119
|
||||
cmp w18, 2
|
||||
beq .L120
|
||||
cmp w18, 3
|
||||
beq .L121
|
||||
cmp w18, 4
|
||||
beq .L122
|
||||
cmp w18, 5
|
||||
beq .L123
|
||||
cmp w18, 6
|
||||
beq .L124
|
||||
str d0, [x1, x17, lsl 3]
|
||||
mov x2, 2
|
||||
str d0, [x1]
|
||||
str xzr, [x0, 8]
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L124:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L123:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L122:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L121:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L120:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
.L119:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
str d0, [x1]
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x2, x2, 1
|
||||
str xzr, [x1, x3]
|
||||
add x1, x1, 8
|
||||
cmp w24, w2
|
||||
blt .L151
|
||||
.L13:
|
||||
str d0, [x1, x17, lsl 3]
|
||||
add x28, x1, 8
|
||||
add x15, x2, 1
|
||||
add x16, x1, 16
|
||||
str d0, [x1]
|
||||
add x13, x2, 2
|
||||
add x14, x1, 24
|
||||
add x12, x2, 3
|
||||
str xzr, [x0, x2, lsl 3]
|
||||
add x9, x1, 32
|
||||
add x4, x2, 4
|
||||
add x8, x1, 40
|
||||
str xzr, [x1, x3]
|
||||
add x11, x2, 5
|
||||
add x5, x1, 48
|
||||
add x10, x2, 6
|
||||
str d0, [x28, x17, lsl 3]
|
||||
add x20, x1, 56
|
||||
add x18, x2, 7
|
||||
add x2, x2, 8
|
||||
str d0, [x1, 8]
|
||||
add x1, x1, 64
|
||||
str xzr, [x0, x15, lsl 3]
|
||||
str xzr, [x28, x3]
|
||||
str d0, [x16, x17, lsl 3]
|
||||
str d0, [x1, -48]
|
||||
str xzr, [x0, x13, lsl 3]
|
||||
str xzr, [x16, x3]
|
||||
str d0, [x14, x17, lsl 3]
|
||||
str d0, [x1, -40]
|
||||
str xzr, [x0, x12, lsl 3]
|
||||
str xzr, [x14, x3]
|
||||
str d0, [x9, x17, lsl 3]
|
||||
str d0, [x1, -32]
|
||||
str xzr, [x0, x4, lsl 3]
|
||||
str xzr, [x9, x3]
|
||||
str d0, [x8, x17, lsl 3]
|
||||
str d0, [x1, -24]
|
||||
str xzr, [x0, x11, lsl 3]
|
||||
str xzr, [x8, x3]
|
||||
str d0, [x5, x17, lsl 3]
|
||||
str d0, [x1, -16]
|
||||
str xzr, [x0, x10, lsl 3]
|
||||
str xzr, [x5, x3]
|
||||
str d0, [x20, x17, lsl 3]
|
||||
str d0, [x1, -8]
|
||||
str xzr, [x0, x18, lsl 3]
|
||||
str xzr, [x20, x3]
|
||||
cmp w24, w2
|
||||
bge .L13
|
||||
.L151:
|
||||
cmp w24, 0
|
||||
csel w17, w24, wzr, ge
|
||||
add w11, w17, 1
|
||||
.L8:
|
||||
tbnz w23, #31, .L11
|
||||
.L12:
|
||||
scvtf d2, w11
|
||||
scvtf d1, w24
|
||||
sub x30, x27, x26
|
||||
sub x25, x25, x26
|
||||
add x26, x25, x26
|
||||
add x27, x25, x27
|
||||
mov w3, 1
|
||||
and w7, w23, 7
|
||||
add x2, x0, x22
|
||||
fdiv d3, d2, d1
|
||||
str d3, [x0]
|
||||
str d3, [x0, x30, lsl 3]
|
||||
str d3, [x0, x26, lsl 3]
|
||||
str d3, [x0, x27, lsl 3]
|
||||
cmp w23, w3
|
||||
blt .L11
|
||||
cbz w7, .L15
|
||||
cmp w7, 1
|
||||
beq .L113
|
||||
cmp w7, 2
|
||||
beq .L114
|
||||
cmp w7, 3
|
||||
beq .L115
|
||||
cmp w7, 4
|
||||
beq .L116
|
||||
cmp w7, 5
|
||||
beq .L117
|
||||
cmp w7, 6
|
||||
beq .L118
|
||||
str d3, [x2]
|
||||
mov w3, 2
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L118:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L117:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L116:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L115:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L114:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
.L113:
|
||||
str d3, [x2]
|
||||
add w3, w3, 1
|
||||
str d3, [x2, x30, lsl 3]
|
||||
str d3, [x2, x26, lsl 3]
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x2, x2, x22
|
||||
cmp w23, w3
|
||||
blt .L11
|
||||
.L15:
|
||||
str d3, [x2]
|
||||
add x1, x2, x22
|
||||
add w3, w3, 8
|
||||
str d3, [x2, x30, lsl 3]
|
||||
add x28, x1, x22
|
||||
str d3, [x2, x26, lsl 3]
|
||||
add x15, x28, x22
|
||||
str d3, [x2, x27, lsl 3]
|
||||
add x14, x15, x22
|
||||
str d3, [x1]
|
||||
add x16, x14, x22
|
||||
str d3, [x1, x30, lsl 3]
|
||||
add x13, x16, x22
|
||||
str d3, [x1, x26, lsl 3]
|
||||
add x12, x13, x22
|
||||
str d3, [x1, x27, lsl 3]
|
||||
add x2, x12, x22
|
||||
str d3, [x28]
|
||||
str d3, [x28, x30, lsl 3]
|
||||
str d3, [x28, x26, lsl 3]
|
||||
str d3, [x28, x27, lsl 3]
|
||||
str d3, [x15]
|
||||
str d3, [x15, x30, lsl 3]
|
||||
str d3, [x15, x26, lsl 3]
|
||||
str d3, [x15, x27, lsl 3]
|
||||
str d3, [x14]
|
||||
str d3, [x14, x30, lsl 3]
|
||||
str d3, [x14, x26, lsl 3]
|
||||
str d3, [x14, x27, lsl 3]
|
||||
str d3, [x16]
|
||||
str d3, [x16, x30, lsl 3]
|
||||
str d3, [x16, x26, lsl 3]
|
||||
str d3, [x16, x27, lsl 3]
|
||||
str d3, [x13]
|
||||
str d3, [x13, x30, lsl 3]
|
||||
str d3, [x13, x26, lsl 3]
|
||||
str d3, [x13, x27, lsl 3]
|
||||
str d3, [x12]
|
||||
str d3, [x12, x30, lsl 3]
|
||||
str d3, [x12, x26, lsl 3]
|
||||
str d3, [x12, x27, lsl 3]
|
||||
cmp w23, w3
|
||||
bge .L15
|
||||
.L11:
|
||||
add x6, x21, x6, uxtw
|
||||
adrp x4, .LC6
|
||||
add x9, x22, 8
|
||||
fmov d9, 2.5e-1
|
||||
ldr d8, [x4, #:lo12:.LC6]
|
||||
add x27, x0, x9
|
||||
mov w20, 51711
|
||||
add x0, x0, x6, lsl 3
|
||||
lsl x28, x21, 1
|
||||
mov w26, 10
|
||||
movk w20, 0x3b9a, lsl 16
|
||||
add x25, x0, 16
|
||||
.L14:
|
||||
add x0, sp, 176
|
||||
add x1, sp, 160
|
||||
lsl w26, w26, 1
|
||||
bl timing_
|
||||
mov w0, 0
|
||||
.p2align 4
|
||||
.L18:
|
||||
cmp w23, 1
|
||||
ble .L21
|
||||
cmp w24, 1
|
||||
ble .L21
|
||||
mov x11, 0
|
||||
mov w10, 1
|
||||
mov x7, x25
|
||||
mov x9, x28
|
||||
mov x8, x21
|
||||
mov x6, x27
|
||||
.p2align 4
|
||||
.L22:
|
||||
sub x5, x7, x6
|
||||
add w10, w10, 1
|
||||
mov x15, x6
|
||||
sub x18, x11, x8
|
||||
sub x17, x5, #8
|
||||
sub x30, x9, x8
|
||||
ldr d30, [x6, -8]
|
||||
lsr x3, x17, 3
|
||||
add x2, x3, 1
|
||||
ands x1, x2, 3
|
||||
beq .L20
|
||||
cmp x1, 1
|
||||
beq .L111
|
||||
cmp x1, 2
|
||||
beq .L112
|
||||
ldr d4, [x6, x18, lsl 3]
|
||||
ldr d6, [x6, 8]
|
||||
ldr d5, [x6, x30, lsl 3]
|
||||
fadd d7, d4, d6
|
||||
fadd d16, d7, d30
|
||||
fadd d17, d16, d5
|
||||
fmul d30, d17, d9
|
||||
str d30, [x15], 8
|
||||
.L112:
|
||||
ldr d18, [x15, x18, lsl 3]
|
||||
ldr d20, [x15, 8]
|
||||
ldr d19, [x15, x30, lsl 3]
|
||||
fadd d21, d18, d20
|
||||
fadd d22, d21, d30
|
||||
fadd d23, d22, d19
|
||||
fmul d30, d23, d9
|
||||
str d30, [x15], 8
|
||||
.L111:
|
||||
ldr d24, [x15, x18, lsl 3]
|
||||
ldr d26, [x15, 8]
|
||||
ldr d25, [x15, x30, lsl 3]
|
||||
fadd d27, d24, d26
|
||||
fadd d28, d27, d30
|
||||
fadd d29, d28, d25
|
||||
fmul d30, d29, d9
|
||||
str d30, [x15], 8
|
||||
cmp x7, x15
|
||||
beq .L154
|
||||
// OSACA-BEGIN
|
||||
.L20:
|
||||
ldr d31, [x15, x18, lsl 3]
|
||||
ldr d0, [x15, 8]
|
||||
mov x14, x15
|
||||
add x16, x15, 24
|
||||
ldr d2, [x15, x30, lsl 3]
|
||||
add x15, x15, 32
|
||||
fadd d1, d31, d0
|
||||
fadd d3, d1, d30
|
||||
fadd d4, d3, d2
|
||||
fmul d5, d4, d9
|
||||
str d5, [x14], 8
|
||||
ldr d6, [x14, x18, lsl 3]
|
||||
ldr d16, [x14, 8]
|
||||
add x13, x14, 8
|
||||
ldr d7, [x14, x30, lsl 3]
|
||||
fadd d17, d6, d16
|
||||
fadd d18, d17, d5
|
||||
fadd d19, d18, d7
|
||||
fmul d20, d19, d9
|
||||
str d20, [x15, -24]
|
||||
ldr d21, [x13, x18, lsl 3]
|
||||
ldr d23, [x14, 16]
|
||||
ldr d22, [x13, x30, lsl 3]
|
||||
fadd d24, d21, d23
|
||||
fadd d25, d24, d20
|
||||
fadd d26, d25, d22
|
||||
fmul d27, d26, d9
|
||||
str d27, [x14, 8]
|
||||
ldr d30, [x15]
|
||||
ldr d28, [x16, x18, lsl 3]
|
||||
ldr d29, [x16, x30, lsl 3]
|
||||
fadd d31, d28, d30
|
||||
fadd d2, d31, d27
|
||||
fadd d0, d2, d29
|
||||
fmul d30, d0, d9
|
||||
str d30, [x15, -8]
|
||||
cmp x7, x15
|
||||
bne .L20
|
||||
// OSACA-END
|
||||
.L154:
|
||||
add x6, x6, x22
|
||||
add x11, x11, x21
|
||||
add x8, x8, x21
|
||||
add x9, x9, x21
|
||||
add x7, x7, x22
|
||||
cmp w23, w10
|
||||
bne .L22
|
||||
.L21:
|
||||
add w4, w0, 1
|
||||
cmp w26, w4
|
||||
beq .L17
|
||||
mov w0, w4
|
||||
b .L18
|
||||
.L17:
|
||||
add w12, w0, 2
|
||||
add x1, sp, 152
|
||||
add x0, sp, 168
|
||||
str w12, [sp, 124]
|
||||
str w12, [sp, 140]
|
||||
bl timing_
|
||||
ldp d3, d1, [sp, 168]
|
||||
ldr w5, [sp, 124]
|
||||
fsub d4, d3, d1
|
||||
fcmpe d4, d8
|
||||
ccmp w26, w20, 0, lt
|
||||
ble .L14
|
||||
cmp w5, w26
|
||||
ble .L23
|
||||
str w26, [sp, 140]
|
||||
.L23:
|
||||
mov x21, 128
|
||||
add x0, sp, 192
|
||||
mov w22, 72
|
||||
movk x21, 0x6, lsl 32
|
||||
str w22, [sp, 208]
|
||||
sub w24, w24, #1
|
||||
sub w23, w23, #1
|
||||
stp x21, x19, [sp, 192]
|
||||
bl _gfortran_st_write
|
||||
adrp x19, .LANCHOR0
|
||||
adrp x27, .LC7
|
||||
add x28, x19, :lo12:.LANCHOR0
|
||||
mov x2, 14
|
||||
add x0, sp, 192
|
||||
mov x1, x28
|
||||
bl _gfortran_transfer_character_write
|
||||
mov w2, 4
|
||||
add x1, sp, 140
|
||||
add x0, sp, 192
|
||||
bl _gfortran_transfer_integer_write
|
||||
add x1, x28, 16
|
||||
mov x2, 14
|
||||
add x0, sp, 192
|
||||
bl _gfortran_transfer_character_write
|
||||
ldr w25, [sp, 140]
|
||||
scvtf d9, w24
|
||||
scvtf d8, w23
|
||||
ldr d5, [x27, #:lo12:.LC7]
|
||||
ldp d18, d19, [sp, 168]
|
||||
mov w2, 8
|
||||
add x1, sp, 184
|
||||
add x0, sp, 192
|
||||
scvtf d7, w25
|
||||
fsub d20, d18, d19
|
||||
fmul d6, d9, d8
|
||||
fmul d16, d7, d5
|
||||
fmul d17, d6, d16
|
||||
fdiv d21, d17, d20
|
||||
str d21, [sp, 184]
|
||||
bl _gfortran_transfer_real_write
|
||||
add x1, x28, 32
|
||||
mov x2, 6
|
||||
add x0, sp, 192
|
||||
bl _gfortran_transfer_character_write
|
||||
add x0, sp, 192
|
||||
bl _gfortran_st_write_done
|
||||
mov w2, 0
|
||||
mov x1, 0
|
||||
mov x0, 0
|
||||
bl _gfortran_stop_string
|
||||
.L5:
|
||||
tbnz w24, #31, .L25
|
||||
.L157:
|
||||
sub x4, x27, x26
|
||||
lsl x22, x21, 3
|
||||
sub w6, w24, #2
|
||||
b .L9
|
||||
.L6:
|
||||
tbz w24, #31, .L157
|
||||
mov w11, 0
|
||||
lsl x22, x21, 3
|
||||
sub w6, w24, #2
|
||||
b .L12
|
||||
.L159:
|
||||
.cfi_restore 72
|
||||
.cfi_restore 73
|
||||
adrp x26, .LC1
|
||||
stp d8, d9, [sp, 96]
|
||||
.cfi_offset 73, -616
|
||||
.cfi_offset 72, -624
|
||||
add x0, x26, :lo12:.LC1
|
||||
bl _gfortran_runtime_error
|
||||
.L25:
|
||||
mov w11, 0
|
||||
lsl x22, x21, 3
|
||||
sub w6, w24, #2
|
||||
b .L8
|
||||
.L160:
|
||||
adrp x20, .LC2
|
||||
add x0, x20, :lo12:.LC2
|
||||
bl _gfortran_os_error
|
||||
.cfi_endproc
|
||||
.LFE0:
|
||||
.size MAIN__, .-MAIN__
|
||||
.section .text.startup,"ax",@progbits
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.global main
|
||||
.type main, %function
|
||||
// main: C-level entry point of the Fortran program (AArch64, GNU as syntax).
// Sets up a 16-byte frame, hands the untouched incoming x0/x1 to
// _gfortran_set_args, registers the runtime options table and then calls
// the Fortran main program MAIN__.
// NOTE(review): there is no epilogue or ret after "bl MAIN__"; presumably
// MAIN__ never returns (its visible tail ends in _gfortran_stop_string) -- confirm.
main:
|
||||
.LFB1:
|
||||
.cfi_startproc
|
||||
stp x29, x30, [sp, -16]!
|
||||
.cfi_def_cfa_offset 16
|
||||
.cfi_offset 29, -16
|
||||
.cfi_offset 30, -8
|
||||
mov x29, sp
|
||||
bl _gfortran_set_args
|
||||
adrp x1, .LANCHOR0
|
||||
// w0 = 7: matches the seven .word entries of options.8.2753
|
||||
mov w0, 7
|
||||
add x2, x1, :lo12:.LANCHOR0
|
||||
// x1 = .LANCHOR0 + 40, the offset at which options.8.2753 is laid out in .rodata
|
||||
add x1, x2, 40
|
||||
bl _gfortran_set_options
|
||||
bl MAIN__
|
||||
.cfi_endproc
|
||||
.LFE1:
|
||||
.size main, .-main
|
||||
// Read-only data addressed relative to .LANCHOR0 (start of this section).
// Layout implied by the sizes below:
//   +0  .LC3            14 characters + 2 pad bytes
//   +16 .LC4            14 characters + 2 pad bytes
//   +32 .LC5             6 characters + 2 pad bytes
//   +40 options.8.2753  7 words (28 bytes)
// The code refers to these as x28, x28+16, x28+32 (MAIN__) and
// .LANCHOR0+40 (main).
.section .rodata
|
||||
.align 3
|
||||
.set .LANCHOR0,. + 0
|
||||
// Output labels written with _gfortran_transfer_character_write
.LC3:
|
||||
.ascii "# Iterations: "
|
||||
.zero 2
|
||||
.LC4:
|
||||
.ascii " Performance: "
|
||||
.zero 2
|
||||
.LC5:
|
||||
.ascii " MLUPs"
|
||||
.zero 2
|
||||
// Options table passed to _gfortran_set_options together with a count of 7.
// NOTE(review): the meaning of the individual words is defined by libgfortran
// and is not visible here.
.type options.8.2753, %object
|
||||
.size options.8.2753, 28
|
||||
options.8.2753:
|
||||
.word 68
|
||||
.word 8191
|
||||
.word 0
|
||||
.word 1
|
||||
.word 1
|
||||
.word 0
|
||||
.word 31
|
||||
// 8-byte floating-point constants, each emitted as two 32-bit words
// (low word first, assuming little-endian layout).
.section .rodata.cst8,"aM",@progbits,8
|
||||
.align 3
|
||||
// .LC6 = 0x3FC999999999999A = 0.2 (double); loaded into d8 in MAIN__ and
// compared against the measured time difference.
.LC6:
|
||||
.word 2576980378
|
||||
.word 1070176665
|
||||
// .LC7 = 0x3EB0C6F7A0B5ED8D = 1.0e-6 (double); loaded into d5 in MAIN__ and
// used as a scale factor in the value printed before " MLUPs".
.LC7:
|
||||
.word 2696277389
|
||||
.word 1051772663
|
||||
// String constants. .LC1 and .LC2 are the messages handed to
// _gfortran_runtime_error and _gfortran_os_error respectively.
// NOTE(review): .LC0 looks like the source file name; its use is outside this view.
.section .rodata.str1.8,"aMS",@progbits,1
|
||||
.align 3
|
||||
.LC0:
|
||||
.string "gs.f90"
|
||||
.zero 1
|
||||
.LC1:
|
||||
.string "Integer overflow when calculating the amount of memory to allocate"
|
||||
.zero 5
|
||||
.LC2:
|
||||
.string "Allocation would exceed memory limit"
|
||||
.ident "GCC: (ARM-build-8) 8.2.0"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
1073
examples/gs/gs.s.zen.gcc.s
Normal file
1073
examples/gs/gs.s.zen.gcc.s
Normal file
File diff suppressed because it is too large
Load Diff
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
@@ -0,0 +1,40 @@
|
||||
# Jacobi 2D inner loop, GCC, 256-bit AVX (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rax                byte offset shared by all streams, +128 per iteration
#   r8, r9, rsi, rdi   the four input streams that are summed
#   rdx                output stream
#   ymm8               multiplier, only read inside the loop
#   r15                end offset; the loop exits when rax == r15
# Each iteration produces four 32-byte results at offsets 0/32/64/96:
#   out = ((r8[] + r9[]) + (rsi[] + rdi[])) * ymm8
# The movl $111 / .byte 100,103,144 sequence is the start-of-kernel byte marker.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L21:
|
||||
vmovupd (%r8,%rax), %ymm11
|
||||
vmovupd (%rsi,%rax), %ymm13
|
||||
vaddpd (%r9,%rax), %ymm11, %ymm12
|
||||
vaddpd (%rdi,%rax), %ymm13, %ymm14
|
||||
vmovupd 32(%r8,%rax), %ymm1
|
||||
vmovupd 32(%rsi,%rax), %ymm2
|
||||
vaddpd %ymm14, %ymm12, %ymm15
|
||||
vaddpd 32(%r9,%rax), %ymm1, %ymm5
|
||||
vaddpd 32(%rdi,%rax), %ymm2, %ymm7
|
||||
vmulpd %ymm8, %ymm15, %ymm0
|
||||
vmovupd 64(%r8,%rax), %ymm10
|
||||
vaddpd %ymm7, %ymm5, %ymm6
|
||||
vmovupd 64(%rsi,%rax), %ymm12
|
||||
vmovupd 96(%rsi,%rax), %ymm5
|
||||
vmovupd %ymm0, (%rdx,%rax)
|
||||
vmovupd 96(%r8,%rax), %ymm0
|
||||
vaddpd 64(%r9,%rax), %ymm10, %ymm11
|
||||
vaddpd 64(%rdi,%rax), %ymm12, %ymm13
|
||||
vaddpd 96(%r9,%rax), %ymm0, %ymm1
|
||||
vaddpd 96(%rdi,%rax), %ymm5, %ymm2
|
||||
vaddpd %ymm13, %ymm11, %ymm14
|
||||
vmulpd %ymm8, %ymm6, %ymm9
|
||||
vaddpd %ymm2, %ymm1, %ymm7
|
||||
vmulpd %ymm8, %ymm14, %ymm15
|
||||
vmulpd %ymm8, %ymm7, %ymm6
|
||||
vmovupd %ymm9, 32(%rdx,%rax)
|
||||
vmovupd %ymm15, 64(%rdx,%rax)
|
||||
vmovupd %ymm6, 96(%rdx,%rax)
|
||||
# subtracting -128 advances rax by 128 bytes (four 32-byte vectors)
|
||||
subq $-128, %rax
|
||||
cmpq %rax, %r15
|
||||
jne .L21
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
@@ -0,0 +1,46 @@
|
||||
# Jacobi 2D inner loop, GCC, 128-bit SSE (x86-64, AT&T syntax).
# The loop is rotated: .L28 handles byte offsets 16/32/48 of the current
# group and advances rax by 64; .L21 handles offset 0 of the next group and
# holds the exit test (r10 = rax + 16 compared with r14).
# Registers as used by the loop:
#   rax            byte offset shared by all streams
#   r8             input stream read at two positions 16 bytes apart per result;
#                  its loads are kept in xmm15/xmm11/xmm14 and reused by the next step
#   rdi, rsi       the two other input streams
#   rcx            output stream
#   xmm2           multiplier, only read inside the loop
# NOTE(review): .L28 consumes xmm15 before loading it, so the loop is
# presumably first entered at .L21 from code outside this excerpt -- confirm.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L28:
|
||||
movupd 16(%r8,%rax), %xmm11
|
||||
movupd 16(%rdi,%rax), %xmm12
|
||||
movupd 16(%rsi,%rax), %xmm13
|
||||
addpd %xmm11, %xmm15
|
||||
addpd %xmm13, %xmm12
|
||||
movupd 32(%rdi,%rax), %xmm14
|
||||
movupd 32(%rsi,%rax), %xmm0
|
||||
addpd %xmm15, %xmm12
|
||||
movupd 32(%r8,%rax), %xmm15
|
||||
addpd %xmm0, %xmm14
|
||||
addpd %xmm15, %xmm11
|
||||
movupd 48(%rdi,%rax), %xmm1
|
||||
movupd 48(%rsi,%rax), %xmm7
|
||||
addpd %xmm11, %xmm14
|
||||
addpd %xmm7, %xmm1
|
||||
mulpd %xmm2, %xmm12
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm12, 16(%rcx,%rax)
|
||||
movups %xmm14, 32(%rcx,%rax)
|
||||
movupd 48(%r8,%rax), %xmm14
|
||||
addpd %xmm14, %xmm15
|
||||
addpd %xmm15, %xmm1
|
||||
mulpd %xmm2, %xmm1
|
||||
movups %xmm1, 48(%rcx,%rax)
|
||||
addq $64, %rax
|
||||
# offset-0 vector of the next group; xmm14 still holds the r8 load from 48(old rax)
.L21:
|
||||
movupd (%r8,%rax), %xmm15
|
||||
movupd (%rdi,%rax), %xmm0
|
||||
movupd (%rsi,%rax), %xmm1
|
||||
addpd %xmm15, %xmm14
|
||||
addpd %xmm1, %xmm0
|
||||
leaq 16(%rax), %r10
|
||||
addpd %xmm0, %xmm14
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm14, (%rcx,%rax)
|
||||
cmpq %r10, %r14
|
||||
jne .L28
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
@@ -0,0 +1,37 @@
|
||||
# Jacobi 2D inner loop, ICC, 256-bit AVX (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rcx            element index (scaled by 8 in every address), +16 per iteration
#   r8             input stream read at displacements D and D+16 (D = 10016, 10048, ...)
#   r12, r10       the two other input streams
#   r9             output stream
#   ymm0           multiplier, only read inside the loop
#   r14            loop bound; unsigned compare (jb)
# Each iteration produces four 32-byte results:
#   out[D] = (r8[D] + r12[..] + r10[..] + r8[D+16]) * ymm0
# NOTE(review): the displacements 10016 and 20032 (= 2 * 10016) look like a
# row pitch in bytes baked in by the compiler -- confirm against the source.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.47 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5
|
||||
vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5
|
||||
vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5
|
||||
vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmulpd %ymm9, %ymm0, %ymm10 #94.5
|
||||
vmulpd %ymm14, %ymm0, %ymm15 #94.5
|
||||
vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm15, 10080(%r9,%rcx,8) #94.5
|
||||
vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5
|
||||
addq $16, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
@@ -0,0 +1,69 @@
|
||||
# Jacobi 2D inner loop, ICC, 512-bit AVX-512 (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rcx            element index (scaled by 8 in every address), +32 per iteration
#   r8             = r12 + r11, input stream read at displacements D and D+16
#   r12, r10       the two other input streams
#   r9             output stream
#   zmm4           multiplier, only read inside the loop
#   r14            loop bound; unsigned compare (jb)
# Each iteration produces four 64-byte results (D = 10016, 10080, 10144, 10208):
#   out[D] = (r8[D] + r12[..] + r10[..] + r8[D+16]) * zmm4
# The "lea (%r12,%r11), %r8" is emitted four times with identical operands;
# neither r12 nor r11 is written inside the loop.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.63 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.48: # Preds ..B1.47
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.51: # Preds ..B1.48
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.52: # Preds ..B1.51
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.55: # Preds ..B1.52
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.56: # Preds ..B1.55
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.59: # Preds ..B1.56
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.60: # Preds ..B1.59
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.63: # Preds ..B1.60
|
||||
# Execution count [1.15e+04]
|
||||
vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5
|
||||
addq $32, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
@@ -0,0 +1,40 @@
|
||||
# Jacobi 2D inner loop, ICC, 128-bit SSE (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rcx            element index (scaled by 8 in every address), +8 per iteration
#   r8             input stream; each 16-byte load at 10032/10048/10064 is used
#                  twice: added into the previous result and seeding the next one
#   r12, r10       the two other input streams
#   r9             output stream
#   xmm7           multiplier, only read inside the loop
#   r14            loop bound; unsigned compare (jb)
# Four 16-byte results (xmm0, xmm2, xmm4, xmm6) are computed and then stored
# together at the end of the iteration.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.42: # Preds ..B1.42 ..B1.41
|
||||
# Execution count [1.15e+04]
|
||||
movups 10016(%r8,%rcx,8), %xmm0 #94.5
|
||||
addpd 16(%r12,%rcx,8), %xmm0 #94.5
|
||||
addpd 20032(%r10,%rcx,8), %xmm0 #94.5
|
||||
movups 10032(%r8,%rcx,8), %xmm2 #94.5
|
||||
movups 32(%r12,%rcx,8), %xmm1 #94.5
|
||||
addpd %xmm2, %xmm0 #94.5
|
||||
addpd %xmm1, %xmm2 #94.5
|
||||
mulpd %xmm7, %xmm0 #94.5
|
||||
addpd 20048(%r10,%rcx,8), %xmm2 #94.5
|
||||
movups 10048(%r8,%rcx,8), %xmm4 #94.5
|
||||
movups 48(%r12,%rcx,8), %xmm3 #94.5
|
||||
addpd %xmm4, %xmm2 #94.5
|
||||
addpd %xmm3, %xmm4 #94.5
|
||||
mulpd %xmm7, %xmm2 #94.5
|
||||
addpd 20064(%r10,%rcx,8), %xmm4 #94.5
|
||||
movups 10064(%r8,%rcx,8), %xmm6 #94.5
|
||||
movups 64(%r12,%rcx,8), %xmm5 #94.5
|
||||
addpd %xmm6, %xmm4 #94.5
|
||||
addpd %xmm5, %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm4 #94.5
|
||||
addpd 20080(%r10,%rcx,8), %xmm6 #94.5
|
||||
addpd 10080(%r8,%rcx,8), %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm6 #94.5
|
||||
movups %xmm0, 10016(%r9,%rcx,8) #94.5
|
||||
movups %xmm2, 10032(%r9,%rcx,8) #94.5
|
||||
movups %xmm4, 10048(%r9,%rcx,8) #94.5
|
||||
movups %xmm6, 10064(%r9,%rcx,8) #94.5
|
||||
addq $8, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.42 # Prob 82% #94.5
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
@@ -0,0 +1,131 @@
|
||||
// Jacobi 2D inner loop, Arm clang, 128-bit NEON (AArch64).
// Registers as used by the loop:
//   x16            byte offset into all rows, +256 per iteration
//   x0  = x5  + x16   input row, read with ldp from offset 16 upwards
//   x1  = x28 + x16   input row, read with ldp from offset 16 upwards
//   x18 = x21 + x16   input row, read with ldur from offset 8 upwards; each
//                     result adds two of its vectors 16 bytes apart
//   x18 = x25 + x16   (re-assigned late in the iteration) output row
//   v28            multiplier, only read inside the loop
//   x17            counter, +4 per iteration; the loop exits when adds yields zero
//   x15            incremented by 32 per iteration, not otherwise used here
// Sixteen 16-byte results are stored per iteration (offsets 8..248).
// The vectors at [x18,#72], [x18,#136] and [x18,#200] are each loaded twice
// into different registers.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_16 Depth=1
|
||||
// Parent Loop BB1_19 Depth=2
|
||||
// Parent Loop BB1_24 Depth=3
|
||||
// => This Inner Loop Header: Depth=4
|
||||
add x0, x5, x16
|
||||
add x18, x21, x16
|
||||
ldp q4, q5, [x0, #16]
|
||||
ldp q6, q7, [x0, #48]
|
||||
ldur q0, [x18, #8]
|
||||
ldur q1, [x18, #24]
|
||||
ldur q2, [x18, #40]
|
||||
ldur q3, [x18, #56]
|
||||
add x1, x28, x16
|
||||
add x15, x15, #32 // =32
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v4.2d, v5.2d, v1.2d
|
||||
fadd v5.2d, v6.2d, v2.2d
|
||||
fadd v6.2d, v7.2d, v3.2d
|
||||
ldp q7, q16, [x1, #16]
|
||||
fadd v1.2d, v7.2d, v1.2d
|
||||
ldp q17, q18, [x1, #48]
|
||||
ldur q19, [x18, #72]
|
||||
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v1.2d, v16.2d, v2.2d
|
||||
fadd v2.2d, v17.2d, v3.2d
|
||||
fadd v3.2d, v18.2d, v19.2d
|
||||
ldp q16, q17, [x0, #80]
|
||||
ldp q18, q19, [x0, #112]
|
||||
fadd v1.2d, v4.2d, v1.2d
|
||||
fadd v2.2d, v5.2d, v2.2d
|
||||
fadd v3.2d, v6.2d, v3.2d
|
||||
ldur q4, [x18, #72]
|
||||
ldur q5, [x18, #88]
|
||||
ldur q6, [x18, #104]
|
||||
ldur q7, [x18, #120]
|
||||
fadd v4.2d, v16.2d, v4.2d
|
||||
fadd v16.2d, v17.2d, v5.2d
|
||||
fadd v17.2d, v18.2d, v6.2d
|
||||
fadd v18.2d, v19.2d, v7.2d
|
||||
ldp q19, q20, [x1, #80]
|
||||
fadd v5.2d, v19.2d, v5.2d
|
||||
ldp q21, q22, [x1, #112]
|
||||
ldur q23, [x18, #136]
|
||||
fadd v4.2d, v4.2d, v5.2d
|
||||
fadd v5.2d, v20.2d, v6.2d
|
||||
fadd v6.2d, v21.2d, v7.2d
|
||||
fadd v7.2d, v22.2d, v23.2d
|
||||
ldp q20, q21, [x0, #144]
|
||||
ldp q22, q23, [x0, #176]
|
||||
fadd v5.2d, v16.2d, v5.2d
|
||||
fadd v6.2d, v17.2d, v6.2d
|
||||
fadd v7.2d, v18.2d, v7.2d
|
||||
ldur q16, [x18, #136]
|
||||
ldur q17, [x18, #152]
|
||||
ldur q18, [x18, #168]
|
||||
ldur q19, [x18, #184]
|
||||
fadd v16.2d, v20.2d, v16.2d
|
||||
fadd v20.2d, v21.2d, v17.2d
|
||||
fadd v21.2d, v22.2d, v18.2d
|
||||
fadd v22.2d, v23.2d, v19.2d
|
||||
ldp q23, q24, [x1, #144]
|
||||
fadd v17.2d, v23.2d, v17.2d
|
||||
ldp q25, q26, [x1, #176]
|
||||
fadd v16.2d, v16.2d, v17.2d
|
||||
fadd v17.2d, v24.2d, v18.2d
|
||||
fadd v18.2d, v25.2d, v19.2d
|
||||
ldp q24, q25, [x0, #208]
|
||||
ldur q23, [x18, #200]
|
||||
fadd v17.2d, v20.2d, v17.2d
|
||||
fadd v18.2d, v21.2d, v18.2d
|
||||
ldur q20, [x18, #200]
|
||||
ldur q21, [x18, #216]
|
||||
fadd v19.2d, v26.2d, v23.2d
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v24.2d, v25.2d, v21.2d
|
||||
ldp q25, q26, [x1, #208]
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
fadd v20.2d, v20.2d, v21.2d
|
||||
ldp q21, q25, [x0, #240]
|
||||
fadd v19.2d, v22.2d, v19.2d
|
||||
ldur q22, [x18, #232]
|
||||
fadd v21.2d, v21.2d, v22.2d
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v22.2d, v24.2d, v22.2d
|
||||
ldp q24, q26, [x1, #240]
|
||||
ldur q23, [x18, #248]
|
||||
fadd v25.2d, v25.2d, v23.2d
|
||||
fadd v23.2d, v24.2d, v23.2d
|
||||
// x18 += 264 so the following ldr reads the last vector of the x21 row
|
||||
add x18, x18, #264 // =264
|
||||
fmul v0.2d, v0.2d, v28.2d
|
||||
fmul v1.2d, v1.2d, v28.2d
|
||||
fmul v2.2d, v2.2d, v28.2d
|
||||
fmul v5.2d, v5.2d, v28.2d
|
||||
fadd v21.2d, v21.2d, v23.2d
|
||||
ldr q23, [x18]
|
||||
// from here on x18 addresses the output row
|
||||
add x18, x25, x16
|
||||
stur q0, [x18, #8]
|
||||
stur q1, [x18, #24]
|
||||
fmul v3.2d, v3.2d, v28.2d
|
||||
stur q2, [x18, #40]
|
||||
fadd v23.2d, v26.2d, v23.2d
|
||||
stur q5, [x18, #88]
|
||||
fmul v4.2d, v4.2d, v28.2d
|
||||
stur q3, [x18, #56]
|
||||
fmul v6.2d, v6.2d, v28.2d
|
||||
stur q4, [x18, #72]
|
||||
fmul v0.2d, v7.2d, v28.2d
|
||||
stur q6, [x18, #104]
|
||||
fmul v1.2d, v16.2d, v28.2d
|
||||
stur q0, [x18, #120]
|
||||
fmul v2.2d, v17.2d, v28.2d
|
||||
stur q1, [x18, #136]
|
||||
fmul v4.2d, v19.2d, v28.2d
|
||||
stur q2, [x18, #152]
|
||||
fadd v5.2d, v25.2d, v23.2d
|
||||
stur q4, [x18, #184]
|
||||
fmul v3.2d, v18.2d, v28.2d
|
||||
stur q3, [x18, #168]
|
||||
fmul v6.2d, v20.2d, v28.2d
|
||||
stur q6, [x18, #200]
|
||||
fmul v0.2d, v22.2d, v28.2d
|
||||
stur q0, [x18, #216]
|
||||
fmul v1.2d, v21.2d, v28.2d
|
||||
stur q1, [x18, #232]
|
||||
add x16, x16, #256 // =256
|
||||
fmul v2.2d, v5.2d, v28.2d
|
||||
stur q2, [x18, #248]
|
||||
adds x17, x17, #4 // =4
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
@@ -0,0 +1,43 @@
|
||||
// Jacobi 2D inner loop, GCC, 128-bit NEON (AArch64).
// Registers as used by the loop:
//   x0             byte offset, +64 per iteration; x5/x7/x6 = x0 + 16/32/48
//   x14            input row whose neighbouring vectors are added pairwise;
//                  v30 carries its last vector into the next iteration
//   x25, x22       the two other input rows
//   x28            output row
//   v21            multiplier, only read inside the loop
//   x9             loop bound, reloaded from [sp, 144] on every iteration
// Four 16-byte results are stored per iteration.
// OSACA-BEGIN
|
||||
.L93:
|
||||
add x5, x0, 16
|
||||
ldr q2, [x14, x0]
|
||||
ldr q5, [x25, x0]
|
||||
add x7, x0, 32
|
||||
ldr q13, [x22, x0]
|
||||
ldr q4, [x25, x5]
|
||||
add x6, x0, 48
|
||||
ldr x9, [sp, 144]
|
||||
ldr q19, [x22, x5]
|
||||
ldr q7, [x14, x5]
|
||||
ldr q6, [x14, x7]
|
||||
ldr q3, [x25, x7]
|
||||
ldr q18, [x22, x7]
|
||||
fadd v17.2d, v2.2d, v30.2d
|
||||
ldr q16, [x14, x6]
|
||||
ldr q20, [x25, x6]
|
||||
fadd v23.2d, v5.2d, v13.2d
|
||||
ldr q22, [x22, x6]
|
||||
fadd v24.2d, v4.2d, v19.2d
|
||||
fadd v25.2d, v7.2d, v2.2d
|
||||
fadd v27.2d, v6.2d, v7.2d
|
||||
fadd v26.2d, v3.2d, v18.2d
|
||||
fadd v28.2d, v16.2d, v6.2d
|
||||
// keep the last x14 vector for the first result of the next iteration
|
||||
mov v30.16b, v16.16b
|
||||
fadd v29.2d, v20.2d, v22.2d
|
||||
fadd v31.2d, v23.2d, v17.2d
|
||||
fadd v0.2d, v24.2d, v25.2d
|
||||
fadd v2.2d, v26.2d, v27.2d
|
||||
fadd v1.2d, v29.2d, v28.2d
|
||||
fmul v5.2d, v31.2d, v21.2d
|
||||
fmul v13.2d, v0.2d, v21.2d
|
||||
fmul v4.2d, v2.2d, v21.2d
|
||||
fmul v19.2d, v1.2d, v21.2d
|
||||
str q5, [x28, x0]
|
||||
// x0 advances here; the remaining stores use the offsets saved in x5/x7/x6
|
||||
add x0, x0, 64
|
||||
str q13, [x28, x5]
|
||||
str q4, [x28, x7]
|
||||
str q19, [x28, x6]
|
||||
cmp x9, x0
|
||||
bne .L93
|
||||
// OSACA-END
|
||||
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# Jacobi 2D inner loop, GCC, 128-bit VEX-encoded (xmm) instructions
# (x86-64, AT&T syntax).
# The loop is rotated: .L28 handles byte offsets 16/32/48 of the current
# group (offset 16 is addressed through rcx = rax + 16) and advances rax by
# 64; .L21 handles offset 0 of the next group and holds the exit test
# (rcx = rax + 16 compared with r14).
# Registers as used by the loop:
#   rax, rcx       byte offsets shared by all streams
#   r10            input stream whose neighbouring 16-byte vectors are added
#                  pairwise; xmm8 and xmm5 carry its loads between .L21 and .L28
#   rdi, r8        the two other input streams
#   rsi            output stream
#   xmm2           multiplier, only read inside the loop
# NOTE(review): .L28 consumes xmm8 and rcx before setting them, so the loop is
# presumably first entered at .L21 from code outside this excerpt -- confirm.
# OSACA-BEGIN
|
||||
.L28:
|
||||
vmovups (%r10,%rcx), %xmm5
|
||||
vmovups 32(%r10,%rax), %xmm13
|
||||
vmovups (%rdi,%rcx), %xmm1
|
||||
vmovups 32(%rdi,%rax), %xmm14
|
||||
vmovups 48(%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rcx), %xmm1, %xmm10
|
||||
vaddpd 32(%r8,%rax), %xmm14, %xmm15
|
||||
vaddpd 48(%r8,%rax), %xmm9, %xmm1
|
||||
vaddpd %xmm5, %xmm8, %xmm8
|
||||
vaddpd %xmm13, %xmm5, %xmm6
|
||||
vmovups 48(%r10,%rax), %xmm5
|
||||
vaddpd %xmm8, %xmm10, %xmm11
|
||||
vaddpd %xmm6, %xmm15, %xmm0
|
||||
vmulpd %xmm2, %xmm11, %xmm12
|
||||
vaddpd %xmm5, %xmm13, %xmm4
|
||||
vmulpd %xmm2, %xmm0, %xmm7
|
||||
vaddpd %xmm4, %xmm1, %xmm10
|
||||
vmovups %xmm12, (%rsi,%rcx)
|
||||
vmovups %xmm7, 32(%rsi,%rax)
|
||||
vmulpd %xmm2, %xmm10, %xmm8
|
||||
vmovups %xmm8, 48(%rsi,%rax)
|
||||
addq $64, %rax
|
||||
# offset-0 vector of the next group; xmm5 still holds the r10 load from 48(old rax)
.L21:
|
||||
vmovups (%r10,%rax), %xmm8
|
||||
leaq 16(%rax), %rcx
|
||||
vmovups (%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rax), %xmm9, %xmm10
|
||||
vaddpd %xmm8, %xmm5, %xmm11
|
||||
vaddpd %xmm11, %xmm10, %xmm12
|
||||
vmulpd %xmm2, %xmm12, %xmm13
|
||||
vmovups %xmm13, (%rsi,%rax)
|
||||
cmpq %rcx, %r14
|
||||
jne .L28
|
||||
# OSACA-END
|
||||
44
examples/striad/striad.s.csx.gcc.s
Normal file
44
examples/striad/striad.s.csx.gcc.s
Normal file
@@ -0,0 +1,44 @@
|
||||
# striad inner loop, GCC, 256-bit AVX2/FMA (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rax            byte offset shared by all streams, +256 per iteration
#   r15, r14       the two streams that are multiplied
#   r13            the stream that is added
#   r12            output stream
#   r8             end offset; the loop exits when rax == r8
# Eight 32-byte results are produced per iteration. In AT&T operand order,
# "vfmadd132pd mem, y2, y1" computes y1 = y1 * mem + y2, i.e.
#   out[] = r15[] * r14[] + r13[]
# The movl $111 / .byte 100,103,144 sequence is the start-of-kernel byte marker.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r15,%rax), %ymm5
|
||||
vmovupd 0(%r13,%rax), %ymm6
|
||||
vmovupd 32(%r15,%rax), %ymm8
|
||||
vmovupd 32(%r13,%rax), %ymm7
|
||||
vmovupd 64(%r15,%rax), %ymm9
|
||||
vmovupd 64(%r13,%rax), %ymm10
|
||||
vmovupd 96(%r15,%rax), %ymm11
|
||||
vmovupd 96(%r13,%rax), %ymm12
|
||||
vmovupd 128(%r15,%rax), %ymm13
|
||||
vmovupd 128(%r13,%rax), %ymm14
|
||||
vmovupd 160(%r15,%rax), %ymm15
|
||||
vmovupd 160(%r13,%rax), %ymm2
|
||||
vmovupd 192(%r15,%rax), %ymm0
|
||||
vmovupd 192(%r13,%rax), %ymm1
|
||||
vmovupd 224(%r15,%rax), %ymm3
|
||||
vmovupd 224(%r13,%rax), %ymm4
|
||||
vfmadd132pd (%r14,%rax), %ymm6, %ymm5
|
||||
vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8
|
||||
vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9
|
||||
vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11
|
||||
vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13
|
||||
vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15
|
||||
vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0
|
||||
vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3
|
||||
vmovupd %ymm5, (%r12,%rax)
|
||||
vmovupd %ymm8, 32(%r12,%rax)
|
||||
vmovupd %ymm9, 64(%r12,%rax)
|
||||
vmovupd %ymm11, 96(%r12,%rax)
|
||||
vmovupd %ymm13, 128(%r12,%rax)
|
||||
vmovupd %ymm15, 160(%r12,%rax)
|
||||
vmovupd %ymm0, 192(%r12,%rax)
|
||||
vmovupd %ymm3, 224(%r12,%rax)
|
||||
addq $256, %rax
|
||||
cmpq %rax, %r8
|
||||
jne .L19
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
21
examples/striad/striad.s.csx.icc.s
Normal file
21
examples/striad/striad.s.csx.icc.s
Normal file
@@ -0,0 +1,21 @@
|
||||
# striad inner loop, ICC, 512-bit AVX-512 (x86-64, AT&T syntax).
# Registers as used by the loop:
#   rax            element index (scaled by 8 in every address), +16 per iteration
#   rcx, r14       the two streams that are multiplied
#   r8             the stream that is added
#   r13            output stream
#   r12            loop bound; unsigned compare (jb)
# Two 64-byte results are produced per iteration. In AT&T operand order,
# "vfmadd213pd mem, z2, z1" computes z1 = z2 * z1 + mem, i.e.
#   out[] = r14[] * rcx[] + r8[]
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.41: # Preds ..B1.41 ..B1.40
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%rcx,%rax,8), %zmm2 #80.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm4 #80.5
|
||||
vmovups (%r14,%rax,8), %zmm1 #80.5
|
||||
vmovups 64(%r14,%rax,8), %zmm3 #80.5
|
||||
vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5
|
||||
vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5
|
||||
vmovupd %zmm2, (%r13,%rax,8) #80.5
|
||||
vmovupd %zmm4, 64(%r13,%rax,8) #80.5
|
||||
addq $16, %rax #80.5
|
||||
cmpq %r12, %rax #80.5
|
||||
jb ..B1.41 # Prob 82% #80.5
|
||||
# end-of-kernel byte marker
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
112
examples/striad/striad.s.tx2.clang.s
Normal file
112
examples/striad/striad.s.tx2.clang.s
Normal file
@@ -0,0 +1,112 @@
|
||||
// striad inner loop, Arm clang, 128-bit NEON (AArch64).
// Registers as used by the loop:
//   x9             stream loaded into the fmla accumulators (the addend)
//   x10, x11       the two streams that are multiplied
//   x12            output stream
//   x9..x12        all advance by 512 bytes per iteration and are addressed
//                  with offsets -256..+240
//   x13            counter, +8 per iteration; the loop exits when adds yields zero
//   x8             incremented by 64 per iteration, not otherwise used here
// fmla vd, vn, vm computes vd += vn * vm, so out[] = x9[] + x11[] * x10[].
// The first two results (v0, v1) are spilled to [sp, #96] and only reloaded
// and stored to [x12, #-256] / [x12, #-240] near the end of the iteration.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
ldp q16, q17, [x11, #-256]
|
||||
ldp q18, q19, [x11, #-224]
|
||||
fmla v0.2d, v16.2d, v4.2d
|
||||
fmla v1.2d, v17.2d, v5.2d
|
||||
stp q1, q0, [sp, #96] // 32-byte Folded Spill
|
||||
fmla v2.2d, v18.2d, v6.2d
|
||||
fmla v3.2d, v19.2d, v7.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
ldp q20, q21, [x11, #-192]
|
||||
ldp q22, q23, [x11, #-160]
|
||||
fmla v4.2d, v20.2d, v16.2d
|
||||
stp q3, q4, [x12, #-208]
|
||||
fmla v5.2d, v21.2d, v17.2d
|
||||
fmla v6.2d, v22.2d, v18.2d
|
||||
stp q5, q6, [x12, #-176]
|
||||
fmla v7.2d, v23.2d, v19.2d
|
||||
ldp q16, q18, [x9, #-128]
|
||||
ldp q17, q19, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
ldp q24, q25, [x11, #-128]
|
||||
ldp q26, q27, [x11, #-96]
|
||||
fmla v16.2d, v24.2d, v20.2d
|
||||
stp q7, q16, [x12, #-144]
|
||||
fmla v18.2d, v25.2d, v21.2d
|
||||
fmla v17.2d, v26.2d, v22.2d
|
||||
stp q18, q17, [x12, #-112]
|
||||
fmla v19.2d, v27.2d, v23.2d
|
||||
ldp q22, q23, [x9, #-64]
|
||||
ldp q20, q21, [x9, #-32]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
ldp q28, q29, [x11, #-64]
|
||||
ldp q30, q31, [x11, #-32]
|
||||
fmla v22.2d, v28.2d, v24.2d
|
||||
stp q19, q22, [x12, #-80]
|
||||
fmla v23.2d, v29.2d, v25.2d
|
||||
fmla v20.2d, v30.2d, v26.2d
|
||||
stp q23, q20, [x12, #-48]
|
||||
fmla v21.2d, v31.2d, v27.2d
|
||||
stur q21, [x12, #-16]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q28, q29, [x10]
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldp q8, q10, [x11]
|
||||
ldp q11, q12, [x11, #32]
|
||||
fmla v24.2d, v8.2d, v28.2d
|
||||
fmla v25.2d, v10.2d, v29.2d
|
||||
stp q24, q25, [x12]
|
||||
fmla v26.2d, v11.2d, v30.2d
|
||||
fmla v27.2d, v12.2d, v31.2d
|
||||
stp q26, q27, [x12, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
ldp q13, q14, [x11, #64]
|
||||
ldp q15, q9, [x11, #96]
|
||||
fmla v28.2d, v13.2d, v8.2d
|
||||
fmla v29.2d, v14.2d, v10.2d
|
||||
stp q28, q29, [x12, #64]
|
||||
fmla v30.2d, v15.2d, v11.2d
|
||||
fmla v31.2d, v9.2d, v12.2d
|
||||
stp q30, q31, [x12, #96]
|
||||
ldp q8, q9, [x9, #128]
|
||||
ldp q12, q13, [x10, #128]
|
||||
ldp q14, q15, [x11, #128]
|
||||
ldp q10, q11, [x9, #160]
|
||||
fmla v8.2d, v14.2d, v12.2d
|
||||
ldp q12, q14, [x10, #160]
|
||||
fmla v9.2d, v15.2d, v13.2d
|
||||
stp q8, q9, [x12, #128]
|
||||
ldp q13, q15, [x11, #160]
|
||||
fmla v10.2d, v13.2d, v12.2d
|
||||
fmla v11.2d, v15.2d, v14.2d
|
||||
stp q10, q11, [x12, #160]
|
||||
ldp q12, q13, [x9, #192]
|
||||
ldp q14, q15, [x10, #192]
|
||||
ldp q0, q1, [x11, #192]
|
||||
fmla v12.2d, v0.2d, v14.2d
|
||||
// [sp, #112] holds the spilled v0 result, [sp, #96] the spilled v1 result
|
||||
ldr q0, [sp, #112] // 16-byte Folded Reload
|
||||
stur q0, [x12, #-256]
|
||||
ldr q0, [sp, #96] // 16-byte Folded Reload
|
||||
stp q0, q2, [x12, #-240]
|
||||
ldp q0, q2, [x9, #224]
|
||||
ldp q3, q4, [x10, #224]
|
||||
ldp q5, q6, [x11, #224]
|
||||
fmla v13.2d, v1.2d, v15.2d
|
||||
stp q12, q13, [x12, #192]
|
||||
fmla v0.2d, v5.2d, v3.2d
|
||||
fmla v2.2d, v6.2d, v4.2d
|
||||
stp q0, q2, [x12, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x12, x12, #512 // =512
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x13, x13, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
53
examples/striad/striad.s.tx2.gcc.s
Normal file
53
examples/striad/striad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,53 @@
|
||||
// striad inner loop, GCC, 128-bit NEON (AArch64).
// Registers as used by the loop:
//   x11            byte offset, +128 per iteration; x12/x7/x6/x5/x8/x0/x13
//                  hold x11 + 16/32/48/64/80/96/112
//   x20            stream loaded into the fmla accumulators (the addend)
//   x22, x21       the two streams that are multiplied
//   x19            output stream
//   x25            end offset; the loop exits when x11 == x25
// fmla vd, vn, vm computes vd += vn * vm, so out[] = x20[] + x22[] * x21[].
// Eight 16-byte results are stored per iteration.
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x12, x11, 16
|
||||
ldr q29, [x22, x11]
|
||||
ldr q30, [x20, x11]
|
||||
add x7, x11, 32
|
||||
ldr q31, [x21, x11]
|
||||
ldr q7, [x22, x12]
|
||||
add x6, x11, 48
|
||||
add x5, x11, 64
|
||||
ldr q6, [x20, x12]
|
||||
ldr q2, [x21, x12]
|
||||
add x8, x11, 80
|
||||
add x0, x11, 96
|
||||
ldr q9, [x22, x7]
|
||||
ldr q5, [x20, x7]
|
||||
add x13, x11, 112
|
||||
ldr q1, [x21, x7]
|
||||
ldr q16, [x22, x6]
|
||||
ldr q4, [x20, x6]
|
||||
ldr q0, [x21, x6]
|
||||
fmla v30.2d, v29.2d, v31.2d
|
||||
ldr q23, [x22, x5]
|
||||
ldr q3, [x20, x5]
|
||||
fmla v6.2d, v7.2d, v2.2d
|
||||
ldr q22, [x21, x5]
|
||||
ldr q21, [x22, x8]
|
||||
ldr q24, [x20, x8]
|
||||
ldr q20, [x21, x8]
|
||||
fmla v5.2d, v9.2d, v1.2d
|
||||
ldr q19, [x22, x0]
|
||||
ldr q25, [x20, x0]
|
||||
fmla v4.2d, v16.2d, v0.2d
|
||||
ldr q18, [x21, x0]
|
||||
ldr q17, [x22, x13]
|
||||
ldr q26, [x20, x13]
|
||||
ldr q27, [x21, x13]
|
||||
fmla v3.2d, v23.2d, v22.2d
|
||||
fmla v24.2d, v21.2d, v20.2d
|
||||
str q30, [x19, x11]
|
||||
// x11 advances here; the remaining stores use the precomputed offsets
|
||||
add x11, x11, 128
|
||||
str q6, [x19, x12]
|
||||
fmla v25.2d, v19.2d, v18.2d
|
||||
str q5, [x19, x7]
|
||||
fmla v26.2d, v17.2d, v27.2d
|
||||
str q4, [x19, x6]
|
||||
str q3, [x19, x5]
|
||||
str q24, [x19, x8]
|
||||
str q25, [x19, x0]
|
||||
str q26, [x19, x13]
|
||||
cmp x25, x11
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/striad/striad.s.zen.gcc.s
Normal file
38
examples/striad/striad.s.zen.gcc.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# x86-64 (AT&T syntax) loop body (GCC output) delimited by comment-style
# OSACA markers.
# Each iteration handles 128 bytes per stream as eight 16-byte xmm slices:
#   - vmovups loads one slice from (%r14,%rax) and one from (%r12,%rax),
#   - vfmadd132pd multiplies the %r14 slice by the slice at (%r13,%rax) and
#     adds the %r12 slice (dst = dst * mem + second operand),
#   - vmovups stores the result to (%rbp,%rax).
# %rax is the running byte offset; the loop repeats until %rax equals %rcx.
# NOTE(review): the file name suggests the Schoenauer triad
# a[i] = b[i] + c[i] * d[i]; the array-to-register mapping is not visible
# in this excerpt - confirm against the C source.
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups (%r14,%rax), %xmm0
|
||||
vmovups (%r12,%rax), %xmm5
|
||||
vmovups 16(%r14,%rax), %xmm3
|
||||
vmovups 16(%r12,%rax), %xmm6
|
||||
vmovups 32(%r14,%rax), %xmm4
|
||||
vmovups 32(%r12,%rax), %xmm7
|
||||
vmovups 48(%r14,%rax), %xmm8
|
||||
vmovups 48(%r12,%rax), %xmm9
|
||||
vmovups 64(%r14,%rax), %xmm10
|
||||
vmovups 64(%r12,%rax), %xmm11
|
||||
vmovups 80(%r14,%rax), %xmm12
|
||||
vmovups 80(%r12,%rax), %xmm13
|
||||
vmovups 96(%r14,%rax), %xmm14
|
||||
vmovups 96(%r12,%rax), %xmm15
|
||||
vmovups 112(%r14,%rax), %xmm2
|
||||
vmovups 112(%r12,%rax), %xmm1
|
||||
# xmm0 = xmm0 * mem[%r13+%rax] + xmm5; the following seven FMAs repeat
# this for the remaining slices.
vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0
|
||||
vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3
|
||||
vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4
|
||||
vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8
|
||||
vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10
|
||||
vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12
|
||||
vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14
|
||||
vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2
|
||||
vmovups %xmm0, 0(%rbp,%rax)
|
||||
vmovups %xmm3, 16(%rbp,%rax)
|
||||
vmovups %xmm4, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm2, 112(%rbp,%rax)
|
||||
# Subtracting -128 advances the byte offset by 128 (presumably chosen by
# the compiler because -128 fits a sign-extended 8-bit immediate).
subq $-128, %rax
|
||||
cmpq %rcx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
@@ -0,0 +1,46 @@
|
||||
# x86-64 (AT&T syntax) sum-reduction loop body (GCC output).
# The kernel is wrapped in IACA byte markers (movl $111/$222 into %ebx
# followed by .byte 100, 103, 144) and additionally in LLVM-MCA comment
# markers.
# Each iteration loads four 32-byte ymm vectors (128 bytes) from %rcx,
# splits them into individual doubles (vunpckhpd for the upper element of a
# 128-bit half, vextractf64x2 $0x1 for the upper 128-bit half) and adds them
# one by one with 16 vaddsd instructions that form a single serial
# dependency chain. %xmm0 carries the running sum from one iteration to the
# next (read by the first vaddsd, written by the last one).
# The loop repeats until %rcx equals %r15.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
# LLVM-MCA-BEGIN
|
||||
.L19:
|
||||
vmovupd (%rcx), %ymm4
|
||||
vmovupd 32(%rcx), %ymm13
|
||||
# Start of the serial chain: running sum in xmm0 plus the lowest element.
vaddsd %xmm4, %xmm0, %xmm6
|
||||
vunpckhpd %xmm4, %xmm4, %xmm3
|
||||
vextractf64x2 $0x1, %ymm4, %xmm8
|
||||
vaddsd %xmm6, %xmm3, %xmm7
|
||||
vunpckhpd %xmm8, %xmm8, %xmm11
|
||||
vunpckhpd %xmm13, %xmm13, %xmm1
|
||||
vaddsd %xmm7, %xmm8, %xmm10
|
||||
vextractf64x2 $0x1, %ymm13, %xmm2
|
||||
vunpckhpd %xmm2, %xmm2, %xmm3
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmovupd 64(%rcx), %ymm8
|
||||
vmovupd 96(%rcx), %ymm5
|
||||
vaddsd %xmm13, %xmm12, %xmm0
|
||||
vunpckhpd %xmm8, %xmm8, %xmm12
|
||||
vextractf64x2 $0x1, %ymm8, %xmm14
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vunpckhpd %xmm14, %xmm14, %xmm0
|
||||
vextractf64x2 $0x1, %ymm5, %xmm9
|
||||
vaddsd %xmm4, %xmm2, %xmm6
|
||||
# Pointer advance by 128 bytes (written as subtraction of -128); all loads
# of this iteration have already been issued above.
subq $-128, %rcx
|
||||
vaddsd %xmm3, %xmm6, %xmm7
|
||||
vaddsd %xmm8, %xmm7, %xmm11
|
||||
vunpckhpd %xmm5, %xmm5, %xmm7
|
||||
vaddsd %xmm11, %xmm12, %xmm13
|
||||
vunpckhpd %xmm9, %xmm9, %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm1
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm3
|
||||
vaddsd %xmm3, %xmm7, %xmm8
|
||||
vaddsd %xmm8, %xmm9, %xmm11
|
||||
# End of the chain: the running sum is written back to xmm0.
vaddsd %xmm12, %xmm11, %xmm0
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
@@ -0,0 +1,20 @@
|
||||
# x86-64 (AT&T syntax) vectorized sum-reduction loop body (GCC output),
# wrapped in IACA byte markers.
# Each iteration adds eight 32-byte ymm vectors (256 bytes) read via %rcx.
# The eight vaddpd instructions form one serial dependency chain
# (ymm3 -> ymm4 -> ymm5 -> ymm6 -> ymm8 -> ymm9 -> ymm10 -> ymm11 -> ymm3);
# %ymm3 carries the partial sums into the next iteration.
# The loop repeats until %rcx equals %r15.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vaddpd (%rcx), %ymm3, %ymm4
|
||||
# %rcx is advanced right after the first load, so the remaining seven
# loads address the same 256-byte chunk with negative displacements.
addq $256, %rcx
|
||||
vaddpd -224(%rcx), %ymm4, %ymm5
|
||||
vaddpd -192(%rcx), %ymm5, %ymm6
|
||||
vaddpd -160(%rcx), %ymm6, %ymm8
|
||||
vaddpd -128(%rcx), %ymm8, %ymm9
|
||||
vaddpd -96(%rcx), %ymm9, %ymm10
|
||||
vaddpd -64(%rcx), %ymm10, %ymm11
|
||||
vaddpd -32(%rcx), %ymm11, %ymm3
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
# x86-64 (AT&T syntax) vectorized sum-reduction loop body (icc output,
# see the compiler's own "#76.5" source-line annotations), wrapped in IACA
# byte markers.
# Each iteration adds four 64-byte zmm vectors read from %r13 + %rax*8 into
# four separate accumulators (%zmm4, %zmm3, %zmm2, %zmm1). Each accumulator
# depends only on its own previous value, so the four additions are
# independent of each other within an iteration.
# %rax is an element index (scaled by 8 in the addressing mode) and is
# advanced by 32 per iteration; the loop repeats while %rax is below %r14
# (unsigned compare, jb).
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5
|
||||
vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5
|
||||
vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5
|
||||
addq $32, %rax #76.5
|
||||
cmpq %r14, %rax #76.5
|
||||
jb ..B1.38 # Prob 82% #76.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
@@ -0,0 +1,57 @@
|
||||
// AArch64 vectorized sum-reduction loop body (clang output) delimited by
// comment-style OSACA markers.
// Each iteration issues 16 ldp instructions of 32 bytes each, covering the
// 512 bytes at offsets -256 ... +255 relative to x9, and then advances x9
// by 512.
// v0-v3 are read and updated by fadd in every iteration and thus carry the
// partial sums across iterations; v4-v7 and v16-v19 are reloaded and reused
// as temporaries for pairwise additions before being folded into v0-v3.
// x10 is incremented by 8 with a flag-setting adds; the loop repeats until
// the result is zero. x8 is incremented by 64 but not otherwise used inside
// the loop.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q4, q5, [x9, #-256]
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v1.2d, v5.2d, v1.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-128]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q6, q7, [x9, #-224]
|
||||
fadd v2.2d, v6.2d, v2.2d
|
||||
fadd v3.2d, v7.2d, v3.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q18, q19, [x9, #-96]
|
||||
ldp q16, q17, [x9]
|
||||
add x8, x8, #64 // =64
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
ldp q6, q7, [x9, #-32]
|
||||
ldp q18, q19, [x9, #32]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
ldp q4, q5, [x9, #-64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #128]
|
||||
fadd v0.2d, v0.2d, v16.2d
|
||||
fadd v1.2d, v1.2d, v17.2d
|
||||
ldp q16, q17, [x9, #192]
|
||||
ldp q18, q19, [x9, #96]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q18, q19, [x9, #160]
|
||||
fadd v2.2d, v2.2d, v18.2d
|
||||
fadd v3.2d, v3.2d, v19.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q18, q19, [x9, #224]
|
||||
// All loads of this iteration are done; move x9 to the next 512-byte chunk.
add x9, x9, #512 // =512
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
// adds sets the flags consumed by b.ne: loop until x10 becomes zero.
adds x10, x10, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
@@ -0,0 +1,47 @@
|
||||
// AArch64 sum-reduction loop body (GCC output) delimited by comment-style
// OSACA markers.
// Each iteration loads eight 16-byte q registers (128 bytes), splits every
// vector into its two doubles with dup (lane 0 and lane 1), and adds the
// 16 scalars one after another with fadd. The 16 fadd instructions form a
// single serial dependency chain; d8 carries the running sum across
// iterations (read by the first fadd, written by the last one).
// Addressing: x17 is a copy of x16 that is post-incremented by the first
// load; x16 itself is advanced by 128 early, so later loads use negative
// offsets from x16 or positive offsets from x17.
// The loop repeats until x16 equals x22.
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
// Post-indexed load: reads from x17, then x17 += 16.
ldr q4, [x17], 16
|
||||
ldr q5, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q3, [x16, -80]
|
||||
ldr q2, [x16, -64]
|
||||
ldr q0, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q7, [x16, -16]
|
||||
dup d16, v4.d[0]
|
||||
dup d6, v4.d[1]
|
||||
ldr q4, [x17, 16]
|
||||
dup d22, v5.d[0]
|
||||
dup d5, v5.d[1]
|
||||
dup d20, v3.d[0]
|
||||
dup d3, v3.d[1]
|
||||
dup d19, v2.d[0]
|
||||
dup d2, v2.d[1]
|
||||
dup d21, v4.d[0]
|
||||
dup d4, v4.d[1]
|
||||
// Start of the serial chain: running sum d8 plus the first element.
fadd d10, d8, d16
|
||||
dup d18, v0.d[0]
|
||||
dup d0, v0.d[1]
|
||||
// d8 is reused as a temporary here; its old value was consumed by the
// fadd above and the new running sum is written at the end of the chain.
dup d8, v1.d[0]
|
||||
dup d1, v1.d[1]
|
||||
dup d17, v7.d[0]
|
||||
dup d7, v7.d[1]
|
||||
fadd d23, d6, d10
|
||||
fadd d24, d23, d22
|
||||
fadd d25, d5, d24
|
||||
fadd d26, d25, d21
|
||||
fadd d27, d4, d26
|
||||
fadd d28, d27, d20
|
||||
fadd d29, d3, d28
|
||||
fadd d30, d29, d19
|
||||
fadd d31, d2, d30
|
||||
fadd d16, d31, d18
|
||||
fadd d6, d0, d16
|
||||
fadd d22, d6, d8
|
||||
fadd d5, d1, d22
|
||||
fadd d20, d5, d17
|
||||
fadd d8, d7, d20
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
@@ -0,0 +1,23 @@
|
||||
// AArch64 vectorized sum-reduction loop body (GCC output) delimited by
// comment-style OSACA markers.
// Each iteration loads eight 16-byte q registers (128 bytes) and adds them
// with eight fadd .2d instructions that form one serial dependency chain
// (v1 -> v22 -> v24 -> v25 -> v26 -> v27 -> v28 -> v29 -> v1); v1 carries
// the partial sums into the next iteration.
// Addressing: x17 is a copy of x16 that is post-incremented by the first
// load; x16 itself is advanced by 128 early, so later loads use negative
// offsets from x16 or a positive offset from x17.
// The loop repeats until x16 equals x22.
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
// Post-indexed load: reads from x17, then x17 += 16.
ldr q10, [x17], 16
|
||||
ldr q16, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q17, [x16, -80]
|
||||
ldr q18, [x16, -64]
|
||||
ldr q19, [x16, -48]
|
||||
ldr q20, [x16, -32]
|
||||
ldr q21, [x16, -16]
|
||||
fadd v22.2d, v1.2d, v10.2d
|
||||
ldr q23, [x17, 16]
|
||||
fadd v24.2d, v22.2d, v16.2d
|
||||
fadd v25.2d, v24.2d, v23.2d
|
||||
fadd v26.2d, v25.2d, v17.2d
|
||||
fadd v27.2d, v26.2d, v18.2d
|
||||
fadd v28.2d, v27.2d, v19.2d
|
||||
fadd v29.2d, v28.2d, v20.2d
|
||||
fadd v1.2d, v29.2d, v21.2d
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# x86-64 (AT&T syntax) scalar sum-reduction loop body (GCC output)
# delimited by comment-style OSACA markers.
# Each iteration loads 16 doubles (128 bytes) one at a time with vmovsd and
# adds them with 16 vaddsd instructions that form a single serial
# dependency chain. %xmm7 carries the running sum across iterations (read
# by the first vaddsd, written by the last one).
# %r10 is advanced by 128 after the first two loads, so the remaining loads
# use negative displacements. The loop repeats until %r10 equals %r14.
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovsd (%r10), %xmm8
|
||||
vmovsd 8(%r10), %xmm10
|
||||
# Subtracting -128 advances the pointer by 128 bytes.
subq $-128, %r10
|
||||
vmovsd -112(%r10), %xmm12
|
||||
vmovsd -104(%r10), %xmm14
|
||||
vmovsd -96(%r10), %xmm1
|
||||
vmovsd -88(%r10), %xmm2
|
||||
vmovsd -80(%r10), %xmm3
|
||||
vmovsd -72(%r10), %xmm6
|
||||
# Start of the serial chain: running sum xmm7 plus the first element.
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vmovsd -64(%r10), %xmm8
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vmovsd -56(%r10), %xmm10
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vmovsd -48(%r10), %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vmovsd -40(%r10), %xmm14
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vmovsd -32(%r10), %xmm1
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vmovsd -24(%r10), %xmm2
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
vmovsd -16(%r10), %xmm3
|
||||
vaddsd %xmm5, %xmm6, %xmm7
|
||||
vmovsd -8(%r10), %xmm6
|
||||
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
# End of the chain: the running sum is written back to xmm7.
vaddsd %xmm5, %xmm6, %xmm7
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
@@ -0,0 +1,14 @@
|
||||
# x86-64 (AT&T syntax) vectorized sum-reduction loop body (GCC output)
# delimited by comment-style OSACA markers.
# Each iteration adds eight 16-byte xmm vectors (128 bytes) read via %r10.
# The eight vaddpd instructions form one serial dependency chain
# (xmm3 -> xmm1 -> xmm4 -> xmm5 -> xmm6 -> xmm8 -> xmm9 -> xmm10 -> xmm3);
# %xmm3 carries the partial sums into the next iteration.
# The loop repeats until %r10 equals %r14.
# OSACA-BEGIN
|
||||
.L19:
|
||||
vaddpd (%r10), %xmm3, %xmm1
|
||||
# Subtracting -128 advances the pointer by 128 bytes; the remaining loads
# address the same chunk with negative displacements.
subq $-128, %r10
|
||||
vaddpd -112(%r10), %xmm1, %xmm4
|
||||
vaddpd -96(%r10), %xmm4, %xmm5
|
||||
vaddpd -80(%r10), %xmm5, %xmm6
|
||||
vaddpd -64(%r10), %xmm6, %xmm8
|
||||
vaddpd -48(%r10), %xmm8, %xmm9
|
||||
vaddpd -32(%r10), %xmm9, %xmm10
|
||||
vaddpd -16(%r10), %xmm10, %xmm3
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
36
examples/triad/triad.s.csx.gcc.s
Normal file
36
examples/triad/triad.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# x86-64 (AT&T syntax) triad loop body (GCC output), wrapped in IACA byte
# markers.
# Each iteration handles 256 bytes per stream as eight 32-byte ymm slices:
#   - vmovupd loads a slice from (%r14,%rsi),
#   - vfmadd213pd multiplies it by %ymm6 and adds the slice at
#     (%r13,%rsi)  (dst = %ymm6 * dst + mem),
#   - vmovupd stores the result to (%r12,%rsi).
# %ymm6 is only read inside the loop. %rsi is the running byte offset; the
# loop repeats until %rsi equals %rcx.
# NOTE(review): this matches the STREAM triad shape a[i] = b[i] + s * c[i]
# with the scalar presumably broadcast into %ymm6 outside the loop - confirm
# against the C source.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r14,%rsi), %ymm14
|
||||
vmovupd 32(%r14,%rsi), %ymm15
|
||||
vmovupd 64(%r14,%rsi), %ymm1
|
||||
vmovupd 96(%r14,%rsi), %ymm0
|
||||
vmovupd 128(%r14,%rsi), %ymm3
|
||||
vmovupd 160(%r14,%rsi), %ymm4
|
||||
vmovupd 192(%r14,%rsi), %ymm5
|
||||
vmovupd 224(%r14,%rsi), %ymm7
|
||||
# ymm14 = ymm6 * ymm14 + mem[%r13+%rsi]; the following seven FMAs repeat
# this for the remaining slices.
vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7
|
||||
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
addq $256, %rsi
|
||||
cmpq %rsi, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/triad/triad.s.csx.icc.s
Normal file
16
examples/triad/triad.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
# x86-64 (AT&T syntax) triad loop body (icc output, see the compiler's own
# "#78.5" source-line annotations), wrapped in IACA byte markers.
# Each iteration handles one 64-byte zmm vector: it loads from
# %r13 + %rax*8, computes  zmm1 = zmm2 * zmm1 + mem[%rcx + %rax*8]  with
# vfmadd213pd, and stores the result to %r14 + %rax*8.
# %zmm2 is only read inside the loop. %rax is an element index (scaled by 8
# in the addressing mode) advanced by 8 per iteration; the loop repeats
# while %rax is below %r12 (unsigned compare, jb).
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r13,%rax,8), %zmm1 #78.5
|
||||
vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5
|
||||
vmovupd %zmm1, (%r14,%rax,8) #78.5
|
||||
addq $8, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
118
examples/triad/triad.s.tx2.clang.s
Normal file
118
examples/triad/triad.s.tx2.clang.s
Normal file
@@ -0,0 +1,118 @@
|
||||
// AArch64 triad loop body (clang output) delimited by comment-style OSACA
// markers.
// Each iteration handles 512 bytes per stream: 16 ldp loads relative to x9,
// 16 ldp loads relative to x10 and 16 stp stores relative to x11 (offsets
// -256 ... +224 in steps of 32), after which all three pointers advance by
// 512. Every fmla computes  (x9 value) += (x10 value) * multiplier.
// The multiplier is repeatedly reloaded from the stack slot [sp, #80]
// (annotated by the compiler as "16-byte Folded Reload") into whichever
// vector register is free at that point. The first four fmla instructions
// use the value left in v16 by the reload near the end of the previous
// iteration; NOTE(review): on first entry v16 must have been set up outside
// this excerpt.
// x12 is incremented by 8 with a flag-setting adds; the loop repeats until
// the result is zero. x8 is incremented by 64 but not otherwise used inside
// the loop.
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q2, q3, [x9, #-256]
|
||||
ldp q0, q1, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v2.2d, v4.2d, v16.2d
|
||||
fmla v3.2d, v5.2d, v16.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fmla v0.2d, v6.2d, v16.2d
|
||||
fmla v1.2d, v7.2d, v16.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
ldp q6, q7, [x9, #-192]
|
||||
// v16 is overwritten with stream data here, hence the reloads that follow.
ldp q16, q17, [x10, #-192]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v6.2d, v16.2d, v20.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q4, q5, [x9, #-160]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v7.2d, v17.2d, v16.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v4.2d, v18.2d, v16.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v5.2d, v19.2d, v16.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
ldp q17, q19, [x9, #-128]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v17.2d, v20.2d, v24.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q16, q18, [x9, #-96]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v19.2d, v21.2d, v20.2d
|
||||
stp q17, q19, [x11, #-128]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v16.2d, v22.2d, v20.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v18.2d, v23.2d, v20.2d
|
||||
stp q16, q18, [x11, #-96]
|
||||
ldp q20, q22, [x9, #-64]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v20.2d, v24.2d, v28.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q21, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v22.2d, v25.2d, v24.2d
|
||||
stp q20, q22, [x11, #-64]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v21.2d, v26.2d, v24.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q28, q29, [x10]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v23.2d, v27.2d, v24.2d
|
||||
stp q21, q23, [x11, #-32]
|
||||
ldp q24, q25, [x9]
|
||||
fmla v24.2d, v28.2d, v8.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fmla v25.2d, v29.2d, v28.2d
|
||||
stp q24, q25, [x11]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v26.2d, v30.2d, v28.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q13, q14, [x10, #128]
|
||||
ldr q2, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q1, q3, [x10, #192]
|
||||
fmla v27.2d, v31.2d, v28.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
fmla v28.2d, v8.2d, v9.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q6, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q5, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v29.2d, v10.2d, v8.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v30.2d, v11.2d, v8.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
// This reload of v16 serves the last two fmla of this iteration and the
// first four fmla of the next one.
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
add x8, x8, #64 // =64
|
||||
fmla v31.2d, v12.2d, v8.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q8, q10, [x9, #128]
|
||||
fmla v8.2d, v13.2d, v9.2d
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q11, q12, [x9, #160]
|
||||
fmla v10.2d, v14.2d, v9.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fmla v12.2d, v14.2d, v2.2d
|
||||
ldp q2, q0, [x9, #192]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v2.2d, v1.2d, v6.2d
|
||||
ldp q1, q4, [x9, #224]
|
||||
fmla v0.2d, v3.2d, v5.2d
|
||||
stp q2, q0, [x11, #192]
|
||||
ldp q3, q5, [x10, #224]
|
||||
fmla v11.2d, v13.2d, v9.2d
|
||||
stp q11, q12, [x11, #160]
|
||||
fmla v1.2d, v3.2d, v16.2d
|
||||
fmla v4.2d, v5.2d, v16.2d
|
||||
stp q1, q4, [x11, #224]
|
||||
// Advance the store pointer and both load pointers to the next chunk.
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
// adds sets the flags consumed by b.ne: loop until x12 becomes zero.
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/triad/triad.s.tx2.gcc.s
Normal file
45
examples/triad/triad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// AArch64 triad loop body (GCC output) delimited by comment-style OSACA
// markers.
// Each iteration handles 128 bytes per stream as eight 16-byte (2 x double)
// slices. For every slice it loads one vector via base x20 and one via base
// x21, computes  (x20 value) += v3 * (x21 value)  with fmla, and stores the
// result relative to base x19. v3 is only read inside the loop.
// x10 is the running byte offset; x0, x7, x6, x5, x4, x11 and x2 hold
// x10+16 ... x10+112. The loop repeats until x10 equals x24.
// NOTE(review): this matches the STREAM triad shape a[i] = b[i] + s * c[i]
// with the scalar presumably broadcast into v3 outside the loop - confirm
// against the C source.
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x0, x10, 16
|
||||
ldr q23, [x20, x10]
|
||||
ldr q24, [x21, x10]
|
||||
add x7, x10, 32
|
||||
ldr q25, [x20, x0]
|
||||
ldr q26, [x21, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q27, [x20, x7]
|
||||
ldr q28, [x21, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q29, [x20, x6]
|
||||
ldr q30, [x21, x6]
|
||||
add x2, x10, 112
|
||||
// v23 (loaded via x20) += v3 * v24 (loaded via x21); the other fmla
// instructions below follow the same operand pattern.
fmla v23.2d, v3.2d, v24.2d
|
||||
ldr q31, [x20, x5]
|
||||
ldr q4, [x21, x5]
|
||||
fmla v25.2d, v3.2d, v26.2d
|
||||
ldr q2, [x20, x4]
|
||||
ldr q5, [x21, x4]
|
||||
fmla v27.2d, v3.2d, v28.2d
|
||||
ldr q1, [x20, x11]
|
||||
ldr q6, [x21, x11]
|
||||
fmla v29.2d, v3.2d, v30.2d
|
||||
ldr q0, [x20, x2]
|
||||
ldr q7, [x21, x2]
|
||||
fmla v31.2d, v3.2d, v4.2d
|
||||
fmla v2.2d, v3.2d, v5.2d
|
||||
fmla v1.2d, v3.2d, v6.2d
|
||||
str q23, [x19, x10]
|
||||
// x10 is advanced to the next 128-byte chunk here; the remaining stores
// use the offsets computed at the top of the iteration (x0, x7, ...).
add x10, x10, 128
|
||||
fmla v0.2d, v3.2d, v7.2d
|
||||
str q25, [x19, x0]
|
||||
str q27, [x19, x7]
|
||||
str q29, [x19, x6]
|
||||
str q31, [x19, x5]
|
||||
str q2, [x19, x4]
|
||||
str q1, [x19, x11]
|
||||
str q0, [x19, x2]
|
||||
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/triad/triad.s.zen.gcc.s
Normal file
30
examples/triad/triad.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# x86-64 (AT&T syntax) triad loop body (GCC output) delimited by
# comment-style OSACA markers.
# Each iteration handles 128 bytes per stream as eight 16-byte xmm slices:
#   - vmovups loads a slice from (%r13,%rax),
#   - vfmadd213pd multiplies it by %xmm3 and adds the slice at
#     (%r12,%rax)  (dst = %xmm3 * dst + mem),
#   - vmovups stores the result to (%rbp,%rax).
# %xmm3 is only read inside the loop. %rax is the running byte offset; the
# loop repeats until %rax equals %rbx.
# NOTE(review): this matches the STREAM triad shape a[i] = b[i] + s * c[i]
# with the scalar presumably broadcast into %xmm3 outside the loop - confirm
# against the C source.
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%r13,%rax), %xmm12
|
||||
vmovups 16(%r13,%rax), %xmm13
|
||||
vmovups 32(%r13,%rax), %xmm14
|
||||
vmovups 48(%r13,%rax), %xmm15
|
||||
vmovups 64(%r13,%rax), %xmm1
|
||||
vmovups 80(%r13,%rax), %xmm0
|
||||
vmovups 96(%r13,%rax), %xmm4
|
||||
vmovups 112(%r13,%rax), %xmm5
|
||||
# xmm12 = xmm3 * xmm12 + mem[%r12+%rax]; the following seven FMAs repeat
# this for the remaining slices.
vfmadd213pd (%r12,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5
|
||||
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
# Subtracting -128 advances the byte offset by 128.
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/update/update.s.csx.gcc.s
Normal file
28
examples/update/update.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
# x86-64 (AT&T syntax) in-place vector update loop body (GCC output),
# wrapped in IACA byte markers.
# Each iteration handles 256 bytes as eight 32-byte ymm slices: every slice
# at (%rcx) is multiplied by %ymm3 and the product is stored back to the
# same address it was loaded from. %ymm3 is only read inside the loop.
# %rcx is advanced by 256 per iteration; the loop repeats until %rcx equals
# %r15.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmulpd (%rcx), %ymm3, %ymm12
|
||||
vmulpd 32(%rcx), %ymm3, %ymm13
|
||||
vmulpd 64(%rcx), %ymm3, %ymm14
|
||||
vmulpd 96(%rcx), %ymm3, %ymm15
|
||||
vmulpd 128(%rcx), %ymm3, %ymm0
|
||||
vmulpd 160(%rcx), %ymm3, %ymm1
|
||||
vmulpd 192(%rcx), %ymm3, %ymm7
|
||||
vmulpd 224(%rcx), %ymm3, %ymm4
|
||||
vmovupd %ymm12, (%rcx)
|
||||
vmovupd %ymm13, 32(%rcx)
|
||||
vmovupd %ymm14, 64(%rcx)
|
||||
vmovupd %ymm15, 96(%rcx)
|
||||
vmovupd %ymm0, 128(%rcx)
|
||||
vmovupd %ymm1, 160(%rcx)
|
||||
vmovupd %ymm7, 192(%rcx)
|
||||
vmovupd %ymm4, 224(%rcx)
|
||||
addq $256, %rcx
|
||||
cmpq %r15, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/update/update.s.csx.icc.s
Normal file
17
examples/update/update.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
# x86-64 (AT&T syntax) in-place vector update loop body (icc output, see
# the compiler's own "#75.5" source-line annotations), wrapped in IACA byte
# markers.
# Each iteration handles two 64-byte zmm vectors: the data at
# %r13 + %rax*8 (and +64) is multiplied by %zmm3 and stored back to the same
# addresses. %zmm3 is only read inside the loop.
# %rax is an element index (scaled by 8 in the addressing mode) advanced by
# 16 per iteration; the loop repeats while %rax is below %r14 (unsigned
# compare, jb).
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5
|
||||
vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #75.5
|
||||
vmovupd %zmm2, 64(%r13,%rax,8) #75.5
|
||||
addq $16, %rax #75.5
|
||||
cmpq %r14, %rax #75.5
|
||||
jb ..B1.38 # Prob 82% #75.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/update/update.s.tx2.clang.s
Normal file
15
examples/update/update.s.tx2.clang.s
Normal file
@@ -0,0 +1,15 @@
|
||||
// AArch64 in-place vector update loop body (clang output) delimited by
// comment-style OSACA markers.
// Each iteration handles 64 bytes: two ldp loads fetch the 32 bytes at
// [x8, #-32] and the 32 bytes at [x8], every vector is multiplied by v26
// with fmul, and two stp stores write the products back to the addresses
// they were loaded from. v26 is only read inside the loop.
// x9 is incremented by 1 with a flag-setting adds; the loop repeats until
// the result is zero.
// OSACA-BEGIN
|
||||
.LBB1_32: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x8]
|
||||
ldp q2, q3, [x8, #-32]
|
||||
fmul v2.2d, v2.2d, v26.2d
|
||||
fmul v3.2d, v3.2d, v26.2d
|
||||
stp q2, q3, [x8, #-32]
|
||||
fmul v0.2d, v0.2d, v26.2d
|
||||
fmul v1.2d, v1.2d, v26.2d
|
||||
// Post-indexed store: writes at x8, then advances x8 by 64 for the next
// iteration.
stp q0, q1, [x8], #64
|
||||
adds x9, x9, #1 // =1
|
||||
b.ne .LBB1_32
|
||||
// OSACA-END
|
||||
31
examples/update/update.s.tx2.gcc.s
Normal file
31
examples/update/update.s.tx2.gcc.s
Normal file
@@ -0,0 +1,31 @@
|
||||
// AArch64 in-place vector update loop body (GCC output) delimited by
// comment-style OSACA markers.
// Each iteration handles 128 bytes as eight 16-byte (2 x double) slices:
// every slice is loaded, multiplied by v2 with fmul and stored back to the
// address it was loaded from. v2 is only read inside the loop.
// Addressing: x17 is a copy of the chunk base that is post-incremented by
// the first store; x16 is advanced by 128 early, so the remaining accesses
// use negative offsets from x16 or a positive offset from x17.
// The loop repeats until x16 equals x22.
// OSACA-BEGIN
|
||||
.L17:
|
||||
ldr q23, [x16]
|
||||
mov x17, x16
|
||||
add x16, x16, 128
|
||||
fmul v24.2d, v23.2d, v2.2d
|
||||
// Post-indexed store: writes at x17 (the old chunk base), then x17 += 16.
str q24, [x17], 16
|
||||
ldr q25, [x16, -112]
|
||||
fmul v26.2d, v25.2d, v2.2d
|
||||
str q26, [x16, -112]
|
||||
ldr q27, [x17, 16]
|
||||
fmul v28.2d, v27.2d, v2.2d
|
||||
str q28, [x17, 16]
|
||||
ldr q29, [x16, -80]
|
||||
ldr q30, [x16, -64]
|
||||
ldr q31, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q0, [x16, -16]
|
||||
fmul v5.2d, v29.2d, v2.2d
|
||||
fmul v4.2d, v30.2d, v2.2d
|
||||
fmul v3.2d, v31.2d, v2.2d
|
||||
fmul v6.2d, v1.2d, v2.2d
|
||||
fmul v7.2d, v0.2d, v2.2d
|
||||
str q5, [x16, -80]
|
||||
str q4, [x16, -64]
|
||||
str q3, [x16, -48]
|
||||
str q6, [x16, -32]
|
||||
str q7, [x16, -16]
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/update/update.s.zen.gcc.s
Normal file
22
examples/update/update.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# x86-64 (AT&T syntax) in-place vector update loop body (GCC output)
# delimited by comment-style OSACA markers.
# Each iteration handles 128 bytes as eight 16-byte xmm slices: every slice
# is multiplied by %xmm3 and the product is stored back to the address it
# was loaded from. %xmm3 is only read inside the loop.
# %r10 is advanced by 128 right after the first load, so all later accesses
# (including the store of the first product at -128) use negative
# displacements. The loop repeats until %r10 equals %r14.
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmulpd (%r10), %xmm3, %xmm11
|
||||
# Subtracting -128 advances the pointer by 128 bytes.
subq $-128, %r10
|
||||
vmulpd -112(%r10), %xmm3, %xmm12
|
||||
vmulpd -96(%r10), %xmm3, %xmm13
|
||||
vmulpd -80(%r10), %xmm3, %xmm14
|
||||
vmulpd -64(%r10), %xmm3, %xmm15
|
||||
vmulpd -48(%r10), %xmm3, %xmm0
|
||||
vmovups %xmm11, -128(%r10)
|
||||
vmulpd -32(%r10), %xmm3, %xmm7
|
||||
vmovups %xmm12, -112(%r10)
|
||||
vmulpd -16(%r10), %xmm3, %xmm1
|
||||
vmovups %xmm13, -96(%r10)
|
||||
vmovups %xmm14, -80(%r10)
|
||||
vmovups %xmm15, -64(%r10)
|
||||
vmovups %xmm0, -48(%r10)
|
||||
vmovups %xmm7, -32(%r10)
|
||||
vmovups %xmm1, -16(%r10)
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
Reference in New Issue
Block a user