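# File: OSACA/tests/test_files/triad_x86_iaca.s
# Listing metadata: dated 2020-02-20 12:07:20 +01:00; 354 lines, 6.8 KiB.
# Content is x86-64 assembly in AT&T (GAS) syntax emitted by GCC; the
# "ArmAsm" label shown by the file listing is a language misdetection.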

# Compiler-generated translation unit for triad.c (x86-64, AT&T syntax).
.file "triad.c"
# Read-only, mergeable string section holding the result-row format.
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
# printf format used by triad for one result row: five floating-point
# columns followed by two integer columns.
.LC9:
.string "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
# -----------------------------------------------------------------------
# triad -- benchmark kernel  dst[i] = add[i] + f1[i] * f2[i]
#
# ABI:  System V AMD64, AT&T syntax, AVX + FMA instructions.
# In:   edi = N, the element count (doubles per array).
# Out:  nothing meaningful; the function ends with a tail call to free.
# Calls: aligned_alloc, timing, dummy, printf, free.
#
# Register roles after the prologue:
#   ebx  = N
#   r14  = destination array   (first aligned_alloc result)
#   r12  = addend array        (second aligned_alloc result)
#   r13  = factor array        (third aligned_alloc result)
#   r15  = factor array        (fourth aligned_alloc result; holds N*8,
#                               the byte size, until the last allocation)
#   r10d = N >> 2, trip count of the four-wide vector loops
#   r11d = repetition counter of the current measurement
#
# Frame slots (rbp-relative):
#   -56  second out-parameter handed to timing
#   -64  first out-parameter of the closing timing call
#   -72  first out-parameter of the opening timing call
#   -84  repetition count (starts at one, doubled per measurement)
#   -88  spill slot for r10d across calls
#   -92  spill slot for r11d across the dummy call
#   -96  N - 1
#   -100 N & -4, the index of the first scalar-tail element
#
# NOTE: the allocation results are used without a NULL check, exactly as
# the compiler emitted them.
#
# FIX: the IACA start/end markers write 111 and 222 into ebx, which holds
# N.  N is rebuilt from the -96 slot right after the end marker so that
# the scalar tail, later repetitions and the printed size use the real N.
# The marker byte patterns and the marked loop body are left untouched.
# -----------------------------------------------------------------------
.text
.p2align 4,,15
.globl triad
.type triad, @function
triad:
.LFB24:
.cfi_startproc
# Prologue: realign the stack to 32 bytes, keeping the incoming stack
# address in r13 so the epilogue can restore it.
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
movslq %edi, %rax # rax = N, sign-extended to 64 bits
movl $64, %edi # alignment argument for the first aligned_alloc
leaq 16(%rsp), %r13 # r13 = stack address before the push above
.cfi_def_cfa 13, 0
andq $-32, %rsp
pushq -8(%r13) # copy of the return address into the aligned frame
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r15
.cfi_escape 0x10,0xf,0x2,0x76,0x78
leaq 0(,%rax,8), %r15 # r15 = N * 8, byte size of each array
pushq %r14
movq %r15, %rsi # size argument for the first aligned_alloc
pushq %r13
.cfi_escape 0xf,0x3,0x76,0x68,0x6
.cfi_escape 0x10,0xe,0x2,0x76,0x70
pushq %r12
pushq %rbx
.cfi_escape 0x10,0xc,0x2,0x76,0x60
.cfi_escape 0x10,0x3,0x2,0x76,0x58
movq %rax, %rbx # ebx = N from here on
subq $72, %rsp # room for the frame slots listed in the header
# Four allocations of N*8 bytes, each requested with 64-byte alignment.
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r14 # destination array
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r12 # addend array
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r13 # factor array
call aligned_alloc
movq %rax, %r15 # factor array; the byte size is no longer needed
leal -1(%rbx), %eax
movl %eax, -96(%rbp) # keep N - 1 for the later small-N checks
# Initialisation: fill all four arrays with the constant at .LC0/.LC1.
testl %ebx, %ebx
jle .L2 # N <= 0: nothing to initialise
cmpl $2, %eax
jbe .L14 # N <= 3: scalar path only
movl %ebx, %esi
vmovapd .LC0(%rip), %ymm0 # four copies of the fill value
xorl %eax, %eax # byte offset into the arrays
xorl %ecx, %ecx # vector iteration counter
shrl $2, %esi # esi = N / 4 vector iterations
.p2align 4,,10
.p2align 3
# Vector fill loop: four doubles per array per iteration.
.L4:
addl $1, %ecx
vmovapd %ymm0, (%r15,%rax)
vmovapd %ymm0, 0(%r13,%rax)
vmovapd %ymm0, (%r12,%rax)
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %esi
ja .L4
movl %ebx, %eax
andl $-4, %eax # eax = index of the first element not yet filled
cmpl %eax, %ebx
je .L26 # N is a multiple of four: no scalar remainder
vzeroupper
# Scalar fill of the remaining one to three elements, starting at eax.
.L3:
vmovsd .LC1(%rip), %xmm0
movslq %eax, %rcx
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ecx, %ebx
jle .L2
movslq %ecx, %rcx
addl $2, %eax
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L2
cltq
vmovsd %xmm0, (%r15,%rax,8)
vmovsd %xmm0, 0(%r13,%rax,8)
vmovsd %xmm0, (%r12,%rax,8)
vmovsd %xmm0, (%r14,%rax,8)
# Measurement setup: repetition count starts at one; precompute the
# vector trip count and the first scalar-tail index.
.L2:
movl %ebx, %eax
movl $1, -84(%rbp)
movl %ebx, %r10d
andl $-4, %eax
shrl $2, %r10d
movl %eax, -100(%rbp)
.p2align 4,,10
.p2align 3
# Outer measurement loop: one pass per repetition count.
.L13:
leaq -56(%rbp), %rsi
leaq -72(%rbp), %rdi
movl %r10d, -88(%rbp) # r10 is caller-saved: spill across the call
call timing
movl -88(%rbp), %r10d
xorl %r11d, %r11d # repetition counter = 0
.p2align 4,,10
.p2align 3
# Repetition loop: one full sweep over the arrays per iteration.
.L12:
vmovsd (%r14), %xmm0
vxorpd %xmm7, %xmm7, %xmm7
vucomisd %xmm7, %xmm0
jbe .L6 # skip dummy unless the first destination element is > 0
movq %r14, %rdi
movl %r11d, -92(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call dummy
movl -92(%rbp), %r11d
movl -88(%rbp), %r10d
.L6:
testl %ebx, %ebx
jle .L8 # N <= 0: empty sweep
cmpl $2, -96(%rbp)
jbe .L15 # N <= 3: scalar tail only, markers are not executed
xorl %eax, %eax # byte offset into the arrays
xorl %ecx, %ecx # vector iteration counter
.p2align 4,,10
.p2align 3
# IACA start marker.  It overwrites ebx (N); see the FIX note after the
# end marker.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# Marked kernel: four elements per iteration,
# dst = factor(r15) * factor(r13) + addend(r12).
.L10:
vmovapd (%r15,%rax), %ymm0
vmovapd (%r12,%rax), %ymm3
addl $1, %ecx
vfmadd132pd 0(%r13,%rax), %ymm3, %ymm0
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %r10d
ja .L10
# IACA end marker.  It overwrites ebx as well.
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# FIX: rebuild N in ebx from the N - 1 slot written during setup.  Only
# the low 32 bits of rbx are read after this point, and the flags are
# rewritten by the compare below before any branch consumes them.
movl -96(%rbp), %ebx
addl $1, %ebx
movl -100(%rbp), %eax # eax = N & -4
cmpl %ebx, %eax
je .L8 # no scalar remainder
# Scalar tail: up to three remaining elements, starting at index eax.
.L9:
movslq %eax, %rcx
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm5
vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ebx, %ecx
jge .L8
movslq %ecx, %rcx
addl $2, %eax
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm6
vfmadd132sd (%r15,%rcx,8), %xmm6, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L8
cltq
vmovsd (%r15,%rax,8), %xmm0
vmovsd (%r12,%rax,8), %xmm4
vfmadd132sd 0(%r13,%rax,8), %xmm4, %xmm0
vmovsd %xmm0, (%r14,%rax,8)
# End of one sweep: next repetition, or close the measurement.
.L8:
addl $1, %r11d
cmpl -84(%rbp), %r11d
jne .L12
leaq -56(%rbp), %rsi
leaq -64(%rbp), %rdi
movl %r11d, -84(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call timing
vmovsd -64(%rbp), %xmm1
vsubsd -72(%rbp), %xmm1, %xmm1 # xmm1 = closing value minus opening value
vmovsd .LC3(%rip), %xmm2 # threshold constant (bit pattern of 0.2)
movl -84(%rbp), %r11d
movl -88(%rbp), %r10d
vucomisd %xmm1, %xmm2
leal (%r11,%r11), %eax # lea leaves the flags of the compare intact
movl %eax, -84(%rbp) # double the repetition count for the next pass
ja .L13 # measured difference below the threshold: measure again
# Report: esi = repetitions of the last pass, edx = N, and the five
# floating-point columns in xmm0..xmm4 for the .LC9 format.
movl %eax, %esi
vxorpd %xmm6, %xmm6, %xmm6
vxorpd %xmm0, %xmm0, %xmm0
movl %ebx, %edx
sarl %esi # undo the doubling applied above
vcvtsi2sd %ebx, %xmm0, %xmm0 # xmm0 = (double) N
movl $.LC9, %edi
movl $5, %eax # variadic call: five vector registers carry arguments
vcvtsi2sd %esi, %xmm6, %xmm6 # xmm6 = (double) repetitions
vmulsd .LC5(%rip), %xmm6, %xmm2
vmovsd .LC4(%rip), %xmm5
vmovsd .LC6(%rip), %xmm7
vmulsd %xmm0, %xmm6, %xmm4
vmulsd %xmm0, %xmm2, %xmm2
vdivsd %xmm1, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
vdivsd %xmm5, %xmm4, %xmm4
vmulsd %xmm7, %xmm2, %xmm3
vaddsd %xmm0, %xmm0, %xmm2
vmulsd .LC8(%rip), %xmm0, %xmm0
vmulsd %xmm6, %xmm2, %xmm2
vmulsd .LC7(%rip), %xmm2, %xmm2
vmulsd %xmm7, %xmm3, %xmm3
vdivsd %xmm5, %xmm0, %xmm0
vdivsd %xmm5, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
call printf
# Release three arrays with ordinary calls; the fourth is released by
# the tail call after the frame has been torn down.
movq %r14, %rdi
call free
movq %r12, %rdi
call free
movq %r13, %rdi
call free
addq $72, %rsp
movq %r15, %rdi # argument for the tail-called free
popq %rbx
popq %r12
popq %r13
.cfi_remember_state
.cfi_def_cfa 13, 0
popq %r14
popq %r15
popq %rbp
leaq -16(%r13), %rsp # back to the pre-alignment stack
.cfi_def_cfa 7, 16
popq %r13
.cfi_def_cfa_offset 8
jmp free
.p2align 4,,10
.p2align 3
# Small-N entry into the scalar compute tail (start at index zero).
.L15:
.cfi_restore_state
xorl %eax, %eax
jmp .L9
# Vector fill covered every element: clear upper ymm state and go on.
.L26:
vzeroupper
jmp .L2
# Small-N entry into the scalar fill (start at index zero).
.L14:
xorl %eax, %eax
jmp .L3
.cfi_endproc
.LFE24:
.size triad, .-triad
# Banner strings printed once by main before the measurements start.
.section .rodata.str1.8
.align 8
# Title line naming the kernel and its per-iteration traffic and work.
.LC10:
.string "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
.align 8
# Column headings for the rows that triad prints with the .LC9 format.
.LC11:
.string "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
# -----------------------------------------------------------------------
# main -- prints the two banner lines, then calls triad once for every
# exponent i from 20 up to and including 35, with the element count
# (int) pow(base, i), where base is the constant at .LC12.
#
# ABI:  System V AMD64, AT&T syntax.
# Out:  eax = 0.
# Uses: ebx as the exponent counter (callee-saved, pushed and popped).
# -----------------------------------------------------------------------
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB25:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movl $.LC10, %edi # absolute 32-bit address of the title string
movl $20, %ebx # first exponent
call puts
movl $.LC11, %edi
call puts
.p2align 4,,10
.p2align 3
# One iteration per exponent value.
.L28:
vxorpd %xmm1, %xmm1, %xmm1 # clear xmm1 before the convert writes its low lane
movq .LC12(%rip), %rax
vcvtsi2sd %ebx, %xmm1, %xmm1 # second pow argument: the exponent as a double
addl $1, %ebx
vmovq %rax, %xmm0 # first pow argument: the base from .LC12
call pow
vcvttsd2si %xmm0, %edi # truncate the result to an int element count
call triad
cmpl $36, %ebx
jne .L28
xorl %eax, %eax # return value 0
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE25:
.size main, .-main
# Floating-point constant pools.  Every 64-bit double is emitted as two
# 32-bit words, low half first (x86 is little-endian).
.section .rodata.cst32,"aM",@progbits,32
.align 32
# Fill value for the vector initialisation in triad: the same double as
# .LC1, repeated four times to span one 32-byte ymm register.
.LC0:
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.section .rodata.cst8,"aM",@progbits,8
.align 8
# Scalar copy of the fill value, used for the remainder elements.
.LC1:
.long 1907715710
.long 1048610426
.align 8
# Threshold compared against the measured time difference in triad;
# bit pattern 0x3FC999999999999A, i.e. 0.2.
.LC3:
.long 2576980378
.long 1070176665
.align 8
# Divisor used in the report arithmetic; bit pattern of 1000.0.
.LC4:
.long 0
.long 1083129856
.align 8
# Multiplier applied to the repetition count; bit pattern of 32.0.
.LC5:
.long 0
.long 1077936128
.align 8
# Scale factor applied twice in the report; bit pattern of 2^-10.
.LC6:
.long 0
.long 1062207488
.align 8
# Scale factor for the 2*N*repetitions term in the report.
# NOTE(review): the bit pattern looks like 1e-6 -- confirm before relying on it.
.LC7:
.long 2696277389
.long 1051772663
.align 8
# Multiplier applied to N in the report; bit pattern of 8.0.
.LC8:
.long 0
.long 1075838976
.align 8
# Base passed to pow in main; bit pattern 0x3FF4CCCCCCCCCCCD, i.e. 1.3.
.LC12:
.long 3435973837
.long 1073007820
# Compiler identification and non-executable-stack marker.
.ident "GCC: (GNU) 7.2.0"
.section .note.GNU-stack,"",@progbits