diff --git a/tests/test_files/triad_x86_unmarked.s b/tests/test_files/triad_x86_unmarked.s
new file mode 100644
index 0000000..c92718d
--- /dev/null
+++ b/tests/test_files/triad_x86_unmarked.s
@@ -0,0 +1,345 @@
+	.file "triad.c"
+	.section .rodata.str1.8,"aMS",@progbits,1
+	.align 8
+.LC9:
+	.string "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
+	.text
+	.p2align 4,,15
+	.globl triad
+	.type triad, @function
+triad:
+.LFB24:
+	.cfi_startproc
+	pushq %r13
+	.cfi_def_cfa_offset 16
+	.cfi_offset 13, -16
+	movslq %edi, %rax
+	movl $64, %edi
+	leaq 16(%rsp), %r13
+	.cfi_def_cfa 13, 0
+	andq $-32, %rsp
+	pushq -8(%r13)
+	pushq %rbp
+	.cfi_escape 0x10,0x6,0x2,0x76,0
+	movq %rsp, %rbp
+	pushq %r15
+	.cfi_escape 0x10,0xf,0x2,0x76,0x78
+	leaq 0(,%rax,8), %r15
+	pushq %r14
+	movq %r15, %rsi
+	pushq %r13
+	.cfi_escape 0xf,0x3,0x76,0x68,0x6
+	.cfi_escape 0x10,0xe,0x2,0x76,0x70
+	pushq %r12
+	pushq %rbx
+	.cfi_escape 0x10,0xc,0x2,0x76,0x60
+	.cfi_escape 0x10,0x3,0x2,0x76,0x58
+	movq %rax, %rbx
+	subq $72, %rsp
+	call aligned_alloc
+	movq %r15, %rsi
+	movl $64, %edi
+	movq %rax, %r14
+	call aligned_alloc
+	movq %r15, %rsi
+	movl $64, %edi
+	movq %rax, %r12
+	call aligned_alloc
+	movq %r15, %rsi
+	movl $64, %edi
+	movq %rax, %r13
+	call aligned_alloc
+	movq %rax, %r15
+	leal -1(%rbx), %eax
+	movl %eax, -96(%rbp)
+	testl %ebx, %ebx
+	jle .L2
+	cmpl $2, %eax
+	jbe .L14
+	movl %ebx, %esi
+	vmovapd .LC0(%rip), %ymm0
+	xorl %eax, %eax
+	xorl %ecx, %ecx
+	shrl $2, %esi
+	.p2align 4,,10
+	.p2align 3
+.L4:
+	addl $1, %ecx
+	vmovapd %ymm0, (%r15,%rax)
+	vmovapd %ymm0, 0(%r13,%rax)
+	vmovapd %ymm0, (%r12,%rax)
+	vmovapd %ymm0, (%r14,%rax)
+	addq $32, %rax
+	cmpl %ecx, %esi
+	ja .L4
+	movl %ebx, %eax
+	andl $-4, %eax
+	cmpl %eax, %ebx
+	je .L26
+	vzeroupper
+.L3:
+	vmovsd .LC1(%rip), %xmm0
+	movslq %eax, %rcx
+	vmovsd %xmm0, (%r15,%rcx,8)
+	vmovsd %xmm0, 0(%r13,%rcx,8)
+	vmovsd %xmm0, (%r12,%rcx,8)
+	vmovsd %xmm0, (%r14,%rcx,8)
+	leal 1(%rax), %ecx
+	cmpl %ecx, %ebx
+	jle .L2
+	movslq %ecx, %rcx
+	addl $2, %eax
+	vmovsd %xmm0, (%r15,%rcx,8)
+	vmovsd %xmm0, 0(%r13,%rcx,8)
+	vmovsd %xmm0, (%r12,%rcx,8)
+	vmovsd %xmm0, (%r14,%rcx,8)
+	cmpl %eax, %ebx
+	jle .L2
+	cltq
+	vmovsd %xmm0, (%r15,%rax,8)
+	vmovsd %xmm0, 0(%r13,%rax,8)
+	vmovsd %xmm0, (%r12,%rax,8)
+	vmovsd %xmm0, (%r14,%rax,8)
+.L2:
+	movl %ebx, %eax
+	movl $1, -84(%rbp)
+	movl %ebx, %r10d
+	andl $-4, %eax
+	shrl $2, %r10d
+	movl %eax, -100(%rbp)
+	.p2align 4,,10
+	.p2align 3
+.L13:
+	leaq -56(%rbp), %rsi
+	leaq -72(%rbp), %rdi
+	movl %r10d, -88(%rbp)
+	call timing
+	movl -88(%rbp), %r10d
+	xorl %r11d, %r11d
+	.p2align 4,,10
+	.p2align 3
+.L12:
+	vmovsd (%r14), %xmm0
+	vxorpd %xmm7, %xmm7, %xmm7
+	vucomisd %xmm7, %xmm0
+	jbe .L6
+	movq %r14, %rdi
+	movl %r11d, -92(%rbp)
+	movl %r10d, -88(%rbp)
+	vzeroupper
+	call dummy
+	movl -92(%rbp), %r11d
+	movl -88(%rbp), %r10d
+.L6:
+	testl %ebx, %ebx
+	jle .L8
+	cmpl $2, -96(%rbp)
+	jbe .L15
+	xorl %eax, %eax
+	xorl %ecx, %ecx
+	.p2align 4,,10
+	.p2align 3
+.L10:
+	vmovapd (%r15,%rax), %ymm0
+	vmovapd (%r12,%rax), %ymm3
+	addl $1, %ecx
+	vfmadd132pd 0(%r13,%rax), %ymm3, %ymm0
+	vmovapd %ymm0, (%r14,%rax)
+	addq $32, %rax
+	cmpl %ecx, %r10d
+	ja .L10
+	movl -100(%rbp), %eax
+	cmpl %ebx, %eax
+	je .L8
+.L9:
+	movslq %eax, %rcx
+	vmovsd 0(%r13,%rcx,8), %xmm0
+	vmovsd (%r12,%rcx,8), %xmm5
+	vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0
+	vmovsd %xmm0, (%r14,%rcx,8)
+	leal 1(%rax), %ecx
+	cmpl %ebx, %ecx
+	jge .L8
+	movslq %ecx, %rcx
+	addl $2, %eax
+	vmovsd 0(%r13,%rcx,8), %xmm0
+	vmovsd (%r12,%rcx,8), %xmm6
+	vfmadd132sd (%r15,%rcx,8), %xmm6, %xmm0
+	vmovsd %xmm0, (%r14,%rcx,8)
+	cmpl %eax, %ebx
+	jle .L8
+	cltq
+	vmovsd (%r15,%rax,8), %xmm0
+	vmovsd (%r12,%rax,8), %xmm4
+	vfmadd132sd 0(%r13,%rax,8), %xmm4, %xmm0
+	vmovsd %xmm0, (%r14,%rax,8)
+.L8:
+	addl $1, %r11d
+	cmpl -84(%rbp), %r11d
+	jne .L12
+	leaq -56(%rbp), %rsi
+	leaq -64(%rbp), %rdi
+	movl %r11d, -84(%rbp)
+	movl %r10d, -88(%rbp)
+	vzeroupper
+	call timing
+	vmovsd -64(%rbp), %xmm1
+	vsubsd -72(%rbp), %xmm1, %xmm1
+	vmovsd .LC3(%rip), %xmm2
+	movl -84(%rbp), %r11d
+	movl -88(%rbp), %r10d
+	vucomisd %xmm1, %xmm2
+	leal (%r11,%r11), %eax
+	movl %eax, -84(%rbp)
+	ja .L13
+	movl %eax, %esi
+	vxorpd %xmm6, %xmm6, %xmm6
+	vxorpd %xmm0, %xmm0, %xmm0
+	movl %ebx, %edx
+	sarl %esi
+	vcvtsi2sd %ebx, %xmm0, %xmm0
+	movl $.LC9, %edi
+	movl $5, %eax
+	vcvtsi2sd %esi, %xmm6, %xmm6
+	vmulsd .LC5(%rip), %xmm6, %xmm2
+	vmovsd .LC4(%rip), %xmm5
+	vmovsd .LC6(%rip), %xmm7
+	vmulsd %xmm0, %xmm6, %xmm4
+	vmulsd %xmm0, %xmm2, %xmm2
+	vdivsd %xmm1, %xmm4, %xmm4
+	vdivsd %xmm1, %xmm2, %xmm2
+	vdivsd %xmm5, %xmm4, %xmm4
+	vmulsd %xmm7, %xmm2, %xmm3
+	vaddsd %xmm0, %xmm0, %xmm2
+	vmulsd .LC8(%rip), %xmm0, %xmm0
+	vmulsd %xmm6, %xmm2, %xmm2
+	vmulsd .LC7(%rip), %xmm2, %xmm2
+	vmulsd %xmm7, %xmm3, %xmm3
+	vdivsd %xmm5, %xmm0, %xmm0
+	vdivsd %xmm5, %xmm4, %xmm4
+	vdivsd %xmm1, %xmm2, %xmm2
+	call printf
+	movq %r14, %rdi
+	call free
+	movq %r12, %rdi
+	call free
+	movq %r13, %rdi
+	call free
+	addq $72, %rsp
+	movq %r15, %rdi
+	popq %rbx
+	popq %r12
+	popq %r13
+	.cfi_remember_state
+	.cfi_def_cfa 13, 0
+	popq %r14
+	popq %r15
+	popq %rbp
+	leaq -16(%r13), %rsp
+	.cfi_def_cfa 7, 16
+	popq %r13
+	.cfi_def_cfa_offset 8
+	jmp free
+	.p2align 4,,10
+	.p2align 3
+.L15:
+	.cfi_restore_state
+	xorl %eax, %eax
+	jmp .L9
+.L26:
+	vzeroupper
+	jmp .L2
+.L14:
+	xorl %eax, %eax
+	jmp .L3
+	.cfi_endproc
+.LFE24:
+	.size triad, .-triad
+	.section .rodata.str1.8
+	.align 8
+.LC10:
+	.string "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
+	.align 8
+.LC11:
+	.string "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
+	.section .text.startup,"ax",@progbits
+	.p2align 4,,15
+	.globl main
+	.type main, @function
+main:
+.LFB25:
+	.cfi_startproc
+	pushq %rbx
+	.cfi_def_cfa_offset 16
+	.cfi_offset 3, -16
+	movl $.LC10, %edi
+	movl $20, %ebx
+	call puts
+	movl $.LC11, %edi
+	call puts
+	.p2align 4,,10
+	.p2align 3
+.L28:
+	vxorpd %xmm1, %xmm1, %xmm1
+	movq .LC12(%rip), %rax
+	vcvtsi2sd %ebx, %xmm1, %xmm1
+	addl $1, %ebx
+	vmovq %rax, %xmm0
+	call pow
+	vcvttsd2si %xmm0, %edi
+	call triad
+	cmpl $36, %ebx
+	jne .L28
+	xorl %eax, %eax
+	popq %rbx
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+.LFE25:
+	.size main, .-main
+	.section .rodata.cst32,"aM",@progbits,32
+	.align 32
+.LC0:
+	.long 1907715710
+	.long 1048610426
+	.long 1907715710
+	.long 1048610426
+	.long 1907715710
+	.long 1048610426
+	.long 1907715710
+	.long 1048610426
+	.section .rodata.cst8,"aM",@progbits,8
+	.align 8
+.LC1:
+	.long 1907715710
+	.long 1048610426
+	.align 8
+.LC3:
+	.long 2576980378
+	.long 1070176665
+	.align 8
+.LC4:
+	.long 0
+	.long 1083129856
+	.align 8
+.LC5:
+	.long 0
+	.long 1077936128
+	.align 8
+.LC6:
+	.long 0
+	.long 1062207488
+	.align 8
+.LC7:
+	.long 2696277389
+	.long 1051772663
+	.align 8
+.LC8:
+	.long 0
+	.long 1075838976
+	.align 8
+.LC12:
+	.long 3435973837
+	.long 1073007820
+	.ident "GCC: (GNU) 7.2.0"
+	.section .note.GNU-stack,"",@progbits