From d861d66206159591a60ffd288567b4db128d38ba Mon Sep 17 00:00:00 2001 From: JanLJL Date: Thu, 27 Feb 2020 18:32:48 +0100 Subject: [PATCH] minor fixes --- README.rst | 2 + examples/gs/gs.s.csx.gcc.s | 1077 ------------------------------- examples/gs/gs.s.csx.icc.s | 1089 +------------------------------ examples/gs/gs.s.tx2.clang.s | 1175 ---------------------------------- examples/gs/gs.s.tx2.gcc.s | 696 -------------------- examples/gs/gs.s.zen.gcc.s | 1016 +---------------------------- 6 files changed, 5 insertions(+), 5050 deletions(-) diff --git a/README.rst b/README.rst index 4463037..861d7af 100644 --- a/README.rst +++ b/README.rst @@ -337,6 +337,8 @@ It shows the whole kernel together with the optimized port pressure of each inst Furthermore, in the two columns on the right, the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel. In the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right. +You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`_ directory. 
+ Credits ======= Implementation: Jan Laukemann diff --git a/examples/gs/gs.s.csx.gcc.s b/examples/gs/gs.s.csx.gcc.s index e8af358..5c4fad7 100644 --- a/examples/gs/gs.s.csx.gcc.s +++ b/examples/gs/gs.s.csx.gcc.s @@ -1,704 +1,3 @@ - .file "gs.f90" - .text - .section .rodata.str1.1,"aMS",@progbits,1 -.LC0: - .string "gs.f90" - .section .rodata.str1.8,"aMS",@progbits,1 - .align 8 -.LC1: - .string "Integer overflow when calculating the amount of memory to allocate" - .align 8 -.LC2: - .string "Allocation would exceed memory limit" - .section .rodata.str1.1 -.LC8: - .string "# Iterations: " -.LC9: - .string " Performance: " -.LC11: - .string " MLUPs" - .text - .p2align 4 - .type MAIN__, @function -MAIN__: -.LFB0: - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset 6, -16 - movabsq $21474836608, %rax - movq %rsp, %rbp - .cfi_def_cfa_register 6 - pushq %r15 - pushq %r14 - .cfi_offset 15, -24 - .cfi_offset 14, -32 - movq $-1, %r14 - pushq %r13 - pushq %r12 - pushq %rbx - .cfi_offset 13, -40 - .cfi_offset 12, -48 - .cfi_offset 3, -56 - movq %r14, %rbx - subq $696, %rsp - leaq 160(%rsp), %rdi - movq %rax, 160(%rsp) - movq $.LC0, 168(%rsp) - movl $12, 176(%rsp) - call _gfortran_st_read - movl $4, %edx - leaq 112(%rsp), %rsi - leaq 160(%rsp), %rdi - call _gfortran_transfer_integer - movl $4, %edx - leaq 116(%rsp), %rsi - leaq 160(%rsp), %rdi - call _gfortran_transfer_integer - leaq 160(%rsp), %rdi - call _gfortran_st_read_done - movslq 112(%rsp), %r15 - movslq 116(%rsp), %rdi - testq %r15, %r15 - cmovns %r15, %rbx - movabsq $4611686018427387904, %rcx - incq %rbx - testq %rdi, %rdi - cmovns %rdi, %r14 - movabsq $2305843009213693951, %rsi - incq %r14 - imulq %rbx, %r14 - xorl %edx, %edx - movl %r15d, 88(%rsp) - cmpq %rcx, %r14 - leaq (%r14,%r14), %r13 - sete %dl - cmpq %rsi, %r13 - setg %r8b - movzbl %r8b, %r9d - movq %rdi, 56(%rsp) - movq %rdi, %r12 - addl %r9d, %edx - testq %r15, %r15 - js .L36 - testq %rdi, %rdi - js .L36 - movq %r14, %r10 - salq $4, %r10 
-.L2: - testl %edx, %edx - jne .L286 - testq %r10, %r10 - movl $1, %edi - cmovne %r10, %rdi - call malloc - movq %rax, %rdx - testq %rax, %rax - je .L287 - movl 88(%rsp), %r11d - cmpl $1, %r12d - jle .L5 - cmpl $1, %r11d - jle .L6 - movl %r11d, %r9d - subl $2, %r11d - movq %r11, %rcx - addq %rbx, %r11 - leaq 16(%rax,%r11,8), %r10 - leaq 0(,%rbx,8), %rdi - leal -1(%r9), %r11d - leaq 8(%rax,%rdi), %rsi - movq %rdi, 8(%rsp) - movl %r11d, %edi - leaq 0(,%r14,8), %rax - movl %r11d, 52(%rsp) - shrl $2, %edi - andl $-4, %r11d - movq %r10, 80(%rsp) - movq %rax, (%rsp) - leal 2(%r11), %r10d - leal 3(%r11), %eax - salq $5, %rdi - movq %r13, %r8 - movq %rdi, 64(%rsp) - movl %r10d, 48(%rsp) - movq %r10, 24(%rsp) - movl %eax, 20(%rsp) - movq %rax, 40(%rsp) - movl $1, 72(%rsp) - leal 1(%r11), %r9d - subq %r14, %r8 - movq %r9, 32(%rsp) - addq %rbx, %r8 - movq %rbx, %r9 - vxorpd %xmm0, %xmm0, %xmm0 -.L14: - leaq 3(%r8), %rdi - cmpq %rdi, %r9 - leaq 3(%r9), %rax - setg %r10b - cmpq %rax, %r8 - setg %dil - orb %dil, %r10b - je .L39 - movq (%rsp), %rax - cmpl $2, %ecx - seta %r10b - leaq (%rsi,%rax), %rdi - xorl %eax, %eax - testb %r10b, %r10b - je .L39 - movq 64(%rsp), %r10 - subq $32, %r10 - shrq $5, %r10 - incq %r10 - andl $7, %r10d - je .L13 - cmpq $1, %r10 - je .L177 - cmpq $2, %r10 - je .L178 - cmpq $3, %r10 - je .L179 - cmpq $4, %r10 - je .L180 - cmpq $5, %r10 - je .L181 - cmpq $6, %r10 - je .L182 - vmovupd %ymm0, (%rsi) - movl $32, %eax - vmovupd %ymm0, (%rdi) -.L182: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax -.L181: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax -.L180: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax -.L179: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax -.L178: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax -.L177: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - addq $32, %rax - cmpq 64(%rsp), %rax - je 
.L156 -.L13: - vmovupd %ymm0, (%rsi,%rax) - vmovupd %ymm0, (%rdi,%rax) - vmovupd %ymm0, 32(%rax,%rsi) - vmovupd %ymm0, 32(%rdi,%rax) - vmovupd %ymm0, 64(%rax,%rsi) - vmovupd %ymm0, 64(%rdi,%rax) - vmovupd %ymm0, 96(%rax,%rsi) - vmovupd %ymm0, 96(%rdi,%rax) - vmovupd %ymm0, 128(%rax,%rsi) - vmovupd %ymm0, 128(%rdi,%rax) - vmovupd %ymm0, 160(%rax,%rsi) - vmovupd %ymm0, 160(%rdi,%rax) - vmovupd %ymm0, 192(%rax,%rsi) - vmovupd %ymm0, 192(%rdi,%rax) - vmovupd %ymm0, 224(%rax,%rsi) - vmovupd %ymm0, 224(%rdi,%rax) - addq $256, %rax - cmpq 64(%rsp), %rax - jne .L13 -.L156: - cmpl %r11d, 52(%rsp) - je .L16 - movq 32(%rsp), %rdi - leaq (%r9,%rdi), %r10 - movq $0x000000000, (%rdx,%r10,8) - leaq (%r8,%rdi), %rax - movl 48(%rsp), %r10d - movl 88(%rsp), %edi - movq $0x000000000, (%rdx,%rax,8) - cmpl %r10d, %edi - jle .L16 - movq 24(%rsp), %r10 - leaq (%r9,%r10), %rax - movq $0x000000000, (%rdx,%rax,8) - movl 20(%rsp), %eax - leaq (%r8,%r10), %r10 - movq $0x000000000, (%rdx,%r10,8) - cmpl %eax, %edi - jle .L16 - movq 40(%rsp), %rdi - leaq (%r9,%rdi), %r10 - leaq (%r8,%rdi), %rax - movq $0x000000000, (%rdx,%r10,8) - movq $0x000000000, (%rdx,%rax,8) -.L16: - incl 72(%rsp) - movq 8(%rsp), %rdi - addq %rbx, %r9 - addq %rdi, 80(%rsp) - movl 72(%rsp), %r10d - addq %rbx, %r8 - addq %rdi, %rsi - cmpl %r10d, %r12d - jne .L14 -.L11: - movq 56(%rsp), %r10 - movl 88(%rsp), %r8d - imulq %rbx, %r10 - movl $0, %r11d - testl %r8d, %r8d - movq %r10, %rax - cmovns %r8d, %r11d - leaq 3(%r10), %rsi - subq %r14, %rax - movq %r13, %r9 - addq %r13, %rax - subq %r14, %r9 - cmpq $6, %rsi - seta %dil - cmpl $2, %r11d - leaq 3(%rax), %r8 - movq %rsi, 80(%rsp) - seta %sil - andl %esi, %edi - cmpq $6, %r8 - movq %r9, 72(%rsp) - seta %sil - leaq 3(%r9), %r9 - andl %edi, %esi - cmpq $6, %r9 - seta %dil - andl %esi, %edi - cmpq %rax, 80(%rsp) - setl %sil - cmpq %r8, %r10 - setg 64(%rsp) - orb 64(%rsp), %sil - andl %esi, %edi - cmpq %rax, %r9 - setl %sil - movb %dil, 64(%rsp) - cmpq 72(%rsp), %r8 - setl %r8b - 
orl %r8d, %esi - testb %sil, 64(%rsp) - je .L19 - movq 72(%rsp), %rdi - cmpq %r9, %r10 - setg %r9b - cmpq %rdi, 80(%rsp) - setl %sil - orb %sil, %r9b - je .L19 - incl %r11d - movl %r11d, %r9d - shrl $2, %r9d - salq $5, %r9 - movq %r9, 80(%rsp) - subq $32, %r9 - shrq $5, %r9 - incq %r9 - leaq (%rdx,%rax,8), %rdi - vmovapd .LC4(%rip), %ymm1 - leaq (%rdx,%r10,8), %r8 - leaq (%rdx,%r14,8), %rsi - xorl %eax, %eax - vxorpd %xmm2, %xmm2, %xmm2 - andl $7, %r9d - je .L21 - cmpq $1, %r9 - je .L189 - cmpq $2, %r9 - je .L190 - cmpq $3, %r9 - je .L191 - cmpq $4, %r9 - je .L192 - cmpq $5, %r9 - je .L193 - cmpq $6, %r9 - je .L194 - vmovupd %ymm1, (%r8) - movl $32, %eax - vmovupd %ymm1, (%rdi) - vmovupd %ymm2, (%rdx) - vmovupd %ymm2, (%rsi) -.L194: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax -.L193: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax -.L192: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax -.L191: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax -.L190: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax -.L189: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - addq $32, %rax - cmpq 80(%rsp), %rax - je .L114 -.L21: - vmovupd %ymm1, (%r8,%rax) - vmovupd %ymm1, (%rdi,%rax) - vmovupd %ymm2, (%rdx,%rax) - vmovupd %ymm2, (%rsi,%rax) - vmovupd %ymm1, 32(%r8,%rax) - vmovupd %ymm1, 32(%rdi,%rax) - vmovupd %ymm2, 32(%rdx,%rax) - vmovupd %ymm2, 32(%rsi,%rax) - vmovupd %ymm1, 64(%r8,%rax) - vmovupd %ymm1, 64(%rdi,%rax) - vmovupd %ymm2, 64(%rdx,%rax) - vmovupd %ymm2, 64(%rsi,%rax) - vmovupd %ymm1, 
96(%r8,%rax) - vmovupd %ymm1, 96(%rdi,%rax) - vmovupd %ymm2, 96(%rdx,%rax) - vmovupd %ymm2, 96(%rsi,%rax) - vmovupd %ymm1, 128(%r8,%rax) - vmovupd %ymm1, 128(%rdi,%rax) - vmovupd %ymm2, 128(%rdx,%rax) - vmovupd %ymm2, 128(%rsi,%rax) - vmovupd %ymm1, 160(%r8,%rax) - vmovupd %ymm1, 160(%rdi,%rax) - vmovupd %ymm2, 160(%rdx,%rax) - vmovupd %ymm2, 160(%rsi,%rax) - vmovupd %ymm1, 192(%r8,%rax) - vmovupd %ymm1, 192(%rdi,%rax) - vmovupd %ymm2, 192(%rdx,%rax) - vmovupd %ymm2, 192(%rsi,%rax) - vmovupd %ymm1, 224(%r8,%rax) - vmovupd %ymm1, 224(%rdi,%rax) - vmovupd %ymm2, 224(%rdx,%rax) - vmovupd %ymm2, 224(%rsi,%rax) - addq $256, %rax - cmpq 80(%rsp), %rax - jne .L21 -.L114: - movl %r11d, %eax - andl $-4, %eax - testb $3, %r11b - je .L282 - movslq %eax, %r8 - vmovsd .LC5(%rip), %xmm3 - leaq (%r10,%r8), %rsi - movq %r8, %rdi - vmovsd %xmm3, (%rdx,%rsi,8) - subq %r14, %rdi - addq %r14, %rsi - movl 88(%rsp), %r9d - vmovsd %xmm3, (%rdx,%rsi,8) - addq %r13, %rdi - movq $0x000000000, (%rdx,%r8,8) - leal 1(%rax), %r8d - movq $0x000000000, (%rdx,%rdi,8) - cmpl %r8d, %r9d - jl .L282 - movslq %r8d, %rsi - movq %rsi, %r8 - leaq (%r10,%rsi), %rdi - subq %r14, %r8 - vmovsd %xmm3, (%rdx,%rdi,8) - addq %r13, %r8 - addq %r14, %rdi - addl $2, %eax - vmovsd %xmm3, (%rdx,%rdi,8) - movq $0x000000000, (%rdx,%rsi,8) - movq $0x000000000, (%rdx,%r8,8) - cmpl %eax, %r9d - jl .L282 - cltq - movq %rax, %r9 - leaq (%r10,%rax), %r10 - subq %r14, %r9 - vmovsd %xmm3, (%rdx,%r10,8) - addq %r13, %r9 - addq %r14, %r10 - vmovsd %xmm3, (%rdx,%r10,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rdx,%r9,8) - vzeroupper -.L10: - testl %r12d, %r12d - js .L17 -.L18: - vxorpd %xmm5, %xmm5, %xmm5 - vcvtsi2sdl %r11d, %xmm5, %xmm6 - vcvtsi2sdl 88(%rsp), %xmm5, %xmm7 - movq %r15, %r11 - subq %r14, %r11 - leaq 0(%r13,%r11), %rdi - vdivsd %xmm7, %xmm6, %xmm8 - subq %r14, %r13 - leaq 0(,%rbx,8), %rsi - movl %r12d, %r14d - andl $7, %r14d - movl $1, %r9d - leaq (%rdx,%rsi), %rax - vmovsd %xmm8, (%rdx) - vmovsd 
%xmm8, (%rdx,%r13,8) - vmovsd %xmm8, (%rdx,%r15,8) - vmovsd %xmm8, (%rdx,%rdi,8) - cmpl $1, %r12d - jl .L17 - testl %r14d, %r14d - je .L26 - cmpl $1, %r14d - je .L201 - cmpl $2, %r14d - je .L202 - cmpl $3, %r14d - je .L203 - cmpl $4, %r14d - je .L204 - cmpl $5, %r14d - je .L205 - cmpl $6, %r14d - je .L206 - vmovsd %xmm8, (%rax) - movl $2, %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L206: - vmovsd %xmm8, (%rax) - incl %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L205: - vmovsd %xmm8, (%rax) - incl %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L204: - vmovsd %xmm8, (%rax) - incl %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L203: - vmovsd %xmm8, (%rax) - incl %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L202: - vmovsd %xmm8, (%rax) - incl %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax -.L201: - incl %r9d - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - cmpl %r9d, %r12d - jl .L17 -.L26: - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, 
(%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - addl $8, %r9d - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rdi,8) - addq %rsi, %rax - cmpl %r9d, %r12d - jge .L26 -.L17: - movl %ecx, %ecx - leaq 0(,%rbx,8), %r13 - addq %rbx, %rcx - leaq 8(%rdx,%r13), %r15 - leaq (%rbx,%rbx), %r10 - leaq 16(%rdx,%rcx,8), %r8 - movq %r15, 64(%rsp) - movq %r10, 72(%rsp) - movq %r8, 56(%rsp) - movl $10, 80(%rsp) -.L25: - leaq 128(%rsp), %rsi - leaq 144(%rsp), %rdi - sall 80(%rsp) - call timing_ - movq .LC6(%rip), %r11 - xorl %r15d, %r15d - vmovq %r11, %xmm9 - .p2align 4,,10 - .p2align 3 -.L29: - cmpl $1, %r12d - jle .L32 - cmpl $1, 88(%rsp) - jle .L32 - movq 56(%rsp), %r8 - movq 72(%rsp), %r14 - movq 64(%rsp), %rdi - movq %rbx, %r9 - xorl %r11d, %r11d - movl $1, %r10d - .p2align 4,,10 - .p2align 3 -.L33: - movq %r8, %rdx - subq %rdi, %rdx - subq $8, %rdx - shrq $3, %rdx - movq %r11, %rsi - movq %r14, %rcx - incq %rdx - vmovsd -8(%rdi), %xmm8 - incl %r10d - movq %rdi, %rax - subq %r9, %rsi - subq %r9, %rcx - andl $7, %edx - je .L31 - cmpq $1, %rdx - je .L195 - cmpq $2, %rdx - je .L196 - cmpq $3, %rdx - je .L197 - cmpq $4, %rdx - je .L198 - cmpq $5, %rdx - je .L199 - cmpq $6, %rdx - je .L200 - vmovsd (%rdi,%rsi,8), %xmm10 - vaddsd (%rdi,%rcx,8), %xmm8, %xmm12 - vaddsd 8(%rdi), %xmm10, %xmm11 - leaq 8(%rdi), %rax - vaddsd %xmm12, %xmm11, %xmm13 - vmulsd %xmm9, %xmm13, %xmm8 - vmovsd %xmm8, (%rdi) -.L200: - vmovsd (%rax,%rsi,8), %xmm14 - vaddsd (%rax,%rcx,8), %xmm8, %xmm0 - vaddsd 8(%rax), %xmm14, %xmm15 - addq $8, %rax - vaddsd %xmm0, %xmm15, %xmm1 - vmulsd %xmm9, %xmm1, %xmm8 - vmovsd %xmm8, -8(%rax) -.L199: - vmovsd (%rax,%rsi,8), %xmm2 - vaddsd 
(%rax,%rcx,8), %xmm8, %xmm4 - vaddsd 8(%rax), %xmm2, %xmm3 - addq $8, %rax - vaddsd %xmm4, %xmm3, %xmm5 - vmulsd %xmm9, %xmm5, %xmm8 - vmovsd %xmm8, -8(%rax) -.L198: - vmovsd (%rax,%rsi,8), %xmm6 - vaddsd (%rax,%rcx,8), %xmm8, %xmm8 - vaddsd 8(%rax), %xmm6, %xmm7 - addq $8, %rax - vaddsd %xmm8, %xmm7, %xmm10 - vmulsd %xmm9, %xmm10, %xmm8 - vmovsd %xmm8, -8(%rax) -.L197: - vmovsd (%rax,%rsi,8), %xmm11 - vaddsd (%rax,%rcx,8), %xmm8, %xmm13 - vaddsd 8(%rax), %xmm11, %xmm12 - addq $8, %rax - vaddsd %xmm13, %xmm12, %xmm14 - vmulsd %xmm9, %xmm14, %xmm8 - vmovsd %xmm8, -8(%rax) -.L196: - vmovsd (%rax,%rsi,8), %xmm15 - vaddsd (%rax,%rcx,8), %xmm8, %xmm0 - vaddsd 8(%rax), %xmm15, %xmm1 - addq $8, %rax - vaddsd %xmm0, %xmm1, %xmm2 - vmulsd %xmm9, %xmm2, %xmm8 - vmovsd %xmm8, -8(%rax) -.L195: - vmovsd (%rax,%rsi,8), %xmm3 - vaddsd (%rax,%rcx,8), %xmm8, %xmm5 - vaddsd 8(%rax), %xmm3, %xmm4 - addq $8, %rax - vaddsd %xmm5, %xmm4, %xmm6 - vmulsd %xmm9, %xmm6, %xmm8 - vmovsd %xmm8, -8(%rax) - cmpq %r8, %rax - je .L267 movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY @@ -766,379 +65,3 @@ MAIN__: .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY -.L267: - addq %r13, %rdi - addq %rbx, %r11 - addq %rbx, %r9 - addq %rbx, %r14 - addq %r13, %r8 - cmpl %r10d, %r12d - jne .L33 -.L32: - leal 1(%r15), %r8d - cmpl 80(%rsp), %r8d - je .L28 - movl %r8d, %r15d - jmp .L29 -.L39: - movq 80(%rsp), %r10 - movq %r8, %rdi - subq %rsi, %r10 - subq $8, %r10 - shrq $3, %r10 - incq %r10 - movq %rsi, %rax - subq %r9, %rdi - andl $7, %r10d - je .L9 - cmpq $1, %r10 - je .L183 - cmpq $2, %r10 - je .L184 - cmpq $3, %r10 - je .L185 - cmpq $4, %r10 - je .L186 - cmpq $5, %r10 - je .L187 - cmpq $6, %r10 - je .L188 - movq $0x000000000, (%rsi) - movq $0x000000000, (%rsi,%rdi,8) - leaq 
8(%rsi), %rax -.L188: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax -.L187: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax -.L186: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax -.L185: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax -.L184: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax -.L183: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - addq $8, %rax - cmpq 80(%rsp), %rax - je .L16 -.L9: - movq $0x000000000, (%rax) - movq $0x000000000, (%rax,%rdi,8) - movq $0x000000000, 8(%rax) - movq $0x000000000, 8(%rax,%rdi,8) - movq $0x000000000, 16(%rax) - movq $0x000000000, 16(%rax,%rdi,8) - movq $0x000000000, 24(%rax) - movq $0x000000000, 24(%rax,%rdi,8) - movq $0x000000000, 32(%rax) - movq $0x000000000, 32(%rax,%rdi,8) - movq $0x000000000, 40(%rax) - movq $0x000000000, 40(%rax,%rdi,8) - movq $0x000000000, 48(%rax) - movq $0x000000000, 48(%rax,%rdi,8) - movq $0x000000000, 56(%rax) - movq $0x000000000, 56(%rax,%rdi,8) - addq $64, %rax - cmpq 80(%rsp), %rax - jne .L9 - jmp .L16 -.L36: - xorl %r10d, %r10d - jmp .L2 - .p2align 4,,10 - .p2align 3 -.L28: - addl $2, %r15d - leaq 120(%rsp), %rsi - leaq 136(%rsp), %rdi - movl %r15d, 108(%rsp) - call timing_ - vmovsd 136(%rsp), %xmm9 - vsubsd 144(%rsp), %xmm9, %xmm3 - vcomisd .LC7(%rip), %xmm3 - jnb .L40 - cmpl $999999999, 80(%rsp) - jle .L25 -.L40: - movl 80(%rsp), %ebx - cmpl %ebx, %r15d - jle .L35 - movl %ebx, 108(%rsp) -.L35: - leaq 160(%rsp), %rdi - movabsq $25769803904, %r13 - vmovsd %xmm3, 88(%rsp) - movq $.LC0, 168(%rsp) - movl $72, 176(%rsp) - movq %r13, 160(%rsp) - call _gfortran_st_write - movl $14, %edx - movl $.LC8, %esi - leaq 160(%rsp), %rdi - call _gfortran_transfer_character_write - movl $4, %edx - leaq 108(%rsp), %rsi - leaq 160(%rsp), %rdi - call _gfortran_transfer_integer_write - movl $14, %edx - movl $.LC9, %esi - leaq 160(%rsp), %rdi 
- call _gfortran_transfer_character_write - decl %r12d - vxorpd %xmm2, %xmm2, %xmm2 - vcvtsi2sdl 52(%rsp), %xmm2, %xmm4 - vcvtsi2sdl %r12d, %xmm2, %xmm5 - vcvtsi2sdl 108(%rsp), %xmm2, %xmm8 - vmovsd 88(%rsp), %xmm11 - movl $8, %edx - vmulsd %xmm5, %xmm4, %xmm6 - vmulsd .LC10(%rip), %xmm8, %xmm7 - leaq 152(%rsp), %rsi - leaq 160(%rsp), %rdi - vmulsd %xmm7, %xmm6, %xmm10 - vdivsd %xmm11, %xmm10, %xmm12 - vmovsd %xmm12, 152(%rsp) - call _gfortran_transfer_real_write - movl $6, %edx - movl $.LC11, %esi - leaq 160(%rsp), %rdi - call _gfortran_transfer_character_write - leaq 160(%rsp), %rdi - call _gfortran_st_write_done - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - call _gfortran_stop_string -.L282: - vzeroupper - jmp .L10 -.L5: - testl %r11d, %r11d - js .L37 -.L284: - leal -2(%r11), %ecx - decl %r11d - movl %r11d, 52(%rsp) - jmp .L11 -.L6: - cmpl $0, 88(%rsp) - jns .L288 - movl 88(%rsp), %eax - xorl %r11d, %r11d - leal -2(%rax), %ecx - decl %eax - movl %eax, 52(%rsp) - jmp .L18 -.L19: - imulq $-8, %r14, %rax - leaq (%rdx,%r10,8), %r8 - addq %r13, %r10 - leaq (%rax,%r10,8), %rdi - movl 88(%rsp), %r10d - vmovsd .LC5(%rip), %xmm4 - leaq (%rax,%r13,8), %rsi - movl %r10d, %r9d - addq %rdx, %rdi - addq %rdx, %rsi - andl $7, %r9d - decl %r10d - vmovsd %xmm4, (%r8) - movl $1, %eax - vmovsd %xmm4, (%rdi) - movq $0x000000000, (%rdx) - movq $0x000000000, (%rsi) - jl .L45 - testl %r9d, %r9d - je .L24 - cmpl $1, %r9d - je .L207 - cmpl $2, %r9d - je .L208 - cmpl $3, %r9d - je .L209 - cmpl $4, %r9d - je .L210 - cmpl $5, %r9d - je .L211 - cmpl $6, %r9d - je .L212 - vmovsd %xmm4, 8(%r8) - vmovsd %xmm4, 8(%rdi) - movq $0x000000000, 8(%rdx) - movq $0x000000000, 8(%rsi) - movl $2, %eax -.L212: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax -.L211: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax 
-.L210: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax -.L209: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax -.L208: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax -.L207: - vmovsd %xmm4, (%r8,%rax,8) - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - incq %rax - cmpl %eax, 88(%rsp) - jl .L45 -.L24: - leaq 1(%rax), %r10 - vmovsd %xmm4, (%r8,%rax,8) - leaq 2(%rax), %r9 - vmovsd %xmm4, (%rdi,%rax,8) - movq $0x000000000, (%rdx,%rax,8) - movq $0x000000000, (%rsi,%rax,8) - vmovsd %xmm4, (%r8,%r10,8) - vmovsd %xmm4, (%rdi,%r10,8) - movq $0x000000000, (%rdx,%r10,8) - movq $0x000000000, (%rsi,%r10,8) - leaq 3(%rax), %r10 - vmovsd %xmm4, (%r8,%r9,8) - vmovsd %xmm4, (%rdi,%r9,8) - movq $0x000000000, (%rdx,%r9,8) - movq $0x000000000, (%rsi,%r9,8) - vmovsd %xmm4, (%r8,%r10,8) - leaq 4(%rax), %r9 - vmovsd %xmm4, (%rdi,%r10,8) - movq $0x000000000, (%rdx,%r10,8) - movq $0x000000000, (%rsi,%r10,8) - leaq 5(%rax), %r10 - vmovsd %xmm4, (%r8,%r9,8) - vmovsd %xmm4, (%rdi,%r9,8) - movq $0x000000000, (%rdx,%r9,8) - movq $0x000000000, (%rsi,%r9,8) - vmovsd %xmm4, (%r8,%r10,8) - leaq 6(%rax), %r9 - vmovsd %xmm4, (%rdi,%r10,8) - movq $0x000000000, (%rdx,%r10,8) - movq $0x000000000, (%rsi,%r10,8) - leaq 7(%rax), %r10 - addq $8, %rax - vmovsd %xmm4, (%r8,%r9,8) - vmovsd %xmm4, (%rdi,%r9,8) - movq $0x000000000, (%rdx,%r9,8) - movq $0x000000000, (%rsi,%r9,8) - vmovsd %xmm4, (%r8,%r10,8) - vmovsd %xmm4, (%rdi,%r10,8) - movq $0x000000000, (%rdx,%r10,8) - movq $0x000000000, (%rsi,%r10,8) - cmpl %eax, 88(%rsp) - jge .L24 -.L45: - incl %r11d - vzeroupper - jmp .L10 -.L37: - movl 88(%rsp), %r8d - xorl %r11d, %r11d - leal -2(%r8), %ecx - decl %r8d - movl %r8d, 52(%rsp) - 
jmp .L10 -.L287: - movl $.LC2, %edi - call _gfortran_os_error -.L286: - movl $.LC1, %edi - xorl %eax, %eax - call _gfortran_runtime_error -.L288: - movl 88(%rsp), %r11d - jmp .L284 - .cfi_endproc -.LFE0: - .size MAIN__, .-MAIN__ - .section .text.startup,"ax",@progbits - .p2align 4 - .globl main - .type main, @function -main: -.LFB1: - .cfi_startproc - subq $8, %rsp - .cfi_def_cfa_offset 16 - call _gfortran_set_args - movl $options.9.4008, %esi - movl $7, %edi - call _gfortran_set_options - call MAIN__ - .cfi_endproc -.LFE1: - .size main, .-main - .section .rodata - .align 16 - .type options.9.4008, @object - .size options.9.4008, 28 -options.9.4008: - .long 2116 - .long 4095 - .long 0 - .long 1 - .long 1 - .long 0 - .long 31 - .section .rodata.cst32,"aM",@progbits,32 - .align 32 -.LC4: - .long 0 - .long 1072693248 - .long 0 - .long 1072693248 - .long 0 - .long 1072693248 - .long 0 - .long 1072693248 - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LC5: - .long 0 - .long 1072693248 - .align 8 -.LC6: - .long 0 - .long 1070596096 - .align 8 -.LC7: - .long 2576980378 - .long 1070176665 - .align 8 -.LC10: - .long 2696277389 - .long 1051772663 - .ident "GCC: (GNU) 9.1.0" - .section .note.GNU-stack,"",@progbits diff --git a/examples/gs/gs.s.csx.icc.s b/examples/gs/gs.s.csx.icc.s index 19295fd..7b4cf66 100644 --- a/examples/gs/gs.s.csx.icc.s +++ b/examples/gs/gs.s.csx.icc.s @@ -1,658 +1,3 @@ -# mark_description "Intel(R) Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 2019"; -# mark_description "0815"; -# mark_description "-qopenmp-simd -fno-alias -unroll -fno-builtin -xCORE-AVX512 -qopt-zmm-usage=high -Ofast -S -use-msasm -o gs."; -# mark_description "s.csx.icc.s"; - .file "gs.f90" - .text -..TXTST0: -.L_2__routine_start_MAIN___0: -# -- Begin MAIN__ - .text -# mark_begin; - .align 16,0x90 - .globl MAIN__ -# --- HEAT -MAIN__: -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_MAIN__.1: 
-..L2: - #1.9 - pushq %rbp #1.9 - .cfi_def_cfa_offset 16 - movq %rsp, %rbp #1.9 - .cfi_def_cfa 6, 16 - .cfi_offset 6, -16 - andq $-128, %rsp #1.9 - pushq %r12 #1.9 - pushq %r13 #1.9 - pushq %r14 #1.9 - pushq %r15 #1.9 - pushq %rbx #1.9 - subq $216, %rsp #1.9 - movq $0x64199d9ffe, %rsi #1.9 - movl $3, %edi #1.9 - call __intel_new_feature_proc_init #1.9 - .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 - # LOE -..B1.95: # Preds ..B1.1 - # Execution count [1.00e+00] - vstmxcsr (%rsp) #1.9 - movl $__NLITPACK_0.0.1, %edi #1.9 - xorl %eax, %eax #1.9 - orl $32832, (%rsp) #1.9 - vldmxcsr (%rsp) #1.9 -..___tag_value_MAIN__.11: - call for_set_reentrancy #1.9 -..___tag_value_MAIN__.12: - # LOE -..B1.2: # Preds ..B1.95 - # Execution count [1.00e+00] - movl $-4, %esi #12.3 - lea 152(%rsp), %rax #12.3 - movq %rax, -24(%rax) #12.3 - lea (%rsp), %rdi #12.3 - movq $0x1208384ff00, %rdx #12.3 - movl $__STRLITPACK_3.0.1, %ecx #12.3 - xorl %eax, %eax #12.3 - lea 128(%rsp), %r8 #12.3 - movq $0, (%rdi) #12.3 -..___tag_value_MAIN__.13: - call for_read_seq_lis #12.3 -..___tag_value_MAIN__.14: - # LOE -..B1.3: # Preds ..B1.2 - # Execution count [1.00e+00] - movl $__STRLITPACK_4.0.1, %esi #12.3 - lea 156(%rsp), %rax #12.3 - movq %rax, -20(%rax) #12.3 - lea (%rsp), %rdi #12.3 - xorl %eax, %eax #12.3 - lea 136(%rsp), %rdx #12.3 -..___tag_value_MAIN__.15: - call for_read_seq_lis_xmit #12.3 -..___tag_value_MAIN__.16: - # LOE -..B1.4: # Preds ..B1.3 - # Execution count [1.00e+00] - 
movq 24+heat_$PHI.0.1(%rip), %r9 #15.3 - movq %r9, %r10 #15.3 - andq $-256, %r10 #15.3 - movq $0xf000000000, %r12 #15.3 - shrq $8, %r10 #15.3 - andq %r12, %r9 #15.3 - movl 152(%rsp), %r14d #14.3 - movq $0xffffff0fffffffff, %rbx #15.3 - movslq %r14d, %r12 #15.3 - xorl %esi, %esi #15.3 - shlq $63, %r10 #15.3 - movq %r12, %r15 #15.3 - shrq $55, %r10 #15.3 - movl $8, %r11d #15.3 - addq $133, %r10 #15.3 - sarq $63, %r15 #15.3 - andq %rbx, %r10 #15.3 - movl 156(%rsp), %r13d #13.3 - lea 1(%r12), %rbx #15.3 - andn %rbx, %r15, %rdx #15.3 - movslq %r13d, %rbx #15.3 - movq %rbx, %rdi #15.3 - sarq $63, %rdi #15.3 - shrq $36, %r9 #15.3 - lea (,%rdx,8), %r8 #15.3 - movq %r8, 80+heat_$PHI.0.1(%rip) #15.3 - lea 1(%rbx), %rax #15.3 - andn %rax, %rdi, %rcx #15.3 - lea 144(%rsp), %rdi #15.3 - imulq %rcx, %r8 #15.3 - shlq $60, %r9 #15.3 - xorl %eax, %eax #15.3 - shrq $24, %r9 #15.3 - movq %rsi, 16+heat_$PHI.0.1(%rip) #15.3 - orq %r9, %r10 #15.3 - movq %rsi, 64+heat_$PHI.0.1(%rip) #15.3 - movq %rsi, 88+heat_$PHI.0.1(%rip) #15.3 - movl $3, %esi #15.3 - movq %r8, 104+heat_$PHI.0.1(%rip) #15.3 - movl $16, %r8d #15.3 - movq %r10, 24+heat_$PHI.0.1(%rip) #15.3 - movq %r11, 8+heat_$PHI.0.1(%rip) #15.3 - movq $3, 32+heat_$PHI.0.1(%rip) #15.3 - movq %r11, 56+heat_$PHI.0.1(%rip) #15.3 - movq %rdx, 48+heat_$PHI.0.1(%rip) #15.3 - movq $1, 112+heat_$PHI.0.1(%rip) #15.3 - movq $2, 96+heat_$PHI.0.1(%rip) #15.3 - movq %rcx, 72+heat_$PHI.0.1(%rip) #15.3 -..___tag_value_MAIN__.17: - call for_check_mult_overflow64 #15.3 -..___tag_value_MAIN__.18: - # LOE rbx r12 eax r13d r14d -..B1.5: # Preds ..B1.4 - # Execution count [1.00e+00] - movq $0xfffffff00fffffff, %r8 #15.3 - movq $0xf000000000, %rcx #15.3 - andq 24+heat_$PHI.0.1(%rip), %r8 #15.3 - andl $1, %eax #15.3 - addq $1073741824, %r8 #15.3 - movl $heat_$PHI.0.1, %esi #15.3 - movq %r8, 24+heat_$PHI.0.1(%rip) #15.3 - andq %r8, %rcx #15.3 - movl %r8d, %edx #15.3 - andq $-256, %r8 #15.3 - shrq $8, %r8 #15.3 - andl $1, %edx #15.3 - shll $4, %eax #15.3 - addl 
%edx, %edx #15.3 - andl $1, %r8d #15.3 - orl %eax, %edx #15.3 - shll $21, %r8d #15.3 - xorl %eax, %eax #15.3 - shrq $36, %rcx #15.3 - orl %r8d, %edx #15.3 - andl $-31457281, %edx #15.3 - shll $21, %ecx #15.3 - orl %ecx, %edx #15.3 - addl $262144, %edx #15.3 - movq 144(%rsp), %rdi #15.3 -..___tag_value_MAIN__.19: - call for_alloc_allocatable #15.3 -..___tag_value_MAIN__.20: - # LOE rbx r12 r13d r14d -..B1.6: # Preds ..B1.5 - # Execution count [1.00e+00] - xorl %r8d, %r8d #21.3 - lea -1(%r13), %eax #21.3 - movl %eax, 96(%rsp) #21.3[spill] - testl %eax, %eax #21.3 - jle ..B1.31 # Prob 2% #21.3 - # LOE rbx r8 r12 r13d r14d -..B1.7: # Preds ..B1.6 - # Execution count [9.79e-01] - movq heat_$PHI.0.1(%rip), %r9 #23.9 - lea -1(%r14), %r15d #22.6 - movq 104+heat_$PHI.0.1(%rip), %rcx #23.9 - lea -1(%rbx), %r11 #21.3 - movq 80+heat_$PHI.0.1(%rip), %r10 #23.9 - xorl %edx, %edx #21.3 - movslq %r15d, %rdi #22.6 - vmovdqu .L_2il0floatpacket.0(%rip), %ymm2 #22.6 - lea (%r9,%rcx,2), %rsi #24.9 - vmovdqu .L_2il0floatpacket.1(%rip), %ymm0 #22.6 - movl %r14d, 104(%rsp) #21.3[spill] - movq %rbx, 112(%rsp) #21.3[spill] - movq %r12, 120(%rsp) #21.3[spill] - movl %r13d, 64(%rsp) #21.3[spill] - vpxord %zmm1, %zmm1, %zmm1 #23.9 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 -..B1.8: # Preds ..B1.29 ..B1.7 - # Execution count [5.00e+00] - testl %r15d, %r15d #22.6 - jle ..B1.29 # Prob 50% #22.6 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 -..B1.9: # Preds ..B1.8 - # Execution count [4.89e+00] - movq %rdi, 72(%rsp) #[spill] - movq %r11, 80(%rsp) #[spill] - # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 -..B1.10: # Preds ..B1.90 ..B1.9 - # Execution count [5.33e+00] - cmpl $16, %r15d #22.6 - jl ..B1.92 # Prob 10% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 -..B1.11: # Preds ..B1.10 - # Execution count [5.33e+00] - movq %r10, %r12 #24.9 - subq %rcx, %r12 #24.9 - lea 8(%r12,%rsi), %rbx #22.6 - addq %rdx, %rbx #22.6 - andq $63, %rbx #22.6 - testb $7, %bl #22.6 
- je ..B1.13 # Prob 50% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 -..B1.12: # Preds ..B1.11 - # Execution count [2.66e+00] - xorl %ebx, %ebx #22.6 - jmp ..B1.15 # Prob 100% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 -..B1.13: # Preds ..B1.11 - # Execution count [2.66e+00] - testl %ebx, %ebx #22.6 - je ..B1.15 # Prob 50% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 -..B1.14: # Preds ..B1.13 - # Execution count [2.96e+01] - negl %ebx #22.6 - addl $64, %ebx #22.6 - shrl $3, %ebx #22.6 - cmpl %ebx, %r15d #22.6 - cmovl %r15d, %ebx #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 ebx r15d ymm0 ymm2 zmm1 -..B1.15: # Preds ..B1.12 ..B1.14 ..B1.13 - # Execution count [5.44e+00] - movl %r15d, %eax #22.6 - subl %ebx, %eax #22.6 - andl $15, %eax #22.6 - negl %eax #22.6 - addl %r15d, %eax #22.6 - cmpl $1, %ebx #22.6 - jb ..B1.20 # Prob 50% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 eax ebx r15d ymm0 ymm2 zmm1 -..B1.17: # Preds ..B1.15 - # Execution count [5.33e+00] - vmovdqa %ymm2, %ymm4 #22.6 - lea (%r12,%rcx,2), %r13 #24.9 - addq %r9, %r13 #24.9 - lea (%r10,%r9), %r11 #23.9 - vpbroadcastd %ebx, %ymm3 #22.6 - xorl %r14d, %r14d #22.6 - movslq %ebx, %rdi #22.6 - addq %rdx, %r13 #24.9 - addq %rdx, %r11 #23.9 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax ebx r15d ymm0 ymm2 ymm3 ymm4 zmm1 -..B1.18: # Preds ..B1.18 ..B1.17 - # Execution count [2.96e+01] - vpcmpgtd %ymm4, %ymm3, %k1 #22.6 - vpaddd %ymm0, %ymm4, %ymm4 #22.6 - vmovupd %zmm1, 8(%r11,%r14,8){%k1} #23.9 - vmovupd %zmm1, 8(%r13,%r14,8){%k1} #24.9 - addq $8, %r14 #22.6 - cmpq %rdi, %r14 #22.6 - jb ..B1.18 # Prob 82% #22.6 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax ebx r15d ymm0 ymm2 ymm3 ymm4 zmm1 -..B1.19: # Preds ..B1.18 - # Execution count [5.33e+00] - cmpl %ebx, %r15d #22.6 - je ..B1.90 # Prob 10% #22.6 - # LOE rdx rcx rsi r8 r9 r10 r12 eax ebx r15d ymm0 ymm2 zmm1 -..B1.20: # Preds ..B1.15 ..B1.19 - # Execution count [4.79e+00] - movq 72(%rsp), %rdi 
#[spill] - movq 80(%rsp), %r11 #[spill] - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 eax ebx r15d ymm0 ymm2 zmm1 -..B1.21: # Preds ..B1.20 - # Execution count [2.96e+01] - lea 16(%rbx), %r13d #22.6 - cmpl %r13d, %eax #22.6 - jl ..B1.25 # Prob 50% #22.6 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r12 eax ebx r15d ymm0 ymm2 zmm1 -..B1.22: # Preds ..B1.21 - # Execution count [5.33e+00] - movslq %ebx, %rbx #22.6 - lea (%r12,%rcx,2), %r14 #24.9 - addq %r9, %r14 #24.9 - lea (%r10,%r9), %r13 #23.9 - movslq %eax, %r12 #22.6 - addq %rdx, %r14 #24.9 - addq %rdx, %r13 #23.9 - # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax r15d ymm0 ymm2 zmm1 -..B1.23: # Preds ..B1.23 ..B1.22 - # Execution count [2.96e+01] - vmovupd %zmm1, 8(%r13,%rbx,8) #23.9 - vmovupd %zmm1, 8(%r14,%rbx,8) #24.9 - vmovupd %zmm1, 72(%r13,%rbx,8) #23.9 - vmovupd %zmm1, 72(%r14,%rbx,8) #24.9 - addq $16, %rbx #22.6 - cmpq %r12, %rbx #22.6 - jb ..B1.23 # Prob 82% #22.6 - # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 eax r15d ymm0 ymm2 zmm1 -..B1.25: # Preds ..B1.23 ..B1.21 ..B1.92 - # Execution count [5.44e+00] - lea 1(%rax), %ebx #22.6 - cmpl %r15d, %ebx #22.6 - ja ..B1.29 # Prob 50% #22.6 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 eax r15d ymm0 ymm2 zmm1 -..B1.26: # Preds ..B1.25 - # Execution count [5.33e+00] - movq %r10, %rbx #23.9 - lea (%rcx,%r9), %r14 #23.9 - subq %rcx, %rbx #23.9 - xorl %r13d, %r13d #22.6 - movslq %eax, %r12 #23.9 - negl %eax #22.6 - addl %r15d, %eax #22.6 - vpbroadcastd %eax, %ymm3 #22.6 - vmovdqa %ymm2, %ymm4 #22.6 - lea (%rsi,%rbx), %rax #24.9 - addq %r14, %rbx #23.9 - addq %rdx, %rax #24.9 - addq %rdx, %rbx #23.9 - lea (%rax,%r12,8), %rax #24.9 - lea (%rbx,%r12,8), %rbx #23.9 - negq %r12 #22.6 - addq %rdi, %r12 #22.6 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r15d ymm0 ymm2 ymm3 ymm4 zmm1 -..B1.27: # Preds ..B1.27 ..B1.26 - # Execution count [2.96e+01] - vpcmpgtd %ymm4, %ymm3, %k1 #22.6 - vpaddd %ymm0, %ymm4, %ymm4 #22.6 - vmovupd %zmm1, 8(%rbx,%r13,8){%k1} #23.9 - vmovupd 
%zmm1, 8(%rax,%r13,8){%k1} #24.9 - addq $8, %r13 #22.6 - cmpq %r12, %r13 #22.6 - jb ..B1.27 # Prob 82% #22.6 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r15d ymm0 ymm2 ymm3 ymm4 zmm1 -..B1.29: # Preds ..B1.27 ..B1.8 ..B1.25 - # Execution count [4.91e+00] - incq %r8 #21.3 - addq %r10, %rdx #21.3 - cmpq %r11, %r8 #21.3 - jb ..B1.8 # Prob 82% #21.3 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 r15d ymm0 ymm2 zmm1 -..B1.30: # Preds ..B1.90 ..B1.29 - # Execution count [8.83e-01] - movl 104(%rsp), %r14d #[spill] - movq 112(%rsp), %rbx #[spill] - movq 120(%rsp), %r12 #[spill] - movl 64(%rsp), %r13d #[spill] - # LOE rbx r12 r13d r14d -..B1.31: # Preds ..B1.6 ..B1.30 - # Execution count [1.00e+00] - xorl %eax, %eax #29.3 - testl %r14d, %r14d #29.3 - jl ..B1.40 # Prob 50% #29.3 - # LOE rbx r12 eax r13d r14d -..B1.32: # Preds ..B1.31 - # Execution count [4.35e-01] - movq 80+heat_$PHI.0.1(%rip), %r8 #30.6 - lea 1(%r14), %edx #14.3 - movq 104+heat_$PHI.0.1(%rip), %rdi #30.6 - movq heat_$PHI.0.1(%rip), %rcx #30.6 - cmpl $8, %edx #29.3 - jl ..B1.89 # Prob 10% #29.3 - # LOE rcx rbx rdi r8 r12 eax edx r13d r14d -..B1.33: # Preds ..B1.32 - # Execution count [4.35e-01] - movq %rbx, %r10 #30.6 - movq %rcx, %rax #31.6 - imulq %r8, %r10 #30.6 - vmovupd .L_2il0floatpacket.2(%rip), %ymm1 #30.6 - subq %rdi, %rax #31.6 - movl %edx, %esi #29.3 - andl $-8, %esi #29.3 - subq %rdi, %r10 #30.6 - vxorpd %ymm0, %ymm0, %ymm0 #31.6 - lea (%rdi,%rcx), %r9 #30.6 - xorl %r11d, %r11d #29.3 - lea (%rcx,%rdi,2), %r15 #30.6 - addq %r10, %r9 #30.6 - lea (%rax,%rdi,2), %rax #31.6 - addq %r15, %r10 #30.6 - movslq %esi, %r15 #29.3 - # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r15 edx esi r13d r14d ymm0 ymm1 -..B1.34: # Preds ..B1.34 ..B1.33 - # Execution count [4.90e+00] - vmovupd %ymm1, (%r9,%r11,8) #30.6 - vmovupd %ymm0, (%rcx,%r11,8) #31.6 - vmovupd %ymm1, (%r10,%r11,8) #30.6 - vmovupd %ymm0, (%rax,%r11,8) #31.6 - vmovupd %ymm1, 32(%r9,%r11,8) #30.6 - vmovupd %ymm0, 32(%rcx,%r11,8) #31.6 - vmovupd %ymm1, 
32(%r10,%r11,8) #30.6 - vmovupd %ymm0, 32(%rax,%r11,8) #31.6 - addq $8, %r11 #29.3 - cmpq %r15, %r11 #29.3 - jb ..B1.34 # Prob 91% #29.3 - # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r15 edx esi r13d r14d ymm0 ymm1 -..B1.35: # Preds ..B1.34 - # Execution count [4.35e-01] - movl %esi, %eax #32.3 - # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d -..B1.36: # Preds ..B1.35 ..B1.89 - # Execution count [1.00e+00] - lea 1(%rsi), %r9d #29.3 - cmpl %edx, %r9d #29.3 - ja ..B1.40 # Prob 50% #29.3 - # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d -..B1.37: # Preds ..B1.36 - # Execution count [4.35e-01] - imulq %rbx, %r8 #30.6 - vmovupd .L_2il0floatpacket.2(%rip), %ymm2 #30.6 - vmovdqu .L_2il0floatpacket.5(%rip), %xmm4 #29.3 - vmovdqu .L_2il0floatpacket.6(%rip), %xmm3 #29.3 - movq %r8, %r9 #30.6 - movq %rcx, %r11 #31.6 - subq %rdi, %r9 #30.6 - subq %rdi, %r11 #31.6 - addq %rcx, %r8 #30.6 - xorl %r10d, %r10d #29.3 - vxorpd %ymm1, %ymm1, %ymm1 #31.6 - lea (%r9,%rdi,2), %rax #30.6 - addq %rcx, %rax #30.6 - lea (%r11,%rdi,2), %r15 #31.6 - movslq %esi, %rdi #30.6 - negl %esi #29.3 - addl %edx, %esi #29.3 - vpbroadcastd %esi, %xmm0 #29.3 - lea (%rcx,%rdi,8), %r11 #31.6 - movslq %edx, %rcx #29.3 - subq %rdi, %rcx #29.3 - lea (%r15,%rdi,8), %r9 #31.6 - lea (%rax,%rdi,8), %rax #30.6 - lea (%r8,%rdi,8), %r8 #30.6 - # LOE rax rcx rbx r8 r9 r10 r11 r12 edx r13d r14d xmm0 xmm3 xmm4 ymm1 ymm2 -..B1.38: # Preds ..B1.38 ..B1.37 - # Execution count [4.90e+00] - vpcmpgtd %xmm3, %xmm0, %k1 #29.3 - vpaddd %xmm4, %xmm3, %xmm3 #29.3 - vmovupd %ymm2, (%r8,%r10,8){%k1} #30.6 - vmovupd %ymm1, (%r11,%r10,8){%k1} #31.6 - vmovupd %ymm2, (%rax,%r10,8){%k1} #30.6 - vmovupd %ymm1, (%r9,%r10,8){%k1} #31.6 - addq $4, %r10 #29.3 - cmpq %rcx, %r10 #29.3 - jb ..B1.38 # Prob 91% #29.3 - # LOE rax rcx rbx r8 r9 r10 r11 r12 edx r13d r14d xmm0 xmm3 xmm4 ymm1 ymm2 -..B1.39: # Preds ..B1.38 - # Execution count [4.35e-01] - movl %edx, %eax #32.3 - # LOE rbx r12 eax r13d r14d -..B1.40: # Preds ..B1.39 ..B1.36 ..B1.31 - # 
Execution count [1.00e+00] - testl %r13d, %r13d #33.3 - jl ..B1.49 # Prob 50% #33.3 - # LOE rbx r12 eax r13d r14d -..B1.41: # Preds ..B1.40 - # Execution count [4.35e-01] - movq 80+heat_$PHI.0.1(%rip), %r9 #34.6 - incl %r13d #13.3 - movq 104+heat_$PHI.0.1(%rip), %r15 #34.6 - movl 152(%rsp), %r11d #34.27 - movq heat_$PHI.0.1(%rip), %r10 #34.6 - testq %r9, %r9 #55.82 - je ..B1.79 # Prob 10% #55.82 - # LOE rbx r9 r10 r12 r15 eax r11d r13d r14d -..B1.42: # Preds ..B1.41 - # Execution count [4.35e-01] - cmpl $8, %r13d #33.3 - jl ..B1.78 # Prob 10% #33.3 - # LOE rbx r9 r10 r12 r15 eax r11d r13d r14d -..B1.43: # Preds ..B1.42 - # Execution count [4.35e-01] - vxorpd %xmm1, %xmm1, %xmm1 #34.19 - vxorpd %xmm0, %xmm0, %xmm0 #34.27 - vcvtsi2sd %eax, %xmm1, %xmm1 #34.19 - vcvtsi2sd %r11d, %xmm0, %xmm0 #34.27 - vpbroadcastd %r9d, %zmm3 #34.6 - vdivsd %xmm0, %xmm1, %xmm2 #34.6 - movq %r10, %rsi #34.6 - movl %r13d, %r8d #33.3 - subq %r15, %rsi #34.6 - andl $-8, %r8d #33.3 - movslq %r8d, %r8 #33.3 - lea (,%r12,8), %rdi #35.6 - xorl %ecx, %ecx #33.3 - subq %r15, %rdi #35.6 - movl %r11d, 80(%rsp) #34.6[spill] - lea (%rsi,%r15,2), %rdx #34.6 - movq %rdx, 64(%rsp) #34.6[spill] - lea (%r15,%r10), %rsi #35.6 - movl %eax, 88(%rsp) #34.6[spill] - lea (%r10,%r15,2), %rdx #35.6 - vbroadcastsd %xmm2, %zmm1 #34.6 - addq %rdi, %rsi #35.6 - vpmuldq .L_2il0floatpacket.8(%rip), %zmm3, %zmm0 #34.6 - movq %r15, 72(%rsp) #34.6[spill] - addq %rdx, %rdi #35.6 - movq %r8, %r11 #34.6 - xorl %edx, %edx #33.3 - movq 64(%rsp), %rax #34.6[spill] - .align 16,0x90 - # LOE rax rdx rcx rbx rsi rdi r9 r10 r11 r12 r8d r13d r14d zmm0 zmm1 -..B1.44: # Preds ..B1.44 ..B1.43 - # Execution count [4.90e+00] - vpcmpeqb %xmm0, %xmm0, %k1 #34.6 - lea (%r10,%rdx), %r15 #34.6 - vpcmpeqb %xmm0, %xmm0, %k2 #35.6 - vpcmpeqb %xmm0, %xmm0, %k3 #34.6 - vpcmpeqb %xmm0, %xmm0, %k4 #35.6 - vscatterqpd %zmm1, (%r15,%zmm0){%k1} #34.6 - addq $8, %rcx #33.3 - lea (%rsi,%rdx), %r15 #35.6 - vscatterqpd %zmm1, (%r15,%zmm0){%k2} #35.6 - lea 
(%rax,%rdx), %r15 #34.6 - vscatterqpd %zmm1, (%r15,%zmm0){%k3} #34.6 - lea (%rdi,%rdx), %r15 #35.6 - vscatterqpd %zmm1, (%r15,%zmm0){%k4} #35.6 - lea (%rdx,%r9,8), %rdx #33.3 - cmpq %r11, %rcx #33.3 - jb ..B1.44 # Prob 91% #33.3 - # LOE rax rdx rcx rbx rsi rdi r9 r10 r11 r12 r8d r13d r14d zmm0 zmm1 -..B1.45: # Preds ..B1.44 - # Execution count [4.35e-01] - movq 72(%rsp), %r15 #[spill] - movl 80(%rsp), %r11d #[spill] - movl 88(%rsp), %eax #[spill] - # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d -..B1.46: # Preds ..B1.45 ..B1.78 - # Execution count [9.56e-01] - lea 1(%r8), %edx #33.3 - cmpl %r13d, %edx #33.3 - ja ..B1.49 # Prob 50% #33.3 - # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d -..B1.47: # Preds ..B1.46 - # Execution count [4.35e-01] - vxorpd %xmm1, %xmm1, %xmm1 #34.19 - vxorpd %xmm2, %xmm2, %xmm2 #34.27 - vcvtsi2sd %eax, %xmm1, %xmm1 #34.19 - vcvtsi2sd %r11d, %xmm2, %xmm2 #34.27 - vdivsd %xmm2, %xmm1, %xmm3 #34.6 - subl %r8d, %r13d #33.3 - movq %r10, %rax #34.6 - movslq %r8d, %r8 #34.6 - lea (,%r12,8), %rdi #35.6 - imulq %r9, %r8 #34.6 - vpbroadcastd %r13d, %ymm0 #33.3 - vpbroadcastd %r9d, %zmm4 #34.6 - vbroadcastsd %xmm3, %zmm6 #34.6 - vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k4 #33.3 - vpmuldq .L_2il0floatpacket.8(%rip), %zmm4, %zmm5 #34.6 - subq %r15, %rax #34.6 - subq %r15, %rdi #35.6 - kmovw %k4, %k1 #34.6 - lea (%r15,%r10), %rcx #35.6 - addq %rdi, %rcx #35.6 - lea (%r10,%r8), %rdx #34.6 - kmovw %k4, %k2 #35.6 - lea (%r10,%r15,2), %r10 #35.6 - addq %r10, %rdi #35.6 - lea (%rax,%r15,2), %rsi #34.6 - addq %r8, %rcx #35.6 - addq %r8, %rsi #34.6 - addq %r8, %rdi #35.6 - kmovw %k4, %k3 #34.6 - vscatterqpd %zmm6, (%rdx,%zmm5){%k1} #34.6 - vscatterqpd %zmm6, (%rcx,%zmm5){%k2} #35.6 - vscatterqpd %zmm6, (%rsi,%zmm5){%k3} #34.6 - vscatterqpd %zmm6, (%rdi,%zmm5){%k4} #35.6 - # LOE rbx r12 r14d -..B1.49: # Preds ..B1.79 ..B1.40 ..B1.80 ..B1.83 ..B1.47 - # ..B1.46 - # Execution count [8.00e-01] - decl %r14d #54.9 - decq %rbx #53.6 - movl %r14d, %r13d #54.9 - 
decq %r12 #54.9 - shrl $2, %r13d #54.9 - movl $10, %r15d #43.3 - movl %r13d, %eax #54.9 - movq %rbx, 112(%rsp) #54.9[spill] - vmovsd .L_2il0floatpacket.3(%rip), %xmm1 #44.17 - vmovsd .L_2il0floatpacket.4(%rip), %xmm0 #55.31 - movq %rax, 80(%rsp) #54.9[spill] - movq %r12, 120(%rsp) #54.9[spill] - movl 96(%rsp), %ebx #54.9[spill] - # LOE ebx r13d r14d r15d -..B1.50: # Preds ..B1.87 ..B1.49 ..B1.69 - # Execution count [2.33e+00] - xorl %eax, %eax #47.8 - lea 168(%rsp), %rdi #47.8 - addl %r15d, %r15d #45.3 - lea 176(%rsp), %rsi #47.8 - vzeroupper #47.8 -..___tag_value_MAIN__.46: - call timing_ #47.8 -..___tag_value_MAIN__.47: - # LOE ebx r13d r14d r15d -..B1.51: # Preds ..B1.50 - # Execution count [2.28e+00] - movl $1, %r12d #50.3 - testl %r15d, %r15d #50.3 - jle ..B1.86 # Prob 0% #50.3 - # LOE ebx r12d r13d r14d r15d -..B1.52: # Preds ..B1.51 - # Execution count [2.28e+00] - movq 80+heat_$PHI.0.1(%rip), %rsi #55.35 - xorl %r10d, %r10d #50.3 - movq heat_$PHI.0.1(%rip), %r9 #55.12 - movq %rsi, %rcx #55.50 - movq 104+heat_$PHI.0.1(%rip), %rax #55.35 - subq %rax, %rcx #55.50 - addq %r9, %rax #55.50 - xorl %r11d, %r11d #55.66 - vmovsd .L_2il0floatpacket.4(%rip), %xmm0 #55.66 - lea (%rsi,%r9), %rdi #55.50 - addq %rax, %rcx #55.50 - lea (%r9,%rsi,2), %r8 #55.66 - # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 -..B1.53: # Preds ..B1.66 ..B1.52 - # Execution count [1.27e+01] - movq %r11, %rdx #53.6 - movq %rdx, %rax #53.6 - testl %ebx, %ebx #53.6 - jle ..B1.66 # Prob 2% #53.6 - # LOE rax rdx rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 -..B1.54: # Preds ..B1.53 - # Execution count [1.24e+01] - movl %r10d, 64(%rsp) #[spill] - movl %r15d, 72(%rsp) #[spill] - # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 -..B1.55: # Preds ..B1.64 ..B1.54 - # Execution count [6.88e+01] - testl %r14d, %r14d #54.9 - jle ..B1.64 # Prob 50% #54.9 - # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 -..B1.56: # Preds ..B1.55 - # Execution count [6.88e+01] - xorl %r15d, %r15d #54.9 - movl 
$1, %r12d #54.9 - xorl %r11d, %r11d #54.9 - testl %r13d, %r13d #54.9 - je ..B1.60 # Prob 2% #54.9 - # LOE rax rdx rcx rsi rdi r8 r9 r11 r15 r12d r13d r14d xmm0 -..B1.57: # Preds ..B1.56 - # Execution count [6.74e+01] - movl %r14d, 104(%rsp) #55.66[spill] - lea (%rdi,%rax), %r12 #55.50 - vmovsd (%rax,%rcx), %xmm1 #55.50 - lea (%r9,%rax), %r10 #55.35 - movq 80(%rsp), %r14 #55.66[spill] - lea (%r8,%rax), %rbx #55.66 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY @@ -688,436 +33,4 @@ MAIN__: .byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY .byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 -..B1.59: # Preds ..B1.58 - # Execution count [6.74e+01] - movl 104(%rsp), %r14d #[spill] - lea 1(,%r15,4), %r12d #55.12 - # LOE rax rdx rcx rsi rdi r8 r9 r12d r13d r14d xmm0 -..B1.60: # Preds ..B1.59 ..B1.56 - # Execution count [6.88e+01] - movslq %r12d, %r12 #54.9 - decq %r12 #54.9 - cmpq 120(%rsp), %r12 #54.9[spill] - jae ..B1.64 # Prob 2% #54.9 - # LOE rax rdx rcx rsi rdi r8 r9 r12 r13d r14d xmm0 -..B1.61: # Preds ..B1.60 - # Execution count [6.74e+01] - movq 120(%rsp), %r15 #55.66[spill] - lea (%rdi,%rax), %r11 #55.50 - lea (%r9,%rax), %r10 #55.35 - lea (%r8,%rax), %rbx #55.66 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r15 r13d r14d xmm0 -..B1.62: # Preds ..B1.62 ..B1.61 - # Execution count [2.02e+02] - vmovsd 8(%r10,%r12,8), %xmm1 #55.35 - vaddsd 16(%r11,%r12,8), %xmm1, %xmm2 #55.48 - vaddsd 8(%rbx,%r12,8), %xmm2, %xmm3 #55.63 - vaddsd (%r11,%r12,8), %xmm3, %xmm4 #55.79 - vmulsd %xmm4, %xmm0, %xmm5 #55.12 - vmovsd %xmm5, 8(%r11,%r12,8) #55.12 - incq %r12 #54.9 - cmpq %r15, %r12 #54.9 - jb ..B1.62 # Prob 66% #54.9 - # LOE rax rdx rcx rbx rsi rdi r8 
r9 r10 r11 r12 r15 r13d r14d xmm0 -..B1.64: # Preds ..B1.62 ..B1.55 ..B1.60 - # Execution count [6.88e+01] - incq %rdx #53.6 - addq %rsi, %rax #53.6 - cmpq 112(%rsp), %rdx #53.6[spill] - jb ..B1.55 # Prob 82% #53.6 - # LOE rax rdx rcx rsi rdi r8 r9 r13d r14d xmm0 -..B1.65: # Preds ..B1.64 - # Execution count [1.24e+01] - movl 64(%rsp), %r10d #[spill] - xorl %r11d, %r11d # - movl 72(%rsp), %r15d #[spill] - movl 96(%rsp), %ebx #[spill] - # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 -..B1.66: # Preds ..B1.65 ..B1.53 - # Execution count [1.27e+01] - incl %r10d #50.3 - cmpl %r15d, %r10d #50.3 - jb ..B1.53 # Prob 82% #50.3 - # LOE rcx rsi rdi r8 r9 r11 ebx r10d r13d r14d r15d xmm0 -..B1.67: # Preds ..B1.66 - # Execution count [2.28e+00] - xorl %eax, %eax #66.8 - lea 184(%rsp), %rdi #66.8 - lea 160(%rsp), %rsi #66.8 - lea 1(%r15), %r12d #50.3 -..___tag_value_MAIN__.59: - call timing_ #66.8 -..___tag_value_MAIN__.60: - # LOE ebx r12d r13d r14d r15d -..B1.68: # Preds ..B1.67 - # Execution count [2.33e+00] - vmovsd 184(%rsp), %xmm16 #67.3 - vmovsd .L_2il0floatpacket.3(%rip), %xmm0 #44.17 - vsubsd 168(%rsp), %xmm16, %xmm1 #67.3 - vcomisd %xmm1, %xmm0 #44.17 - jbe ..B1.71 # Prob 18% #44.17 - # LOE ebx r12d r13d r14d r15d -..B1.69: # Preds ..B1.68 - # Execution count [1.91e+00] - cmpl $1000000000, %r15d #44.36 - jl ..B1.50 # Prob 80% #44.36 - # LOE ebx r12d r13d r14d r15d -..B1.71: # Preds ..B1.87 ..B1.68 ..B1.69 - # Execution count [1.00e+00] - cmpl %r12d, %r15d #70.8 - lea (%rsp), %rdi #72.3 - movq $0x1208384ff00, %rdx #72.3 - movl $__STRLITPACK_5.0.1, %ecx #72.3 - lea 64(%rsp), %r8 #72.3 - cmovl %r15d, %r12d #70.8 - movl $-1, %esi #70.8 - xorl %eax, %eax #70.8 - movq $0, (%rdi) #72.3 - movq $14, 64(%rdi) #72.3 - movq $__STRLITPACK_2, 72(%rdi) #72.3 -..___tag_value_MAIN__.61: - call for_write_seq_lis #72.3 -..___tag_value_MAIN__.62: - # LOE r12d -..B1.72: # Preds ..B1.71 - # Execution count [1.00e+00] - movl $__STRLITPACK_6.0.1, %esi #72.3 - lea (%rsp), %rdi #72.3 
- xorl %eax, %eax #72.3 - lea 112(%rsp), %rdx #72.3 - movl %r12d, (%rdx) #72.3 -..___tag_value_MAIN__.63: - call for_write_seq_lis_xmit #72.3 -..___tag_value_MAIN__.64: - # LOE r12d -..B1.73: # Preds ..B1.72 - # Execution count [1.00e+00] - movl $__STRLITPACK_7.0.1, %esi #72.3 - lea (%rsp), %rdi #72.3 - xorl %eax, %eax #72.3 - lea 80(%rsp), %rdx #72.3 - movq $14, (%rdx) #72.3 - movq $__STRLITPACK_1, 8(%rdx) #72.3 -..___tag_value_MAIN__.65: - call for_write_seq_lis_xmit #72.3 -..___tag_value_MAIN__.66: - # LOE r12d -..B1.74: # Preds ..B1.73 - # Execution count [1.00e+00] - movl 152(%rsp), %eax #72.3 - vxorpd %xmm0, %xmm0, %xmm0 #72.49 - decl %eax #72.49 - vxorpd %xmm2, %xmm2, %xmm2 #72.60 - vcvtsi2sd %eax, %xmm0, %xmm0 #72.49 - movl 156(%rsp), %edx #72.49 - vxorpd %xmm7, %xmm7, %xmm7 #72.71 - decl %edx #72.60 - lea (%rsp), %rdi #72.3 - vcvtsi2sd %edx, %xmm2, %xmm2 #72.60 - vcvtsi2sd %r12d, %xmm7, %xmm7 #72.71 - vmulsd .L_2il0floatpacket.7(%rip), %xmm0, %xmm1 #72.59 - vmovsd 184(%rdi), %xmm3 #72.70 - lea 120(%rsp), %rdx #72.3 - vmulsd %xmm2, %xmm1, %xmm4 #72.70 - vsubsd 48(%rdx), %xmm3, %xmm5 #72.83 - vdivsd %xmm5, %xmm4, %xmm6 #72.79 - vmulsd %xmm7, %xmm6, %xmm8 #72.3 - movl $__STRLITPACK_8.0.1, %esi #72.3 - xorl %eax, %eax #72.3 - vmovsd %xmm8, (%rdx) #72.3 -..___tag_value_MAIN__.67: - call for_write_seq_lis_xmit #72.3 -..___tag_value_MAIN__.68: - # LOE -..B1.75: # Preds ..B1.74 - # Execution count [1.00e+00] - movl $__STRLITPACK_9.0.1, %esi #72.3 - lea (%rsp), %rdi #72.3 - xorl %eax, %eax #72.3 - lea 96(%rsp), %rdx #72.3 - movq $6, (%rdx) #72.3 - movq $__STRLITPACK_0, 8(%rdx) #72.3 -..___tag_value_MAIN__.69: - call for_write_seq_lis_xmit #72.3 -..___tag_value_MAIN__.70: - # LOE -..B1.76: # Preds ..B1.75 - # Execution count [1.00e+00] - xorl %esi, %esi #73.3 - movl $__STRLITPACK_10, %edi #73.3 - movq $0x1208384ff00, %rdx #73.3 - xorl %ecx, %ecx #73.3 - xorl %r8d, %r8d #73.3 - xorl %eax, %eax #73.3 -..___tag_value_MAIN__.71: - call for_stop_core #73.3 
-..___tag_value_MAIN__.72: - # LOE -..B1.77: # Preds ..B1.76 - # Execution count [1.00e+00] - xorl %eax, %eax #74.3 - addq $216, %rsp #74.3 - .cfi_restore 3 - popq %rbx #74.3 - .cfi_restore 15 - popq %r15 #74.3 - .cfi_restore 14 - popq %r14 #74.3 - .cfi_restore 13 - popq %r13 #74.3 - .cfi_restore 12 - popq %r12 #74.3 - movq %rbp, %rsp #74.3 - popq %rbp #74.3 - .cfi_def_cfa 7, 8 - .cfi_restore 6 - ret #74.3 - .cfi_def_cfa 6, 16 - .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 - .cfi_offset 6, -16 - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 - # LOE -..B1.78: # Preds ..B1.42 - # Execution count [4.35e-02]: Infreq - xorl %r8d, %r8d #33.3 - jmp ..B1.46 # Prob 100% #33.3 - # LOE rbx r9 r10 r12 r15 eax r8d r11d r13d r14d -..B1.79: # Preds ..B1.41 - # Execution count [4.35e-02]: Infreq - cmpl $1, %r13d #33.3 - jb ..B1.49 # Prob 50% #33.3 - # LOE rbx r10 r12 r15 eax r11d r13d r14d -..B1.80: # Preds ..B1.79 - # Execution count [4.35e-01]: Infreq - xorl %ecx, %ecx #33.3 - testl %r13d, %r13d #33.3 - je ..B1.49 # Prob 56% #33.3 - # LOE rcx rbx r10 r12 r15 eax r11d r13d r14d -..B1.81: # Preds ..B1.80 - # Execution count [4.35e-01]: Infreq - vxorpd %xmm0, %xmm0, %xmm0 #34.19 - vxorpd %xmm1, %xmm1, %xmm1 #34.27 - vcvtsi2sd %eax, %xmm0, %xmm0 #34.19 - vcvtsi2sd %r11d, %xmm1, %xmm1 #34.27 - movq %r15, %rax #34.6 - lea (,%r12,8), %rdx #35.6 - negq %rax #34.6 - movq %rdx, %rsi #35.6 - vdivsd %xmm1, %xmm0, %xmm0 #34.6 - movslq %r13d, %r13 #33.3 - subq %r15, %rsi #35.6 - lea (%rax,%r15,2), %rax #34.6 - # LOE rax 
rdx rcx rbx rsi r10 r12 r13 r15 r14d xmm0 -..B1.82: # Preds ..B1.82 ..B1.81 - # Execution count [4.90e+00]: Infreq - incq %rcx #33.3 - vmovsd %xmm0, (%r10) #34.6 - vmovsd %xmm0, (%r10,%rdx) #35.6 - vmovsd %xmm0, (%rax,%r10) #34.6 - cmpq %r13, %rcx #33.3 - jb ..B1.82 # Prob 91% #33.3 - # LOE rax rdx rcx rbx rsi r10 r12 r13 r15 r14d xmm0 -..B1.83: # Preds ..B1.82 - # Execution count [4.35e-01]: Infreq - lea (%rsi,%r15,2), %rax #35.6 - vmovsd %xmm0, (%rax,%r10) #35.6 - jmp ..B1.49 # Prob 100% #35.6 - # LOE rbx r12 r14d -..B1.86: # Preds ..B1.51 - # Execution count [4.82e-02]: Infreq - xorl %eax, %eax #66.8 - lea 184(%rsp), %rdi #66.8 - lea 160(%rsp), %rsi #66.8 -..___tag_value_MAIN__.87: - call timing_ #66.8 -..___tag_value_MAIN__.88: - # LOE ebx r12d r13d r14d r15d -..B1.87: # Preds ..B1.86 - # Execution count [0.00e+00]: Infreq - vmovsd 184(%rsp), %xmm16 #67.3 - vmovsd .L_2il0floatpacket.3(%rip), %xmm0 #44.17 - vsubsd 168(%rsp), %xmm16, %xmm1 #67.3 - vcomisd %xmm1, %xmm0 #44.17 - ja ..B1.50 # Prob 82% #44.17 - jmp ..B1.71 # Prob 100% #44.17 - # LOE ebx r12d r13d r14d r15d -..B1.89: # Preds ..B1.32 - # Execution count [4.35e-02]: Infreq - xorl %esi, %esi #29.3 - jmp ..B1.36 # Prob 100% #29.3 - # LOE rcx rbx rdi r8 r12 eax edx esi r13d r14d -..B1.90: # Preds ..B1.19 - # Execution count [5.33e-01]: Infreq - incq %r8 #21.3 - addq %r10, %rdx #21.3 - cmpq 80(%rsp), %r8 #21.3[spill] - jb ..B1.10 # Prob 82% #21.3 - jmp ..B1.30 # Prob 100% #21.3 - # LOE rdx rcx rsi r8 r9 r10 r15d ymm0 ymm2 zmm1 -..B1.92: # Preds ..B1.10 - # Execution count [5.33e-01]: Infreq - movq 72(%rsp), %rdi #[spill] - xorl %eax, %eax #22.6 - movq 80(%rsp), %r11 #[spill] - jmp ..B1.25 # Prob 100% # - .align 16,0x90 - # LOE rdx rcx rsi rdi r8 r9 r10 r11 eax r15d ymm0 ymm2 zmm1 - .cfi_endproc -# mark_end; - .type MAIN__,@function - .size MAIN__,.-MAIN__ -..LNMAIN__.0: - .data - .align 32 - .align 32 -heat_$PHI.0.1: - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - 
.long 0x40000080,0x00000000 - .long 0x00000003,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .long 0x00000000,0x00000000 - .section .rodata, "a" - .align 64 - .align 4 -__NLITPACK_0.0.1: - .long 2 - .align 4 -__STRLITPACK_3.0.1: - .long 131849 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_4.0.1: - .long 66313 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_5.0.1: - .long 132152 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_6.0.1: - .long 131337 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_7.0.1: - .long 132152 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_8.0.1: - .long 131376 - .byte 0 - .space 3, 0x00 # pad - .align 4 -__STRLITPACK_9.0.1: - .long 66616 - .byte 0 - .data -# -- End MAIN__ - .section .rodata, "a" - .space 7, 0x00 # pad - .align 64 -.L_2il0floatpacket.8: - .long 0x00000000,0x00000000,0x00000001,0x00000000,0x00000002,0x00000000,0x00000003,0x00000000,0x00000004,0x00000000,0x00000005,0x00000000,0x00000006,0x00000000,0x00000007,0x00000000 - .type .L_2il0floatpacket.8,@object - .size .L_2il0floatpacket.8,64 - .align 32 -.L_2il0floatpacket.0: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 - .type .L_2il0floatpacket.0,@object - .size .L_2il0floatpacket.0,32 - .align 32 -.L_2il0floatpacket.1: - .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008 - .type .L_2il0floatpacket.1,@object - .size .L_2il0floatpacket.1,32 - .align 32 -.L_2il0floatpacket.2: - .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 - .type .L_2il0floatpacket.2,@object - .size .L_2il0floatpacket.2,32 - .align 16 -.L_2il0floatpacket.5: - .long 
0x00000004,0x00000004,0x00000004,0x00000004 - .type .L_2il0floatpacket.5,@object - .size .L_2il0floatpacket.5,16 - .align 16 -.L_2il0floatpacket.6: - .long 0x00000000,0x00000001,0x00000002,0x00000003 - .type .L_2il0floatpacket.6,@object - .size .L_2il0floatpacket.6,16 - .align 8 -.L_2il0floatpacket.3: - .long 0x9999999a,0x3fc99999 - .type .L_2il0floatpacket.3,@object - .size .L_2il0floatpacket.3,8 - .align 8 -.L_2il0floatpacket.4: - .long 0x00000000,0x3fd00000 - .type .L_2il0floatpacket.4,@object - .size .L_2il0floatpacket.4,8 - .align 8 -.L_2il0floatpacket.7: - .long 0xa0b5ed8d,0x3eb0c6f7 - .type .L_2il0floatpacket.7,@object - .size .L_2il0floatpacket.7,8 - .align 8 -.L_2il0floatpacket.9: - .long 0x00000000,0x3ff00000 - .type .L_2il0floatpacket.9,@object - .size .L_2il0floatpacket.9,8 - .section .rodata.str1.4, "aMS",@progbits,1 - .align 4 - .align 4 -__STRLITPACK_2: - .long 1950949411 - .long 1952543333 - .long 1936617321 - .word 8250 - .byte 0 - .type __STRLITPACK_2,@object - .size __STRLITPACK_2,15 - .space 1, 0x00 # pad - .align 4 -__STRLITPACK_1: - .long 1919242272 - .long 1836216166 - .long 1701015137 - .word 8250 - .byte 0 - .type __STRLITPACK_1,@object - .size __STRLITPACK_1,15 - .space 1, 0x00 # pad - .align 4 -__STRLITPACK_0: - .long 1431063840 - .word 29520 - .byte 0 - .type __STRLITPACK_0,@object - .size __STRLITPACK_0,7 - .space 1, 0x00 # pad - .align 4 -__STRLITPACK_10: - .byte 0 - .type __STRLITPACK_10,@object - .size __STRLITPACK_10,1 - .data - .section .note.GNU-stack, "" -# End + diff --git a/examples/gs/gs.s.tx2.clang.s b/examples/gs/gs.s.tx2.clang.s index efce506..2089e7f 100644 --- a/examples/gs/gs.s.tx2.clang.s +++ b/examples/gs/gs.s.tx2.clang.s @@ -1,740 +1,3 @@ - .text - .file "gs-e4c67a.ll" - .section .rodata.cst8,"aM",@progbits,8 - .p2align 3 // -- Begin function MAIN_ -.LCPI0_0: - .xword 4596373779694328218 // double 0.20000000000000001 -.LCPI0_1: - .xword 4696837146684686336 // double 1.0E+6 - .text - .globl MAIN_ - .p2align 6 - .type 
MAIN_,@function -MAIN_: // @MAIN_ - .cfi_startproc -// %bb.0: // %L.entry - stp d9, d8, [sp, #-112]! // 16-byte Folded Spill - stp x28, x27, [sp, #16] // 16-byte Folded Spill - stp x26, x25, [sp, #32] // 16-byte Folded Spill - stp x24, x23, [sp, #48] // 16-byte Folded Spill - stp x22, x21, [sp, #64] // 16-byte Folded Spill - stp x20, x19, [sp, #80] // 16-byte Folded Spill - stp x29, x30, [sp, #96] // 16-byte Folded Spill - sub sp, sp, #432 // =432 - .cfi_def_cfa_offset 544 - .cfi_offset w30, -8 - .cfi_offset w29, -16 - .cfi_offset w19, -24 - .cfi_offset w20, -32 - .cfi_offset w21, -40 - .cfi_offset w22, -48 - .cfi_offset w23, -56 - .cfi_offset w24, -64 - .cfi_offset w25, -72 - .cfi_offset w26, -80 - .cfi_offset w27, -88 - .cfi_offset w28, -96 - .cfi_offset b8, -104 - .cfi_offset b9, -112 - adrp x19, .C283_MAIN_ - add x19, x19, :lo12:.C283_MAIN_ - mov x0, x19 - bl fort_init - adrp x0, .C329_MAIN_ - adrp x1, .C327_MAIN_ - add x0, x0, :lo12:.C329_MAIN_ - add x1, x1, :lo12:.C327_MAIN_ - orr w2, wzr, #0x6 - str xzr, [sp, #424] - bl f90io_src_info03a - adrp x0, .C330_MAIN_ - mov x1, xzr - mov x2, x19 - mov x3, x19 - add x0, x0, :lo12:.C330_MAIN_ - bl f90io_ldr_init - adrp x20, .C334_MAIN_ - adrp x21, .C285_MAIN_ - add x20, x20, :lo12:.C334_MAIN_ - add x21, x21, :lo12:.C285_MAIN_ - mov x0, x20 - mov x1, x21 - mov x2, x19 - add x3, sp, #420 // =420 - bl f90io_ldra - mov x0, x20 - mov x1, x21 - mov x2, x19 - add x3, sp, #416 // =416 - bl f90io_ldra - bl f90io_ldr_end - ldrsw x24, [sp, #416] - ldr w22, [sp, #420] - sxtw x21, w22 - and x8, x24, #0xffffffff - str x8, [sp, #160] // 8-byte Folded Spill - add x9, x21, #1 // =1 - add x8, x24, #1 // =1 - adrp x1, .C366_MAIN_ - mul x23, x9, x8 - stp xzr, xzr, [sp] - adrp x2, .C365_MAIN_ - adrp x6, .C286_MAIN_ - adrp x7, .C284_MAIN_ - mov x3, xzr - mov x5, xzr - add x1, x1, :lo12:.C366_MAIN_ - add x2, x2, :lo12:.C365_MAIN_ - add x6, x6, :lo12:.C286_MAIN_ - add x7, x7, :lo12:.C284_MAIN_ - add x0, sp, #408 // =408 - lsl x20, x23, #1 - 
add x4, sp, #424 // =424 - str x9, [sp, #360] // 8-byte Folded Spill - str x20, [sp, #408] - bl f90_alloc04_chka_i8 - str x22, [sp, #200] // 8-byte Folded Spill - cmp w24, #2 // =2 - b.lt .LBB0_30 -// %bb.1: // %L.LB1_367.preheader - cmp w22, #2 // =2 - b.lt .LBB0_30 -// %bb.2: // %L.LB1_367.preheader64 - mvn w9, w22 - orr w10, wzr, #0xfffffffd - ldr x8, [sp, #424] - cmn w9, #3 // =3 - csinv w9, w10, w22, le - ldr x18, [sp, #160] // 8-byte Folded Reload - add w11, w22, w9 - add x12, x23, x21 - mvn w16, w18 - add w9, w11, #1 // =1 - add x10, x9, #1 // =1 - add x13, x12, x9 - add x9, x21, x9 - add x15, x8, x13, lsl #3 - add x13, x8, x9, lsl #3 - add x4, x8, x21, lsl #3 - add x9, x8, x12, lsl #3 - add x14, x4, #16 // =16 - add x15, x15, #24 // =24 - add x12, x13, #24 // =24 - add x13, x9, #16 // =16 - and x16, x16, #0x1 - cmp w18, #2 // =2 - b.ne .LBB0_10 -// %bb.3: - orr w9, wzr, #0x1 - cbz w16, .LBB0_30 -.LBB0_4: // %L.LB1_367.epil - cmp x10, #8 // =8 - b.lo .LBB0_7 -// %bb.5: // %vector.memcheck.epil - cmp x14, x15 - b.hs .LBB0_27 -// %bb.6: // %vector.memcheck.epil - cmp x13, x12 - b.hs .LBB0_27 -.LBB0_7: - orr w10, wzr, #0x1 - mov w11, w22 -.LBB0_8: // %L.LB1_370.preheader.epil - ldr x14, [sp, #360] // 8-byte Folded Reload - add x13, x9, x24 - add x12, x8, x10, lsl #3 - lsl x13, x13, #3 - add x13, x13, #8 // =8 - madd x9, x9, x14, x10 - madd x12, x13, x14, x12 - add x8, x8, x9, lsl #3 - add w9, w11, #1 // =1 - .p2align 6 -.LBB0_9: // %L.LB1_370.epil - // =>This Inner Loop Header: Depth=1 - str xzr, [x8], #8 - str xzr, [x12], #8 - sub w9, w9, #1 // =1 - cmp w9, #2 // =2 - b.gt .LBB0_9 - b .LBB0_30 -.LBB0_10: // %L.LB1_367.preheader64.new - mvn x17, x16 - cmp x14, x15 - add w1, w11, #2 // =2 - add x5, x23, x21, lsl #1 - add x17, x17, x18 - cset w18, lo - cmp x13, x12 - cset w0, lo - and w18, w18, w0 - and w0, w1, #0x7 - sub x1, x10, x0 - add x6, x9, #8 // =8 - ldr x9, [sp, #360] // 8-byte Folded Reload - movi v0.2d, #0000000000000000 - sub w3, w22, w1 - add x22, 
x8, x5, lsl #3 - lsl x5, x21, #4 - add x25, x8, x5 - add x7, x8, x9, lsl #3 - add x2, x1, #1 // =1 - add x4, x4, #64 // =64 - add x5, x5, #16 // =16 - add x19, x25, #40 // =40 - add x22, x22, #16 // =16 - add x25, x25, #16 // =16 - orr w9, wzr, #0x1 - .p2align 6 -.LBB0_11: // %L.LB1_367 - // =>This Loop Header: Depth=1 - // Child Loop BB0_14 Depth 2 - // Child Loop BB0_17 Depth 2 - // Child Loop BB0_21 Depth 2 - // Child Loop BB0_24 Depth 2 - cmp x10, #8 // =8 - cset w26, lo - orr w26, w26, w18 - tbz w26, #0, .LBB0_13 -// %bb.12: // in Loop: Header=BB0_11 Depth=1 - ldr x28, [sp, #200] // 8-byte Folded Reload - orr w27, wzr, #0x1 - mov w29, w28 - b .LBB0_16 - .p2align 6 -.LBB0_13: // %vector.ph - // in Loop: Header=BB0_11 Depth=1 - mov x27, x4 - mov x28, x1 - .p2align 6 -.LBB0_14: // %vector.body - // Parent Loop BB0_11 Depth=1 - // => This Inner Loop Header: Depth=2 - add x29, x27, x23, lsl #3 - stp q0, q0, [x27, #-48] - stp q0, q0, [x27, #-16] - add x27, x27, #64 // =64 - stp q0, q0, [x29, #-48] - stp q0, q0, [x29, #-16] - subs x28, x28, #8 // =8 - b.ne .LBB0_14 -// %bb.15: // %middle.block - // in Loop: Header=BB0_11 Depth=1 - mov x27, x2 - mov w29, w3 - cbz w0, .LBB0_18 -.LBB0_16: // %L.LB1_370.preheader - // in Loop: Header=BB0_11 Depth=1 - lsl x28, x27, #3 - add x27, x6, x28 - add x28, x7, x28 - add w29, w29, #1 // =1 - .p2align 6 -.LBB0_17: // %L.LB1_370 - // Parent Loop BB0_11 Depth=1 - // => This Inner Loop Header: Depth=2 - str xzr, [x28], #8 - str xzr, [x27], #8 - sub w29, w29, #1 // =1 - cmp w29, #2 // =2 - b.gt .LBB0_17 -.LBB0_18: // %L.LB1_371 - // in Loop: Header=BB0_11 Depth=1 - tbz w26, #0, .LBB0_20 -// %bb.19: // in Loop: Header=BB0_11 Depth=1 - ldr x27, [sp, #200] // 8-byte Folded Reload - orr w26, wzr, #0x1 - mov w28, w27 - b .LBB0_23 - .p2align 6 -.LBB0_20: // %vector.ph.1 - // in Loop: Header=BB0_11 Depth=1 - mov x26, x19 - mov x27, x1 - .p2align 6 -.LBB0_21: // %vector.body.1 - // Parent Loop BB0_11 Depth=1 - // => This Inner Loop Header: 
Depth=2 - add x28, x26, x23, lsl #3 - stp q0, q0, [x26] - stur q0, [x26, #-16] - str q0, [x26, #32] - add x26, x26, #64 // =64 - stp q0, q0, [x28, #-16] - stp q0, q0, [x28, #16] - subs x27, x27, #8 // =8 - b.ne .LBB0_21 -// %bb.22: // %middle.block.1 - // in Loop: Header=BB0_11 Depth=1 - mov x26, x2 - mov w28, w3 - cbz w0, .LBB0_25 -.LBB0_23: // %L.LB1_370.preheader.1 - // in Loop: Header=BB0_11 Depth=1 - lsl x27, x26, #3 - add x26, x22, x27 - add x27, x25, x27 - add w28, w28, #1 // =1 - .p2align 6 -.LBB0_24: // %L.LB1_370.1 - // Parent Loop BB0_11 Depth=1 - // => This Inner Loop Header: Depth=2 - str xzr, [x27], #8 - str xzr, [x26], #8 - sub w28, w28, #1 // =1 - cmp w28, #2 // =2 - b.gt .LBB0_24 -.LBB0_25: // %L.LB1_371.1 - // in Loop: Header=BB0_11 Depth=1 - add x4, x4, x5 - add x6, x6, x5 - add x7, x7, x5 - add x19, x19, x5 - add x22, x22, x5 - add x9, x9, #2 // =2 - add x25, x25, x5 - subs x17, x17, #2 // =2 - b.ne .LBB0_11 -// %bb.26: // %L.LB1_368.loopexit.unr-lcssa.loopexit - ldr x22, [sp, #200] // 8-byte Folded Reload - cbnz w16, .LBB0_4 - b .LBB0_30 -.LBB0_27: // %vector.ph.epil - ldr x16, [sp, #360] // 8-byte Folded Reload - add x12, x9, x24 - movi v0.2d, #0000000000000000 - lsl x12, x12, #3 - add x15, x12, #8 // =8 - mul x14, x9, x16 - add w11, w11, #2 // =2 - and w12, w11, #0x7 - madd x15, x15, x16, x8 - sub x13, x10, x12 - sub w11, w22, w13 - add x10, x13, #1 // =1 - add x14, x8, x14, lsl #3 - add x14, x14, #40 // =40 - add x15, x15, #40 // =40 - .p2align 6 -.LBB0_28: // %vector.body.epil - // =>This Inner Loop Header: Depth=1 - stp q0, q0, [x14, #-32] - stp q0, q0, [x14], #64 - stp q0, q0, [x15, #-32] - stp q0, q0, [x15], #64 - subs x13, x13, #8 // =8 - b.ne .LBB0_28 -// %bb.29: // %middle.block.epil - cbnz w12, .LBB0_8 -.LBB0_30: // %L.LB1_368 - tbnz w22, #31, .LBB0_33 -// %bb.31: // %L.LB1_373.preheader - orr w8, wzr, #0xfffffffe - sub w12, w8, w22 - ldr x10, [sp, #424] - cmn w12, #2 // =2 - csel w8, w12, w8, gt - add w13, w22, w8 - mvn x11, x21 - 
add w14, w13, #2 // =2 - add w9, w22, #1 // =1 - add x12, x14, #1 // =1 - cmp x12, #8 // =8 - b.hs .LBB0_34 -// %bb.32: - ldr x6, [sp, #160] // 8-byte Folded Reload - mov x8, xzr - b .LBB0_43 -.LBB0_33: - ldr x6, [sp, #160] // 8-byte Folded Reload - fmov d0, xzr - tbz w6, #31, .LBB0_47 - b .LBB0_49 -.LBB0_34: // %vector.memcheck159 - add x16, x23, x14 - add x14, x20, x14 - add x17, x10, x16, lsl #3 - sub x16, x16, x21 - add x15, x23, x11 - add x6, x17, #8 // =8 - sub x14, x14, x21 - add x18, x10, x16, lsl #3 - add x16, x20, x11 - add x15, x10, x15, lsl #3 - add x2, x10, x14, lsl #3 - add x4, x10, x12, lsl #3 - add x0, x10, x16, lsl #3 - cmp x15, x2 - cset w7, lo - cmp x0, x18 - cset w19, lo - cmp x15, x4 - cset w14, lo - add x5, x10, x23, lsl #3 - cmp x10, x18 - cset w16, lo - cmp x15, x6 - cset w15, lo - cmp x5, x18 - cset w18, lo - cmp x0, x4 - cset w17, lo - cmp x10, x2 - cset w1, lo - cmp x0, x6 - cset w0, lo - cmp x5, x2 - cset w3, lo - cmp x10, x6 - cset w2, lo - ldr x6, [sp, #160] // 8-byte Folded Reload - mov x8, xzr - cmp x5, x4 - cset w4, lo - and w5, w7, w19 - tbnz w5, #0, .LBB0_43 -// %bb.35: // %vector.memcheck159 - and w14, w14, w16 - tbnz w14, #0, .LBB0_43 -// %bb.36: // %vector.memcheck159 - and w14, w15, w18 - tbnz w14, #0, .LBB0_43 -// %bb.37: // %vector.memcheck159 - and w14, w17, w1 - tbnz w14, #0, .LBB0_43 -// %bb.38: // %vector.memcheck159 - and w14, w0, w3 - tbnz w14, #0, .LBB0_43 -// %bb.39: // %vector.memcheck159 - and w14, w2, w4 - tbnz w14, #0, .LBB0_43 -// %bb.40: // %vector.ph160 - add w8, w13, #3 // =3 - and w13, w8, #0x7 - fmov v0.2d, #1.00000000 - movi v1.2d, #0000000000000000 - sub x8, x12, x13 - lsl x14, x23, #4 - lsl x15, x21, #3 - lsl x12, x23, #3 - sub x14, x14, x15 - sub w9, w9, w8 - sub x12, x12, x15 - mov x15, x10 - mov x16, x8 - .p2align 6 -.LBB0_41: // %vector.body115 - // =>This Inner Loop Header: Depth=1 - add x17, x15, x12 - stur q0, [x17, #-8] - stur q0, [x17, #8] - stur q0, [x17, #24] - stur q0, [x17, #40] - add x17, 
x15, x14 - stur q0, [x17, #-8] - stur q0, [x17, #8] - stur q0, [x17, #24] - stur q0, [x17, #40] - add x17, x15, x23, lsl #3 - stp q1, q1, [x15] - stp q1, q1, [x15, #32] - add x15, x15, #64 // =64 - stp q1, q1, [x17] - stp q1, q1, [x17, #32] - subs x16, x16, #8 // =8 - b.ne .LBB0_41 -// %bb.42: // %middle.block116 - cbz w13, .LBB0_46 -.LBB0_43: // %L.LB1_373.preheader189 - add x15, x8, x23 - add x16, x8, x20 - add x14, x15, x11 - add x11, x16, x11 - mov x12, xzr - add w9, w9, #1 // =1 - add x13, x10, x8, lsl #3 - add x14, x10, x14, lsl #3 - add x11, x10, x11, lsl #3 - add x10, x10, x15, lsl #3 - orr x15, xzr, #0x3ff0000000000000 - .p2align 6 -.LBB0_44: // %L.LB1_373 - // =>This Inner Loop Header: Depth=1 - lsl x16, x12, #3 - add x12, x12, #1 // =1 - sub w9, w9, #1 // =1 - str x15, [x14, x16] - str x15, [x11, x16] - str xzr, [x13, x16] - str xzr, [x10, x16] - cmp w9, #1 // =1 - b.gt .LBB0_44 -// %bb.45: // %L.LB1_374.loopexit.loopexit - add w8, w8, w12 -.LBB0_46: // %L.LB1_374.loopexit - scvtf d0, w8 - tbnz w6, #31, .LBB0_49 -.LBB0_47: // %L.LB1_382.preheader - ldr s1, [sp, #420] - ldr x8, [sp, #424] - lsl x10, x21, #3 - add x9, x10, #8 // =8 - sshll v1.2d, v1.2s, #0 - add x10, x10, x23, lsl #3 - add w11, w24, #2 // =2 - scvtf d1, d1 - fdiv d0, d0, d1 - .p2align 6 -.LBB0_48: // %L.LB1_382 - // =>This Inner Loop Header: Depth=1 - str d0, [x8] - sub w11, w11, #1 // =1 - str d0, [x8, x23, lsl #3] - str d0, [x8, x21, lsl #3] - str d0, [x8, x10] - add x8, x8, x9 - cmp w11, #1 // =1 - b.gt .LBB0_48 -.LBB0_49: // %L.LB1_383 - sub w9, w6, #1 // =1 - and w25, w9, #0x7 - mvn x9, x25 - add x9, x9, x6 - str x9, [sp, #168] // 8-byte Folded Spill - lsl x9, x21, #6 - lsl x28, x21, #1 - mov w19, #10 - add x29, x9, #64 // =64 - add x9, x21, #2 // =2 - str x9, [sp, #152] // 8-byte Folded Spill - add x9, x28, #4 // =4 - str x9, [sp, #144] // 8-byte Folded Spill - add x9, x28, x21 - add x10, x9, #4 // =4 - str x10, [sp, #136] // 8-byte Folded Spill - add x10, x28, #3 // =3 - str x10, 
[sp, #128] // 8-byte Folded Spill - add x10, x9, #5 // =5 - lsl x9, x9, #1 - str x10, [sp, #120] // 8-byte Folded Spill - lsl x10, x21, #2 - add x11, x10, #5 // =5 - str x11, [sp, #112] // 8-byte Folded Spill - add x11, x10, #6 // =6 - add x10, x10, x21 - str x11, [sp, #104] // 8-byte Folded Spill - add x11, x10, #6 // =6 - add x10, x10, #7 // =7 - lsl x8, x21, #3 - add x24, x8, #8 // =8 - fmov d9, #0.25000000 - sub x23, x6, #2 // =2 - add w20, w22, #1 // =1 - stp x25, x23, [sp, #176] // 16-byte Folded Spill - stp x10, x11, [sp, #88] // 16-byte Folded Spill - add x10, x9, #7 // =7 - add x9, x9, #8 // =8 - stp x9, x10, [sp, #72] // 16-byte Folded Spill - sub x9, x8, x21 - add x10, x9, #8 // =8 - add x9, x9, #9 // =9 - stp x9, x10, [sp, #56] // 16-byte Folded Spill - add x9, x8, #9 // =9 - str x9, [sp, #48] // 8-byte Folded Spill - add x9, x8, #10 // =10 - add x8, x8, x21 - add x8, x8, #10 // =10 - stp x8, x9, [sp, #32] // 16-byte Folded Spill - adrp x8, .LCPI0_0 - ldr d8, [x8, :lo12:.LCPI0_0] - .p2align 6 -.LBB0_50: // %L.LB1_471 - // =>This Loop Header: Depth=1 - // Child Loop BB0_55 Depth 2 - // Child Loop BB0_59 Depth 3 - // Child Loop BB0_60 Depth 4 - // Child Loop BB0_62 Depth 4 - // Child Loop BB0_64 Depth 4 - // Child Loop BB0_66 Depth 4 - // Child Loop BB0_68 Depth 4 - // Child Loop BB0_70 Depth 4 - // Child Loop BB0_72 Depth 4 - // Child Loop BB0_74 Depth 4 - // Child Loop BB0_78 Depth 3 - // Child Loop BB0_79 Depth 4 - lsl w8, w19, #1 - add x0, sp, #400 // =400 - add x1, sp, #392 // =392 - str w8, [sp, #196] // 4-byte Folded Spill - bl timing_ - cbz w19, .LBB0_53 -// %bb.51: // %L.LB1_392.preheader - // in Loop: Header=BB0_50 Depth=1 - ldr x8, [sp, #160] // 8-byte Folded Reload - cmp w8, #2 // =2 - b.ge .LBB0_54 -// %bb.52: // %L.LB1_392.us.preheader - // in Loop: Header=BB0_50 Depth=1 - ldr w9, [sp, #196] // 4-byte Folded Reload - mvn w8, w9 - cmn w8, #2 // =2 - orr w8, wzr, #0xfffffffe - csinv w8, w8, w9, le - add w8, w8, w9 - add w26, w8, #3 // =3 - b 
.LBB0_82 - .p2align 6 -.LBB0_53: // in Loop: Header=BB0_50 Depth=1 - orr w26, wzr, #0x1 - b .LBB0_82 - .p2align 6 -.LBB0_54: // %L.LB1_392.preheader90 - // in Loop: Header=BB0_50 Depth=1 - ldr x10, [sp, #424] - ldr x8, [sp, #360] // 8-byte Folded Reload - add x9, x10, x8, lsl #3 - orr w26, wzr, #0x1 - ldr x8, [sp, #152] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #328] // 8-byte Folded Spill - ldr x8, [sp, #144] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #320] // 8-byte Folded Spill - ldr x8, [sp, #136] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #312] // 8-byte Folded Spill - ldr x8, [sp, #128] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #304] // 8-byte Folded Spill - ldr x8, [sp, #120] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #296] // 8-byte Folded Spill - ldr x8, [sp, #112] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #288] // 8-byte Folded Spill - ldr x8, [sp, #104] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #280] // 8-byte Folded Spill - ldr x8, [sp, #96] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #272] // 8-byte Folded Spill - ldr x8, [sp, #88] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #264] // 8-byte Folded Spill - ldr x8, [sp, #80] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #256] // 8-byte Folded Spill - ldr x8, [sp, #72] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #248] // 8-byte Folded Spill - ldr x8, [sp, #64] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #240] // 8-byte Folded Spill - ldr x8, [sp, #56] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #232] // 8-byte Folded Spill - ldr x8, [sp, #48] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #224] // 8-byte Folded Spill - ldr x8, [sp, #40] // 8-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #216] // 
8-byte Folded Spill - add x8, x10, #8 // =8 - str x8, [sp, #352] // 8-byte Folded Spill - add x8, x10, #16 // =16 - stp x10, x8, [sp, #336] // 16-byte Folded Spill - ldr x8, [sp, #32] // 8-byte Folded Reload - ldr w30, [sp, #196] // 4-byte Folded Reload - add x8, x10, x8, lsl #3 - str x8, [sp, #208] // 8-byte Folded Spill - .p2align 6 -.LBB0_55: // %L.LB1_392 - // Parent Loop BB0_50 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB0_59 Depth 3 - // Child Loop BB0_60 Depth 4 - // Child Loop BB0_62 Depth 4 - // Child Loop BB0_64 Depth 4 - // Child Loop BB0_66 Depth 4 - // Child Loop BB0_68 Depth 4 - // Child Loop BB0_70 Depth 4 - // Child Loop BB0_72 Depth 4 - // Child Loop BB0_74 Depth 4 - // Child Loop BB0_78 Depth 3 - // Child Loop BB0_79 Depth 4 - cmp w22, #2 // =2 - b.lt .LBB0_81 -// %bb.56: // %L.LB1_395.preheader - // in Loop: Header=BB0_55 Depth=2 - cmp x23, #7 // =7 - b.hs .LBB0_58 -// %bb.57: // in Loop: Header=BB0_55 Depth=2 - mov x11, xzr - orr w12, wzr, #0x1 - cbnz w25, .LBB0_77 - b .LBB0_81 - .p2align 6 -.LBB0_58: // %L.LB1_395.preheader199 - // in Loop: Header=BB0_55 Depth=2 - ldp x10, x5, [sp, #208] // 16-byte Folded Reload - ldp x4, x3, [sp, #224] // 16-byte Folded Reload - ldp x2, x1, [sp, #240] // 16-byte Folded Reload - ldp x0, x18, [sp, #256] // 16-byte Folded Reload - ldp x17, x16, [sp, #272] // 16-byte Folded Reload - ldp x15, x14, [sp, #288] // 16-byte Folded Reload - ldp x13, x25, [sp, #304] // 16-byte Folded Reload - ldp x19, x27, [sp, #320] // 16-byte Folded Reload - ldr x8, [sp, #336] // 8-byte Folded Reload - ldr x6, [sp, #168] // 8-byte Folded Reload - mov x11, xzr - orr w12, wzr, #0x1 - str w26, [sp, #372] // 4-byte Folded Spill - .p2align 6 -.LBB0_59: // %L.LB1_395 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // => This Loop Header: Depth=3 - // Child Loop BB0_60 Depth 4 - // Child Loop BB0_62 Depth 4 - // Child Loop BB0_64 Depth 4 - // Child Loop BB0_66 Depth 4 - // Child Loop BB0_68 Depth 4 - // Child 
Loop BB0_70 Depth 4 - // Child Loop BB0_72 Depth 4 - // Child Loop BB0_74 Depth 4 - mul x7, x24, x11 - mov w22, w20 - ldr d0, [x9, x7] - mov x7, x8 - .p2align 6 -.LBB0_60: // %L.LB1_398 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x7, x28, lsl #3 - add x26, x7, x21, lsl #3 - ldr d1, [x23, #24] - ldr d2, [x26, #24] - ldr d3, [x7, #8]! - fadd d0, d1, d0 - fadd d1, d2, d3 - sub w22, w22, #1 // =1 - fadd d0, d0, d1 - fmul d0, d0, d9 - str d0, [x26, #16] - cmp w22, #2 // =2 - b.gt .LBB0_60 -// %bb.61: // %L.LB1_399 - // in Loop: Header=BB0_59 Depth=3 - orr x7, x11, #0x1 - mov x22, x19 - mul x7, x24, x7 - mov x23, x27 - mov w26, w20 - ldr d0, [x9, x7] - mov x7, x25 - .p2align 6 // OSACA-BEGIN .LBB0_62: // %L.LB1_398.1 // Parent Loop BB0_50 Depth=1 @@ -754,441 +17,3 @@ MAIN_: // @MAIN_ cmp w26, #2 // =2 b.gt .LBB0_62 // OSACA-END -// %bb.63: // %L.LB1_399.1 - // in Loop: Header=BB0_59 Depth=3 - orr x7, x11, #0x2 - mov x22, x14 - mul x7, x24, x7 - mov x23, x13 - mov w26, w20 - ldr d0, [x9, x7] - mov x7, x15 - .p2align 6 -.LBB0_64: // %L.LB1_398.2 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - ldr d1, [x7], #8 - fadd d0, d1, d0 - ldr d2, [x22] - ldr d3, [x23], #8 - fadd d2, d2, d3 - fadd d0, d0, d2 - sub w26, w26, #1 // =1 - fmul d0, d0, d9 - stur d0, [x22, #-8] - add x22, x22, #8 // =8 - cmp w26, #2 // =2 - b.gt .LBB0_64 -// %bb.65: // %L.LB1_399.2 - // in Loop: Header=BB0_59 Depth=3 - orr x22, x11, #0x3 - mov x7, xzr - mul x22, x24, x22 - ldr d0, [x9, x22] - mov w22, w20 - .p2align 6 -.LBB0_66: // %L.LB1_398.3 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x16, x7 - sub w22, w22, #1 // =1 - ldr d1, [x17, x7] - ldr d2, [x25, x7] - ldr d3, [x23] - fadd d2, d3, d2 - fadd d0, d1, d0 - add 
x7, x7, #8 // =8 - fadd d0, d0, d2 - fmul d0, d0, d9 - stur d0, [x23, #-8] - cmp w22, #2 // =2 - b.gt .LBB0_66 -// %bb.67: // %L.LB1_399.3 - // in Loop: Header=BB0_59 Depth=3 - orr x22, x11, #0x4 - mov x7, xzr - mul x22, x24, x22 - ldr d0, [x9, x22] - mov w22, w20 - .p2align 6 -.LBB0_68: // %L.LB1_398.4 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x18, x7 - sub w22, w22, #1 // =1 - ldr d1, [x0, x7] - ldr d2, [x15, x7] - ldr d3, [x23] - fadd d2, d3, d2 - fadd d0, d1, d0 - add x7, x7, #8 // =8 - fadd d0, d0, d2 - fmul d0, d0, d9 - stur d0, [x23, #-8] - cmp w22, #2 // =2 - b.gt .LBB0_68 -// %bb.69: // %L.LB1_399.4 - // in Loop: Header=BB0_59 Depth=3 - mov w22, #5 - orr x22, x11, x22 - mul x22, x24, x22 - mov x7, xzr - ldr d0, [x9, x22] - mov w22, w20 - .p2align 6 -.LBB0_70: // %L.LB1_398.5 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x1, x7 - sub w22, w22, #1 // =1 - ldr d1, [x2, x7] - ldr d2, [x17, x7] - ldr d3, [x23] - fadd d2, d3, d2 - fadd d0, d1, d0 - add x7, x7, #8 // =8 - fadd d0, d0, d2 - fmul d0, d0, d9 - stur d0, [x23, #-8] - cmp w22, #2 // =2 - b.gt .LBB0_70 -// %bb.71: // %L.LB1_399.5 - // in Loop: Header=BB0_59 Depth=3 - orr x22, x11, #0x6 - mov x7, xzr - mul x22, x24, x22 - ldr d0, [x9, x22] - mov w22, w20 - .p2align 6 -.LBB0_72: // %L.LB1_398.6 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x3, x7 - sub w22, w22, #1 // =1 - ldr d1, [x4, x7] - ldr d2, [x0, x7] - ldr d3, [x23] - fadd d2, d3, d2 - fadd d0, d1, d0 - add x7, x7, #8 // =8 - fadd d0, d0, d2 - fmul d0, d0, d9 - stur d0, [x23, #-8] - cmp w22, #2 // =2 - b.gt .LBB0_72 -// %bb.73: // %L.LB1_399.6 - // in Loop: Header=BB0_59 Depth=3 - orr x22, x11, #0x7 - mov x7, xzr - mul x22, x24, x22 - add x12, 
x12, #8 // =8 - ldr d0, [x9, x22] - mov w22, w20 - .p2align 6 -.LBB0_74: // %L.LB1_398.7 - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop BB0_59 Depth=3 - // => This Inner Loop Header: Depth=4 - add x23, x5, x7 - sub w22, w22, #1 // =1 - ldr d1, [x10, x7] - ldr d2, [x2, x7] - ldr d3, [x23] - fadd d2, d3, d2 - fadd d0, d1, d0 - add x7, x7, #8 // =8 - fadd d0, d0, d2 - fmul d0, d0, d9 - stur d0, [x23, #-8] - cmp w22, #2 // =2 - b.gt .LBB0_74 -// %bb.75: // %L.LB1_399.7 - // in Loop: Header=BB0_59 Depth=3 - add x8, x8, x29 - add x27, x27, x29 - add x19, x19, x29 - add x25, x25, x29 - add x13, x13, x29 - add x11, x11, #8 // =8 - add x14, x14, x29 - add x15, x15, x29 - add x16, x16, x29 - add x17, x17, x29 - add x18, x18, x29 - add x0, x0, x29 - add x1, x1, x29 - add x2, x2, x29 - add x3, x3, x29 - add x4, x4, x29 - add x5, x5, x29 - add x10, x10, x29 - subs x6, x6, #8 // =8 - b.ne .LBB0_59 -// %bb.76: // %L.LB1_396.loopexit.unr-lcssa.loopexit - // in Loop: Header=BB0_55 Depth=2 - ldp x25, x23, [sp, #176] // 16-byte Folded Reload - ldr x22, [sp, #200] // 8-byte Folded Reload - ldr w26, [sp, #372] // 4-byte Folded Reload - cbz w25, .LBB0_81 -.LBB0_77: // %L.LB1_395.epil.preheader - // in Loop: Header=BB0_55 Depth=2 - ldr x13, [sp, #360] // 8-byte Folded Reload - mul x8, x13, x12 - ldr x14, [sp, #344] // 8-byte Folded Reload - sub x10, x12, #1 // =1 - add x12, x12, #1 // =1 - mul x10, x13, x10 - mul x12, x13, x12 - add x8, x14, x8, lsl #3 - mov x13, x25 - ldr x14, [sp, #352] // 8-byte Folded Reload - add x10, x14, x10, lsl #3 - add x12, x14, x12, lsl #3 - .p2align 6 -.LBB0_78: // %L.LB1_395.epil - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // => This Loop Header: Depth=3 - // Child Loop BB0_79 Depth 4 - mul x14, x24, x11 - mov x15, x8 - mov x16, x10 - ldr d0, [x9, x14] - mov x14, x12 - mov w17, w20 - .p2align 6 -.LBB0_79: // %L.LB1_398.epil - // Parent Loop BB0_50 Depth=1 - // Parent Loop BB0_55 Depth=2 - // Parent Loop 
BB0_78 Depth=3 - // => This Inner Loop Header: Depth=4 - ldr d1, [x14], #8 - fadd d0, d1, d0 - ldr d2, [x15] - ldr d3, [x16], #8 - fadd d2, d2, d3 - fadd d0, d0, d2 - sub w17, w17, #1 // =1 - fmul d0, d0, d9 - stur d0, [x15, #-8] - add x15, x15, #8 // =8 - cmp w17, #2 // =2 - b.gt .LBB0_79 -// %bb.80: // %L.LB1_399.epil - // in Loop: Header=BB0_78 Depth=3 - add x10, x10, x24 - add x8, x8, x24 - add x12, x12, x24 - add x11, x11, #1 // =1 - subs x13, x13, #1 // =1 - b.ne .LBB0_78 -.LBB0_81: // %L.LB1_396 - // in Loop: Header=BB0_55 Depth=2 - add w26, w26, #1 // =1 - subs w30, w30, #1 // =1 - b.gt .LBB0_55 -.LBB0_82: // %L.LB1_393 - // in Loop: Header=BB0_50 Depth=1 - add x0, sp, #384 // =384 - add x1, sp, #376 // =376 - bl timing_ - ldr d0, [sp, #384] - ldr d1, [sp, #400] - fsub d0, d0, d1 - ldr w19, [sp, #196] // 4-byte Folded Reload - mov w8, #51712 - movk w8, #15258, lsl #16 - fcmp d0, d8 - ccmp w19, w8, #2, lt - b.lo .LBB0_50 -// %bb.83: // %L.LB1_391 - adrp x0, .C345_MAIN_ - adrp x1, .C327_MAIN_ - cmp w26, w19 - add x0, x0, :lo12:.C345_MAIN_ - add x1, x1, :lo12:.C327_MAIN_ - orr w2, wzr, #0x6 - csel w19, w19, w26, gt - bl f90io_src_info03a - adrp x20, .C283_MAIN_ - add x20, x20, :lo12:.C283_MAIN_ - adrp x0, .C326_MAIN_ - mov x1, xzr - mov x2, x20 - mov x3, x20 - add x0, x0, :lo12:.C326_MAIN_ - bl f90io_print_init - adrp x0, .C348_MAIN_ - add x0, x0, :lo12:.C348_MAIN_ - orr w1, wzr, #0xe - orr w2, wzr, #0xe - bl f90io_sc_ch_ldw - mov w0, w19 - mov w1, #25 - bl f90io_sc_i_ldw - adrp x0, .C349_MAIN_ - add x0, x0, :lo12:.C349_MAIN_ - orr w1, wzr, #0xe - orr w2, wzr, #0xe - bl f90io_sc_ch_ldw - ldr w8, [sp, #416] - sub w8, w8, #1 // =1 - orr w0, wzr, #0x1c - scvtf d0, w19 - scvtf d1, w8 - ldr w8, [sp, #420] - sub w8, w8, #1 // =1 - scvtf d2, w8 - fmul d0, d1, d0 - ldr d1, [sp, #384] - adrp x8, .LCPI0_1 - fmul d0, d0, d2 - ldr d2, [sp, #400] - fsub d1, d1, d2 - ldr d2, [x8, :lo12:.LCPI0_1] - fmul d1, d1, d2 - fdiv d0, d0, d1 - bl f90io_sc_d_ldw - adrp x0, .C351_MAIN_ 
- add x0, x0, :lo12:.C351_MAIN_ - orr w1, wzr, #0xe - orr w2, wzr, #0x6 - bl f90io_sc_ch_ldw - bl f90io_ldw_end - mov x0, x20 - mov x1, xzr - mov x2, xzr - bl f90_stop08a - add sp, sp, #432 // =432 - ldp x29, x30, [sp, #96] // 16-byte Folded Reload - ldp x20, x19, [sp, #80] // 16-byte Folded Reload - ldp x22, x21, [sp, #64] // 16-byte Folded Reload - ldp x24, x23, [sp, #48] // 16-byte Folded Reload - ldp x26, x25, [sp, #32] // 16-byte Folded Reload - ldp x28, x27, [sp, #16] // 16-byte Folded Reload - ldp d9, d8, [sp], #112 // 16-byte Folded Reload - ret -.Lfunc_end0: - .size MAIN_, .Lfunc_end0-MAIN_ - .cfi_endproc - // -- End function - .type .C351_MAIN_,@object // @.C351_MAIN_ - .section .rodata,"a",@progbits - .p2align 2 -.C351_MAIN_: - .asciz " MLUPs" - .size .C351_MAIN_, 7 - - .type .C349_MAIN_,@object // @.C349_MAIN_ - .p2align 2 -.C349_MAIN_: - .asciz " Performance: " - .size .C349_MAIN_, 15 - - .type .C348_MAIN_,@object // @.C348_MAIN_ - .p2align 2 -.C348_MAIN_: - .asciz "# Iterations: " - .size .C348_MAIN_, 15 - - .type .C326_MAIN_,@object // @.C326_MAIN_ - .p2align 2 -.C326_MAIN_: - .word 6 // 0x6 - .size .C326_MAIN_, 4 - - .type .C345_MAIN_,@object // @.C345_MAIN_ - .p2align 2 -.C345_MAIN_: - .word 72 // 0x48 - .size .C345_MAIN_, 4 - - .type .C366_MAIN_,@object // @.C366_MAIN_ - .p2align 3 -.C366_MAIN_: - .xword 28 // 0x1c - .size .C366_MAIN_, 8 - - .type .C365_MAIN_,@object // @.C365_MAIN_ - .p2align 3 -.C365_MAIN_: - .xword 8 // 0x8 - .size .C365_MAIN_, 8 - - .type .C286_MAIN_,@object // @.C286_MAIN_ - .p2align 3 -.C286_MAIN_: - .xword 1 // 0x1 - .size .C286_MAIN_, 8 - - .type .C285_MAIN_,@object // @.C285_MAIN_ - .p2align 2 -.C285_MAIN_: - .word 1 // 0x1 - .size .C285_MAIN_, 4 - - .type .C334_MAIN_,@object // @.C334_MAIN_ - .p2align 2 -.C334_MAIN_: - .word 25 // 0x19 - .size .C334_MAIN_, 4 - - .type .C330_MAIN_,@object // @.C330_MAIN_ - .p2align 2 -.C330_MAIN_: - .word 5 // 0x5 - .size .C330_MAIN_, 4 - - .type .C327_MAIN_,@object // @.C327_MAIN_ - 
.p2align 2 -.C327_MAIN_: - .asciz "gs.f90" - .size .C327_MAIN_, 7 - - .type .C329_MAIN_,@object // @.C329_MAIN_ - .p2align 2 -.C329_MAIN_: - .word 12 // 0xc - .size .C329_MAIN_, 4 - - .type .C284_MAIN_,@object // @.C284_MAIN_ - .p2align 3 -.C284_MAIN_: - .xword 0 // 0x0 - .size .C284_MAIN_, 8 - - .type .C283_MAIN_,@object // @.C283_MAIN_ - .p2align 2 -.C283_MAIN_: - .word 0 // 0x0 - .size .C283_MAIN_, 4 - - - .section ".note.GNU-stack","",@progbits - .addrsig - .addrsig_sym .C351_MAIN_ - .addrsig_sym .C349_MAIN_ - .addrsig_sym .C348_MAIN_ - .addrsig_sym .C326_MAIN_ - .addrsig_sym .C345_MAIN_ - .addrsig_sym .C366_MAIN_ - .addrsig_sym .C365_MAIN_ - .addrsig_sym .C286_MAIN_ - .addrsig_sym .C285_MAIN_ - .addrsig_sym .C334_MAIN_ - .addrsig_sym .C330_MAIN_ - .addrsig_sym .C327_MAIN_ - .addrsig_sym .C329_MAIN_ - .addrsig_sym .C284_MAIN_ - .addrsig_sym .C283_MAIN_ diff --git a/examples/gs/gs.s.tx2.gcc.s b/examples/gs/gs.s.tx2.gcc.s index d8ddc17..acb8e1e 100644 --- a/examples/gs/gs.s.tx2.gcc.s +++ b/examples/gs/gs.s.tx2.gcc.s @@ -1,519 +1,3 @@ - .arch armv8.1-a+crypto+crc - .file "gs.f90" - .text - .align 2 - .p2align 4,,15 - .type MAIN__, %function -MAIN__: -.LFB0: - .cfi_startproc - sub sp, sp, #720 - .cfi_def_cfa_offset 720 - mov x0, 128 - mov w1, 12 - stp x29, x30, [sp] - .cfi_offset 29, -720 - .cfi_offset 30, -712 - mov x29, sp - movk x0, 0x5, lsl 32 - stp x19, x20, [sp, 16] - .cfi_offset 19, -704 - .cfi_offset 20, -696 - adrp x19, .LC0 - add x19, x19, :lo12:.LC0 - stp x21, x22, [sp, 32] - stp x0, x19, [sp, 192] - add x0, sp, 192 - stp x23, x24, [sp, 48] - stp x25, x26, [sp, 64] - stp x27, x28, [sp, 80] - str w1, [sp, 208] - .cfi_offset 21, -688 - .cfi_offset 22, -680 - .cfi_offset 23, -672 - .cfi_offset 24, -664 - .cfi_offset 25, -656 - .cfi_offset 26, -648 - .cfi_offset 27, -640 - .cfi_offset 28, -632 - bl _gfortran_st_read - mov w2, 4 - add x1, sp, 144 - add x0, sp, 192 - bl _gfortran_transfer_integer - mov w2, 4 - add x1, sp, 148 - add x0, sp, 192 - bl 
_gfortran_transfer_integer - add x0, sp, 192 - bl _gfortran_st_read_done - ldp w24, w23, [sp, 144] - mov x3, -1 - mov x5, 4611686018427387904 - mov x2, 2305843009213693951 - sxtw x25, w24 - sxtw x20, w23 - cmp x25, 0 - csel x21, x25, x3, ge - cmp x20, 0 - csel x4, x20, x3, ge - add x21, x21, 1 - add x6, x4, 1 - mul x26, x6, x21 - cmp x26, x5 - lsl x27, x26, 1 - lsl x7, x26, 4 - cset w8, eq - cmp x27, x2 - cinc w9, w8, gt - cmp x25, 0 - ccmp x20, 0, 1, ge - csel x10, x7, xzr, ge - cbnz w9, .L159 - cmp x10, 0 - mov x28, 1 - csel x0, x10, x28, ne - bl malloc - stp d8, d9, [sp, 96] - .cfi_offset 73, -616 - .cfi_offset 72, -624 - cbz x0, .L160 - cmp w23, 1 - ble .L5 - cmp w24, 1 - ble .L6 - sub w12, w24, #2 - sub x4, x27, x26 - lsl x22, x21, 3 - mov w8, w28 - add x13, x21, x12 - mvn x14, x12 - add x10, x4, x21 - mov x6, x12 - add x15, x0, x13, lsl 3 - lsl x17, x14, 3 - mov x9, x21 - add x5, x15, 16 -.L10: - add x1, x17, x5 - sub x18, x10, x9 - sub x16, x5, x1 - sub x30, x16, #8 - lsr x3, x30, 3 - add x2, x3, 1 - ands x7, x2, 7 - beq .L7 - cmp x7, 1 - beq .L104 - cmp x7, 2 - beq .L105 - cmp x7, 3 - beq .L106 - cmp x7, 4 - beq .L107 - cmp x7, 5 - beq .L108 - cmp x7, 6 - beq .L109 - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L109: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L108: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L107: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L106: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L105: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 -.L104: - str xzr, [x1] - str xzr, [x1, x18, lsl 3] - add x1, x1, 8 - cmp x1, x5 - beq .L155 -.L7: - str xzr, [x1] - add x28, x1, 8 - add x16, x1, 16 - add x15, x1, 24 - str xzr, [x1, x18, lsl 3] - add x14, x1, 32 - add x13, x1, 40 - add x12, x1, 48 - str xzr, [x1, 8] - add x11, x1, 56 - add x1, x1, 64 - str xzr, [x28, x18, lsl 3] - str xzr, [x1, -48] - str xzr, [x16, x18, lsl 3] - str xzr, [x1, -40] - 
str xzr, [x15, x18, lsl 3] - str xzr, [x1, -32] - str xzr, [x14, x18, lsl 3] - str xzr, [x1, -24] - str xzr, [x13, x18, lsl 3] - str xzr, [x1, -16] - str xzr, [x12, x18, lsl 3] - str xzr, [x1, -8] - str xzr, [x11, x18, lsl 3] - cmp x1, x5 - bne .L7 -.L155: - add w8, w8, 1 - add x10, x10, x21 - add x9, x9, x21 - add x5, x5, x22 - cmp w23, w8 - bne .L10 -.L9: - mul x20, x21, x20 - fmov d0, 1.0e+0 - sub x17, x26, x27 - and w18, w24, 7 - mov x2, 1 - add x30, x4, x20 - neg x3, x20, lsl 3 - add x7, x0, x30, lsl 3 - str d0, [x7, x17, lsl 3] - add x1, x7, 8 - str d0, [x7] - str xzr, [x0] - str xzr, [x7, x3] - cmp w24, 1 - blt .L151 - cbz w18, .L13 - cmp w18, 1 - beq .L119 - cmp w18, 2 - beq .L120 - cmp w18, 3 - beq .L121 - cmp w18, 4 - beq .L122 - cmp w18, 5 - beq .L123 - cmp w18, 6 - beq .L124 - str d0, [x1, x17, lsl 3] - mov x2, 2 - str d0, [x1] - str xzr, [x0, 8] - str xzr, [x1, x3] - add x1, x1, 8 -.L124: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 -.L123: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 -.L122: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 -.L121: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 -.L120: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 -.L119: - str d0, [x1, x17, lsl 3] - str d0, [x1] - str xzr, [x0, x2, lsl 3] - add x2, x2, 1 - str xzr, [x1, x3] - add x1, x1, 8 - cmp w24, w2 - blt .L151 -.L13: - str d0, [x1, x17, lsl 3] - add x28, x1, 8 - add x15, x2, 1 - add x16, x1, 16 - str d0, [x1] - add x13, x2, 2 - add x14, x1, 24 - add x12, x2, 3 - str xzr, [x0, x2, lsl 3] - add x9, x1, 32 - add x4, x2, 4 - add x8, x1, 40 - str xzr, [x1, x3] - add x11, x2, 5 - add x5, x1, 48 - add 
x10, x2, 6 - str d0, [x28, x17, lsl 3] - add x20, x1, 56 - add x18, x2, 7 - add x2, x2, 8 - str d0, [x1, 8] - add x1, x1, 64 - str xzr, [x0, x15, lsl 3] - str xzr, [x28, x3] - str d0, [x16, x17, lsl 3] - str d0, [x1, -48] - str xzr, [x0, x13, lsl 3] - str xzr, [x16, x3] - str d0, [x14, x17, lsl 3] - str d0, [x1, -40] - str xzr, [x0, x12, lsl 3] - str xzr, [x14, x3] - str d0, [x9, x17, lsl 3] - str d0, [x1, -32] - str xzr, [x0, x4, lsl 3] - str xzr, [x9, x3] - str d0, [x8, x17, lsl 3] - str d0, [x1, -24] - str xzr, [x0, x11, lsl 3] - str xzr, [x8, x3] - str d0, [x5, x17, lsl 3] - str d0, [x1, -16] - str xzr, [x0, x10, lsl 3] - str xzr, [x5, x3] - str d0, [x20, x17, lsl 3] - str d0, [x1, -8] - str xzr, [x0, x18, lsl 3] - str xzr, [x20, x3] - cmp w24, w2 - bge .L13 -.L151: - cmp w24, 0 - csel w17, w24, wzr, ge - add w11, w17, 1 -.L8: - tbnz w23, #31, .L11 -.L12: - scvtf d2, w11 - scvtf d1, w24 - sub x30, x27, x26 - sub x25, x25, x26 - add x26, x25, x26 - add x27, x25, x27 - mov w3, 1 - and w7, w23, 7 - add x2, x0, x22 - fdiv d3, d2, d1 - str d3, [x0] - str d3, [x0, x30, lsl 3] - str d3, [x0, x26, lsl 3] - str d3, [x0, x27, lsl 3] - cmp w23, w3 - blt .L11 - cbz w7, .L15 - cmp w7, 1 - beq .L113 - cmp w7, 2 - beq .L114 - cmp w7, 3 - beq .L115 - cmp w7, 4 - beq .L116 - cmp w7, 5 - beq .L117 - cmp w7, 6 - beq .L118 - str d3, [x2] - mov w3, 2 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 -.L118: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 -.L117: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 -.L116: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 -.L115: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, 
x2, x22 -.L114: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 -.L113: - str d3, [x2] - add w3, w3, 1 - str d3, [x2, x30, lsl 3] - str d3, [x2, x26, lsl 3] - str d3, [x2, x27, lsl 3] - add x2, x2, x22 - cmp w23, w3 - blt .L11 -.L15: - str d3, [x2] - add x1, x2, x22 - add w3, w3, 8 - str d3, [x2, x30, lsl 3] - add x28, x1, x22 - str d3, [x2, x26, lsl 3] - add x15, x28, x22 - str d3, [x2, x27, lsl 3] - add x14, x15, x22 - str d3, [x1] - add x16, x14, x22 - str d3, [x1, x30, lsl 3] - add x13, x16, x22 - str d3, [x1, x26, lsl 3] - add x12, x13, x22 - str d3, [x1, x27, lsl 3] - add x2, x12, x22 - str d3, [x28] - str d3, [x28, x30, lsl 3] - str d3, [x28, x26, lsl 3] - str d3, [x28, x27, lsl 3] - str d3, [x15] - str d3, [x15, x30, lsl 3] - str d3, [x15, x26, lsl 3] - str d3, [x15, x27, lsl 3] - str d3, [x14] - str d3, [x14, x30, lsl 3] - str d3, [x14, x26, lsl 3] - str d3, [x14, x27, lsl 3] - str d3, [x16] - str d3, [x16, x30, lsl 3] - str d3, [x16, x26, lsl 3] - str d3, [x16, x27, lsl 3] - str d3, [x13] - str d3, [x13, x30, lsl 3] - str d3, [x13, x26, lsl 3] - str d3, [x13, x27, lsl 3] - str d3, [x12] - str d3, [x12, x30, lsl 3] - str d3, [x12, x26, lsl 3] - str d3, [x12, x27, lsl 3] - cmp w23, w3 - bge .L15 -.L11: - add x6, x21, x6, uxtw - adrp x4, .LC6 - add x9, x22, 8 - fmov d9, 2.5e-1 - ldr d8, [x4, #:lo12:.LC6] - add x27, x0, x9 - mov w20, 51711 - add x0, x0, x6, lsl 3 - lsl x28, x21, 1 - mov w26, 10 - movk w20, 0x3b9a, lsl 16 - add x25, x0, 16 -.L14: - add x0, sp, 176 - add x1, sp, 160 - lsl w26, w26, 1 - bl timing_ - mov w0, 0 - .p2align 4 -.L18: - cmp w23, 1 - ble .L21 - cmp w24, 1 - ble .L21 - mov x11, 0 - mov w10, 1 - mov x7, x25 - mov x9, x28 - mov x8, x21 - mov x6, x27 - .p2align 4 -.L22: - sub x5, x7, x6 - add w10, w10, 1 - mov x15, x6 - sub x18, x11, x8 - sub x17, x5, #8 - sub x30, x9, x8 - ldr d30, [x6, -8] - lsr x3, x17, 3 - add x2, x3, 1 - ands x1, x2, 3 - beq .L20 - cmp 
x1, 1 - beq .L111 - cmp x1, 2 - beq .L112 - ldr d4, [x6, x18, lsl 3] - ldr d6, [x6, 8] - ldr d5, [x6, x30, lsl 3] - fadd d7, d4, d6 - fadd d16, d7, d30 - fadd d17, d16, d5 - fmul d30, d17, d9 - str d30, [x15], 8 -.L112: - ldr d18, [x15, x18, lsl 3] - ldr d20, [x15, 8] - ldr d19, [x15, x30, lsl 3] - fadd d21, d18, d20 - fadd d22, d21, d30 - fadd d23, d22, d19 - fmul d30, d23, d9 - str d30, [x15], 8 -.L111: - ldr d24, [x15, x18, lsl 3] - ldr d26, [x15, 8] - ldr d25, [x15, x30, lsl 3] - fadd d27, d24, d26 - fadd d28, d27, d30 - fadd d29, d28, d25 - fmul d30, d29, d9 - str d30, [x15], 8 - cmp x7, x15 - beq .L154 // OSACA-BEGIN .L20: ldr d31, [x15, x18, lsl 3] @@ -555,183 +39,3 @@ MAIN__: cmp x7, x15 bne .L20 // OSACA-END -.L154: - add x6, x6, x22 - add x11, x11, x21 - add x8, x8, x21 - add x9, x9, x21 - add x7, x7, x22 - cmp w23, w10 - bne .L22 -.L21: - add w4, w0, 1 - cmp w26, w4 - beq .L17 - mov w0, w4 - b .L18 -.L17: - add w12, w0, 2 - add x1, sp, 152 - add x0, sp, 168 - str w12, [sp, 124] - str w12, [sp, 140] - bl timing_ - ldp d3, d1, [sp, 168] - ldr w5, [sp, 124] - fsub d4, d3, d1 - fcmpe d4, d8 - ccmp w26, w20, 0, lt - ble .L14 - cmp w5, w26 - ble .L23 - str w26, [sp, 140] -.L23: - mov x21, 128 - add x0, sp, 192 - mov w22, 72 - movk x21, 0x6, lsl 32 - str w22, [sp, 208] - sub w24, w24, #1 - sub w23, w23, #1 - stp x21, x19, [sp, 192] - bl _gfortran_st_write - adrp x19, .LANCHOR0 - adrp x27, .LC7 - add x28, x19, :lo12:.LANCHOR0 - mov x2, 14 - add x0, sp, 192 - mov x1, x28 - bl _gfortran_transfer_character_write - mov w2, 4 - add x1, sp, 140 - add x0, sp, 192 - bl _gfortran_transfer_integer_write - add x1, x28, 16 - mov x2, 14 - add x0, sp, 192 - bl _gfortran_transfer_character_write - ldr w25, [sp, 140] - scvtf d9, w24 - scvtf d8, w23 - ldr d5, [x27, #:lo12:.LC7] - ldp d18, d19, [sp, 168] - mov w2, 8 - add x1, sp, 184 - add x0, sp, 192 - scvtf d7, w25 - fsub d20, d18, d19 - fmul d6, d9, d8 - fmul d16, d7, d5 - fmul d17, d6, d16 - fdiv d21, d17, d20 - str d21, [sp, 
184] - bl _gfortran_transfer_real_write - add x1, x28, 32 - mov x2, 6 - add x0, sp, 192 - bl _gfortran_transfer_character_write - add x0, sp, 192 - bl _gfortran_st_write_done - mov w2, 0 - mov x1, 0 - mov x0, 0 - bl _gfortran_stop_string -.L5: - tbnz w24, #31, .L25 -.L157: - sub x4, x27, x26 - lsl x22, x21, 3 - sub w6, w24, #2 - b .L9 -.L6: - tbz w24, #31, .L157 - mov w11, 0 - lsl x22, x21, 3 - sub w6, w24, #2 - b .L12 -.L159: - .cfi_restore 72 - .cfi_restore 73 - adrp x26, .LC1 - stp d8, d9, [sp, 96] - .cfi_offset 73, -616 - .cfi_offset 72, -624 - add x0, x26, :lo12:.LC1 - bl _gfortran_runtime_error -.L25: - mov w11, 0 - lsl x22, x21, 3 - sub w6, w24, #2 - b .L8 -.L160: - adrp x20, .LC2 - add x0, x20, :lo12:.LC2 - bl _gfortran_os_error - .cfi_endproc -.LFE0: - .size MAIN__, .-MAIN__ - .section .text.startup,"ax",@progbits - .align 2 - .p2align 4,,15 - .global main - .type main, %function -main: -.LFB1: - .cfi_startproc - stp x29, x30, [sp, -16]! - .cfi_def_cfa_offset 16 - .cfi_offset 29, -16 - .cfi_offset 30, -8 - mov x29, sp - bl _gfortran_set_args - adrp x1, .LANCHOR0 - mov w0, 7 - add x2, x1, :lo12:.LANCHOR0 - add x1, x2, 40 - bl _gfortran_set_options - bl MAIN__ - .cfi_endproc -.LFE1: - .size main, .-main - .section .rodata - .align 3 - .set .LANCHOR0,. 
+ 0 -.LC3: - .ascii "# Iterations: " - .zero 2 -.LC4: - .ascii " Performance: " - .zero 2 -.LC5: - .ascii " MLUPs" - .zero 2 - .type options.8.2753, %object - .size options.8.2753, 28 -options.8.2753: - .word 68 - .word 8191 - .word 0 - .word 1 - .word 1 - .word 0 - .word 31 - .section .rodata.cst8,"aM",@progbits,8 - .align 3 -.LC6: - .word 2576980378 - .word 1070176665 -.LC7: - .word 2696277389 - .word 1051772663 - .section .rodata.str1.8,"aMS",@progbits,1 - .align 3 -.LC0: - .string "gs.f90" - .zero 1 -.LC1: - .string "Integer overflow when calculating the amount of memory to allocate" - .zero 5 -.LC2: - .string "Allocation would exceed memory limit" - .ident "GCC: (ARM-build-8) 8.2.0" - .section .note.GNU-stack,"",@progbits diff --git a/examples/gs/gs.s.zen.gcc.s b/examples/gs/gs.s.zen.gcc.s index e4b854f..6aa9cf8 100644 --- a/examples/gs/gs.s.zen.gcc.s +++ b/examples/gs/gs.s.zen.gcc.s @@ -1,649 +1,4 @@ - .file "gs.f90" - .text - .section .rodata.str1.1,"aMS",@progbits,1 -.LC0: - .string "gs.f90" - .section .rodata.str1.8,"aMS",@progbits,1 - .align 8 -.LC1: - .string "Integer overflow when calculating the amount of memory to allocate" - .align 8 -.LC2: - .string "Allocation would exceed memory limit" - .section .rodata.str1.1 -.LC8: - .string "# Iterations: " -.LC9: - .string " Performance: " -.LC11: - .string " MLUPs" - .text - .p2align 4 - .type MAIN__, @function -MAIN__: -.LFB0: - .cfi_startproc - pushq %r15 - .cfi_def_cfa_offset 16 - .cfi_offset 15, -16 - pushq %r14 - .cfi_def_cfa_offset 24 - .cfi_offset 14, -24 - movabsq $21474836608, %rax - movq $-1, %r14 - pushq %r13 - .cfi_def_cfa_offset 32 - .cfi_offset 13, -32 - pushq %r12 - .cfi_def_cfa_offset 40 - .cfi_offset 12, -40 - pushq %rbp - .cfi_def_cfa_offset 48 - .cfi_offset 6, -48 - pushq %rbx - .cfi_def_cfa_offset 56 - .cfi_offset 3, -56 - subq $664, %rsp - .cfi_def_cfa_offset 720 - leaq 128(%rsp), %rdi - movq %rax, 128(%rsp) - movq $.LC0, 136(%rsp) - movl $12, 144(%rsp) - call _gfortran_st_read - movl 
$4, %edx - leaq 80(%rsp), %rsi - leaq 128(%rsp), %rdi - call _gfortran_transfer_integer - movl $4, %edx - leaq 84(%rsp), %rsi - leaq 128(%rsp), %rdi - call _gfortran_transfer_integer - leaq 128(%rsp), %rdi - call _gfortran_st_read_done - movslq 80(%rsp), %rdi - movabsq $4611686018427387904, %rcx - movabsq $2305843009213693951, %r8 - movslq 84(%rsp), %rsi - testq %rdi, %rdi - movq %rdi, %r15 - movq %rdi, %r12 - movq %rdi, 16(%rsp) - cmovs %r14, %r15 - testq %rsi, %rsi - movq %rsi, %rbp - movq %rsi, 24(%rsp) - cmovns %rsi, %r14 - leaq 1(%r15), %rbx - xorl %edx, %edx - incq %r14 - imulq %rbx, %r14 - cmpq %rcx, %r14 - leaq (%r14,%r14), %r13 - sete %dl - cmpq %r8, %r13 - setg %r9b - movzbl %r9b, %r10d - addl %r10d, %edx - testq %rdi, %rdi - js .L37 - testq %rsi, %rsi - js .L37 - movq %r14, %r11 - salq $4, %r11 -.L2: - testl %edx, %edx - jne .L282 - testq %r11, %r11 - movl $1, %edi - cmovne %r11, %rdi - call malloc - testq %rax, %rax - je .L283 - cmpl $1, %ebp - jle .L5 - cmpl $1, %r12d - jle .L6 - leal -1(%r12), %r9d - movq %r13, %r10 - leal -2(%r12), %r11d - leaq (%rax,%r14,8), %rdi - movl %r9d, %ecx - movl %r9d, %r8d - subq %r14, %r10 - leaq 8(%rax), %rdx - shrl %ecx - andl $-2, %r8d - movl %r9d, 32(%rsp) - addq %rbx, %r10 - salq $4, %rcx - movl %r8d, 36(%rsp) - addq $2, %r15 - movq %rdi, 40(%rsp) - movq %rcx, 8(%rsp) - orl $1, %r9d - movl $1, (%rsp) - movq %r11, %r8 - movq %r11, 48(%rsp) - movq %rdx, 56(%rsp) - vxorps %xmm0, %xmm0, %xmm0 -.L14: - leaq -1(%r15), %rcx - leaq 1(%r10), %rsi - cmpq %rcx, %rsi - setl %dil - cmpq %r15, %r10 - setg %r11b - orl %r11d, %edi - andl $1, %edi - je .L9 - cmpl $3, %r8d - jbe .L9 - movq 8(%rsp), %r11 - leaq 0(,%r15,8), %rsi - xorl %edx, %edx - leaq (%rax,%rsi), %rdi - addq 40(%rsp), %rsi - subq $16, %r11 - shrq $4, %r11 - incq %r11 - andl $7, %r11d - je .L13 - cmpq $1, %r11 - je .L176 - cmpq $2, %r11 - je .L177 - cmpq $3, %r11 - je .L178 - cmpq $4, %r11 - je .L179 - cmpq $5, %r11 - je .L180 - cmpq $6, %r11 - je .L181 - vmovups 
%xmm0, (%rdi) - movl $16, %edx - vmovups %xmm0, (%rsi) -.L181: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx -.L180: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx -.L179: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx -.L178: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx -.L177: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx -.L176: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - addq $16, %rdx - cmpq 8(%rsp), %rdx - je .L155 -.L13: - vmovups %xmm0, (%rdi,%rdx) - vmovups %xmm0, (%rsi,%rdx) - vmovups %xmm0, 16(%rdi,%rdx) - vmovups %xmm0, 16(%rsi,%rdx) - vmovups %xmm0, 32(%rdi,%rdx) - vmovups %xmm0, 32(%rsi,%rdx) - vmovups %xmm0, 48(%rdi,%rdx) - vmovups %xmm0, 48(%rsi,%rdx) - vmovups %xmm0, 64(%rdi,%rdx) - vmovups %xmm0, 64(%rsi,%rdx) - vmovups %xmm0, 80(%rdi,%rdx) - vmovups %xmm0, 80(%rsi,%rdx) - vmovups %xmm0, 96(%rdi,%rdx) - vmovups %xmm0, 96(%rsi,%rdx) - vmovups %xmm0, 112(%rdi,%rdx) - vmovups %xmm0, 112(%rsi,%rdx) - subq $-128, %rdx - cmpq 8(%rsp), %rdx - jne .L13 -.L155: - movl 36(%rsp), %esi - cmpl %esi, 32(%rsp) - je .L16 - addq %r9, %rcx - movq $0x000000000, (%rax,%rcx,8) - leaq (%r10,%r9), %rcx - movq $0x000000000, (%rax,%rcx,8) -.L16: - incl (%rsp) - addq %rbx, %r10 - addq %rbx, %r15 - movl (%rsp), %r11d - cmpl %r11d, %ebp - jne .L14 -.L11: - movq 24(%rsp), %r11 - movl $0, %edx - movq %r13, %rsi - imulq %rbx, %r11 - testl %r12d, %r12d - cmovns %r12d, %edx - subq %r14, %rsi - movq %r11, %r10 - leaq 1(%r11), %r9 - subq %r14, %r10 - movq %r9, (%rsp) - leaq (%r10,%r13), %rcx - leaq 1(%r13,%r10), %r15 - leaq 1(%rsi), %r10 - cmpq %rcx, %r10 - setl %r9b - cmpq %rsi, %r15 - setl %dil - orl %edi, %r9d - cmpq %rcx, (%rsp) - setl %dil - cmpq %r15, %r11 - setg 8(%rsp) - orw 8(%rsp), %di - andl %edi, %r9d - cmpq %r10, %r11 - setg %dil - cmpq %rsi, (%rsp) - setl %sil - orl %edi, %esi - andl %r9d, %esi - andl $1, %esi 
- je .L20 - cmpq $2, %r10 - seta %r10b - cmpq $2, %r15 - seta %r15b - andl %r15d, %r10d - cmpq $2, (%rsp) - seta %dil - cmpl $2, %edx - seta %r9b - andl %r9d, %edi - andl %edi, %r10d - andl $1, %r10d - je .L20 - incl %edx - leaq (%rax,%rcx,8), %rdi - xorl %ecx, %ecx - vmovaps .LC4(%rip), %xmm1 - movl %edx, %r15d - leaq (%rax,%r11,8), %r9 - leaq (%rax,%r14,8), %rsi - vxorps %xmm2, %xmm2, %xmm2 - shrl %r15d - salq $4, %r15 - leaq -16(%r15), %r10 - shrq $4, %r10 - incq %r10 - andl $7, %r10d - je .L22 - cmpq $1, %r10 - je .L188 - cmpq $2, %r10 - je .L189 - cmpq $3, %r10 - je .L190 - cmpq $4, %r10 - je .L191 - cmpq $5, %r10 - je .L192 - cmpq $6, %r10 - je .L193 - vmovups %xmm1, (%r9) - movl $16, %ecx - vmovups %xmm1, (%rdi) - vmovups %xmm2, (%rax) - vmovups %xmm2, (%rsi) -.L193: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx -.L192: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx -.L191: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx -.L190: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx -.L189: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx -.L188: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - addq $16, %rcx - cmpq %r15, %rcx - je .L113 -.L22: - vmovups %xmm1, (%r9,%rcx) - vmovups %xmm1, (%rdi,%rcx) - vmovups %xmm2, (%rax,%rcx) - vmovups %xmm2, (%rsi,%rcx) - vmovups %xmm1, 16(%r9,%rcx) - vmovups %xmm1, 16(%rdi,%rcx) - vmovups %xmm2, 16(%rax,%rcx) - vmovups %xmm2, 16(%rsi,%rcx) - vmovups %xmm1, 32(%r9,%rcx) - vmovups %xmm1, 32(%rdi,%rcx) - vmovups %xmm2, 32(%rax,%rcx) - vmovups %xmm2, 
32(%rsi,%rcx) - vmovups %xmm1, 48(%r9,%rcx) - vmovups %xmm1, 48(%rdi,%rcx) - vmovups %xmm2, 48(%rax,%rcx) - vmovups %xmm2, 48(%rsi,%rcx) - vmovups %xmm1, 64(%r9,%rcx) - vmovups %xmm1, 64(%rdi,%rcx) - vmovups %xmm2, 64(%rax,%rcx) - vmovups %xmm2, 64(%rsi,%rcx) - vmovups %xmm1, 80(%r9,%rcx) - vmovups %xmm1, 80(%rdi,%rcx) - vmovups %xmm2, 80(%rax,%rcx) - vmovups %xmm2, 80(%rsi,%rcx) - vmovups %xmm1, 96(%r9,%rcx) - vmovups %xmm1, 96(%rdi,%rcx) - vmovups %xmm2, 96(%rax,%rcx) - vmovups %xmm2, 96(%rsi,%rcx) - vmovups %xmm1, 112(%r9,%rcx) - vmovups %xmm1, 112(%rdi,%rcx) - vmovups %xmm2, 112(%rax,%rcx) - vmovups %xmm2, 112(%rsi,%rcx) - subq $-128, %rcx - cmpq %r15, %rcx - jne .L22 -.L113: - movl %edx, %r9d - andl $-2, %r9d - testb $1, %dl - je .L10 - vmovsd .LC5(%rip), %xmm3 - movslq %r9d, %r15 - movq %r15, %rdi - leaq (%r11,%r15), %r11 - subq %r14, %rdi - leaq 0(%r13,%rdi), %rsi - vmovsd %xmm3, (%rax,%r11,8) - addq %r14, %r11 - vmovsd %xmm3, (%rax,%r11,8) - movq $0x000000000, (%rax,%r15,8) - movq $0x000000000, (%rax,%rsi,8) -.L10: - testl %ebp, %ebp - js .L18 -.L19: - vxorps %xmm5, %xmm5, %xmm5 - movq 16(%rsp), %r15 - leaq 0(,%rbx,8), %rdi - movl $1, %r9d - vcvtsi2sdl %edx, %xmm5, %xmm6 - vcvtsi2sdl %r12d, %xmm5, %xmm7 - vdivsd %xmm7, %xmm6, %xmm8 - leaq (%rax,%rdi), %r10 - movq %r15, %rdx - subq %r14, %rdx - leaq 0(%r13,%rdx), %rcx - subq %r14, %r13 - movl %ebp, %r14d - andl $7, %r14d - vmovsd %xmm8, (%rax) - vmovsd %xmm8, (%rax,%r13,8) - vmovsd %xmm8, (%rax,%r15,8) - vmovsd %xmm8, (%rax,%rcx,8) - cmpl $1, %ebp - jl .L18 - testl %r14d, %r14d - je .L27 - cmpl $1, %r14d - je .L200 - cmpl $2, %r14d - je .L201 - cmpl $3, %r14d - je .L202 - cmpl $4, %r14d - je .L203 - cmpl $5, %r14d - je .L204 - cmpl $6, %r14d - je .L205 - vmovsd %xmm8, (%r10) - movl $2, %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L205: - vmovsd %xmm8, (%r10) - incl %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd 
%xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L204: - vmovsd %xmm8, (%r10) - incl %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L203: - vmovsd %xmm8, (%r10) - incl %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L202: - vmovsd %xmm8, (%r10) - incl %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L201: - vmovsd %xmm8, (%r10) - incl %r9d - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 -.L200: - incl %r9d - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - cmpl %r9d, %ebp - jl .L18 -.L27: - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - addl $8, %r9d - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - vmovsd %xmm8, (%r10) - vmovsd %xmm8, (%r10,%r13,8) - vmovsd %xmm8, (%r10,%r15,8) - vmovsd %xmm8, (%r10,%rcx,8) - addq %rdi, %r10 - cmpl %r9d, %ebp - jge .L27 -.L18: - movl %r8d, %r8d - leaq 0(,%rbx,8), %r13 - leaq (%rbx,%rbx), %rsi - 
movl $10, (%rsp) - addq %rbx, %r8 - leaq 8(%rax,%r13), %r11 - movq %rsi, 8(%rsp) - leaq 16(%rax,%r8,8), %rax - movq %r11, 16(%rsp) - movq %rax, 24(%rsp) -.L26: - leaq 96(%rsp), %rsi - leaq 112(%rsp), %rdi - xorl %r15d, %r15d - sall (%rsp) - call timing_ - movq .LC6(%rip), %rdx - vmovq %rdx, %xmm9 - .p2align 4 - .p2align 3 -.L30: - cmpl $1, %ebp - jle .L33 - cmpl $1, %r12d - jle .L33 - movq 24(%rsp), %r8 - movq 8(%rsp), %r14 - movq %rbx, %r9 - xorl %r11d, %r11d - movq 16(%rsp), %rdi - movl $1, %r10d - .p2align 4 - .p2align 3 -.L34: - movq %r8, %rdx - movq %r11, %rsi - movq %r14, %rcx - incl %r10d - subq %rdi, %rdx - subq %r9, %rsi - subq %r9, %rcx - vmovsd -8(%rdi), %xmm8 - subq $8, %rdx - movq %rdi, %rax - shrq $3, %rdx - incq %rdx - andl $7, %edx - je .L32 - cmpq $1, %rdx - je .L194 - cmpq $2, %rdx - je .L195 - cmpq $3, %rdx - je .L196 - cmpq $4, %rdx - je .L197 - cmpq $5, %rdx - je .L198 - cmpq $6, %rdx - je .L199 - vmovsd (%rdi,%rsi,8), %xmm10 - vaddsd (%rdi,%rcx,8), %xmm8, %xmm12 - leaq 8(%rdi), %rax - vaddsd 8(%rdi), %xmm10, %xmm11 - vaddsd %xmm12, %xmm11, %xmm13 - vmulsd %xmm9, %xmm13, %xmm8 - vmovsd %xmm8, (%rdi) -.L199: - vmovsd (%rax,%rsi,8), %xmm14 - vaddsd (%rax,%rcx,8), %xmm8, %xmm0 - vaddsd 8(%rax), %xmm14, %xmm15 - addq $8, %rax - vaddsd %xmm0, %xmm15, %xmm1 - vmulsd %xmm9, %xmm1, %xmm8 - vmovsd %xmm8, -8(%rax) -.L198: - vmovsd (%rax,%rsi,8), %xmm2 - vaddsd (%rax,%rcx,8), %xmm8, %xmm4 - vaddsd 8(%rax), %xmm2, %xmm3 - addq $8, %rax - vaddsd %xmm4, %xmm3, %xmm5 - vmulsd %xmm9, %xmm5, %xmm8 - vmovsd %xmm8, -8(%rax) -.L197: - vmovsd (%rax,%rsi,8), %xmm6 - vaddsd (%rax,%rcx,8), %xmm8, %xmm8 - vaddsd 8(%rax), %xmm6, %xmm7 - addq $8, %rax - vaddsd %xmm8, %xmm7, %xmm10 - vmulsd %xmm9, %xmm10, %xmm8 - vmovsd %xmm8, -8(%rax) -.L196: - vmovsd (%rax,%rsi,8), %xmm11 - vaddsd (%rax,%rcx,8), %xmm8, %xmm13 - vaddsd 8(%rax), %xmm11, %xmm12 - addq $8, %rax - vaddsd %xmm13, %xmm12, %xmm14 - vmulsd %xmm9, %xmm14, %xmm8 - vmovsd %xmm8, -8(%rax) -.L195: - vmovsd 
(%rax,%rsi,8), %xmm15 - vaddsd (%rax,%rcx,8), %xmm8, %xmm0 - vaddsd 8(%rax), %xmm15, %xmm1 - addq $8, %rax - vaddsd %xmm0, %xmm1, %xmm2 - vmulsd %xmm9, %xmm2, %xmm8 - vmovsd %xmm8, -8(%rax) -.L194: - vmovsd (%rax,%rsi,8), %xmm3 - vaddsd (%rax,%rcx,8), %xmm8, %xmm5 - vaddsd 8(%rax), %xmm3, %xmm4 - addq $8, %rax - vaddsd %xmm5, %xmm4, %xmm6 - vmulsd %xmm9, %xmm6, %xmm8 - vmovsd %xmm8, -8(%rax) - cmpq %r8, %rax - je .L266 -# OSACA-BEGIN + # OSACA-BEGIN .L32: vmovsd (%rax,%rsi,8), %xmm7 leaq 8(%rax), %rdx @@ -703,371 +58,4 @@ MAIN__: vmovsd %xmm8, -8(%rax) cmpq %r8, %rax jne .L32 -# OSACA-END -.L266: - addq %r13, %rdi - addq %rbx, %r11 - addq %rbx, %r9 - addq %rbx, %r14 - addq %r13, %r8 - cmpl %r10d, %ebp - jne .L34 -.L33: - leal 1(%r15), %r8d - cmpl (%rsp), %r8d - je .L29 - movl %r8d, %r15d - jmp .L30 -.L9: - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - leaq (%rax,%r15,8), %r11 - leaq (%rdi,%r15), %rdx - movq %r10, %rdi - leaq (%rsi,%rdx,8), %rdx - subq %rcx, %rdi - movq %rdx, %rsi - movq %rdi, %rcx - subq %r11, %rsi - subq $8, %rsi - shrq $3, %rsi - incq %rsi - andl $7, %esi - je .L17 - cmpq $1, %rsi - je .L182 - cmpq $2, %rsi - je .L183 - cmpq $3, %rsi - je .L184 - cmpq $4, %rsi - je .L185 - cmpq $5, %rsi - je .L186 - cmpq $6, %rsi - je .L187 - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rdi,8) - addq $8, %r11 -.L187: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 -.L186: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 -.L185: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 -.L184: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 -.L183: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 -.L182: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - addq $8, %r11 - cmpq %rdx, %r11 - je .L16 -.L17: - movq $0x000000000, (%r11) - movq $0x000000000, (%r11,%rcx,8) - movq $0x000000000, 8(%r11) - movq 
$0x000000000, 8(%r11,%rcx,8) - movq $0x000000000, 16(%r11) - movq $0x000000000, 16(%r11,%rcx,8) - movq $0x000000000, 24(%r11) - movq $0x000000000, 24(%r11,%rcx,8) - movq $0x000000000, 32(%r11) - movq $0x000000000, 32(%r11,%rcx,8) - movq $0x000000000, 40(%r11) - movq $0x000000000, 40(%r11,%rcx,8) - movq $0x000000000, 48(%r11) - movq $0x000000000, 48(%r11,%rcx,8) - movq $0x000000000, 56(%r11) - movq $0x000000000, 56(%r11,%rcx,8) - addq $64, %r11 - cmpq %rdx, %r11 - jne .L17 - jmp .L16 -.L37: - xorl %r11d, %r11d - jmp .L2 - .p2align 4 - .p2align 3 -.L29: - addl $2, %r15d - leaq 88(%rsp), %rsi - leaq 104(%rsp), %rdi - movl %r15d, 76(%rsp) - call timing_ - vmovsd 104(%rsp), %xmm9 - vsubsd 112(%rsp), %xmm9, %xmm3 - vcomisd .LC7(%rip), %xmm3 - jnb .L40 - cmpl $999999999, (%rsp) - jle .L26 -.L40: - movl (%rsp), %ebx - cmpl %ebx, %r15d - jle .L36 - movl %ebx, 76(%rsp) -.L36: - leaq 128(%rsp), %rdi - movabsq $25769803904, %r12 - vmovsd %xmm3, (%rsp) - movq $.LC0, 136(%rsp) - movl $72, 144(%rsp) - movq %r12, 128(%rsp) - decl %ebp - call _gfortran_st_write - movl $14, %edx - movl $.LC8, %esi - leaq 128(%rsp), %rdi - call _gfortran_transfer_character_write - movl $4, %edx - leaq 76(%rsp), %rsi - leaq 128(%rsp), %rdi - call _gfortran_transfer_integer_write - movl $14, %edx - movl $.LC9, %esi - leaq 128(%rsp), %rdi - call _gfortran_transfer_character_write - vxorps %xmm2, %xmm2, %xmm2 - vmovsd (%rsp), %xmm11 - movl $8, %edx - vcvtsi2sdl 76(%rsp), %xmm2, %xmm8 - vmulsd .LC10(%rip), %xmm8, %xmm7 - vcvtsi2sdl 32(%rsp), %xmm2, %xmm4 - vcvtsi2sdl %ebp, %xmm2, %xmm5 - vmulsd %xmm5, %xmm4, %xmm6 - leaq 120(%rsp), %rsi - leaq 128(%rsp), %rdi - vmulsd %xmm7, %xmm6, %xmm10 - vdivsd %xmm11, %xmm10, %xmm12 - vmovsd %xmm12, 120(%rsp) - call _gfortran_transfer_real_write - movl $6, %edx - movl $.LC11, %esi - leaq 128(%rsp), %rdi - call _gfortran_transfer_character_write - leaq 128(%rsp), %rdi - call _gfortran_st_write_done - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - call 
_gfortran_stop_string -.L5: - testl %r12d, %r12d - js .L38 -.L280: - leal -1(%r12), %r15d - leal -2(%r12), %r8d - movl %r15d, 32(%rsp) - jmp .L11 -.L6: - testl %r12d, %r12d - jns .L280 - leal -1(%r12), %esi - xorl %edx, %edx - leal -2(%r12), %r8d - movl %esi, 32(%rsp) - jmp .L19 -.L20: - vmovsd .LC5(%rip), %xmm4 - imulq $-8, %r14, %r10 - leaq (%rax,%r11,8), %r9 - addq %r13, %r11 - movl $1, %ecx - leaq (%r10,%r11,8), %r15 - leaq (%r10,%r13,8), %rdi - movl %r12d, %r11d - addq %rax, %r15 - addq %rax, %rdi - andl $7, %r11d - vmovsd %xmm4, (%r9) - vmovsd %xmm4, (%r15) - movq $0x000000000, (%rax) - movq $0x000000000, (%rdi) - cmpl $1, %r12d - jl .L45 - testl %r11d, %r11d - je .L25 - cmpl $1, %r11d - je .L206 - cmpl $2, %r11d - je .L207 - cmpl $3, %r11d - je .L208 - cmpl $4, %r11d - je .L209 - cmpl $5, %r11d - je .L210 - cmpl $6, %r11d - je .L211 - vmovsd %xmm4, 8(%r9) - movl $2, %ecx - vmovsd %xmm4, 8(%r15) - movq $0x000000000, 8(%rax) - movq $0x000000000, 8(%rdi) -.L211: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx -.L210: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx -.L209: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx -.L208: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx -.L207: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx -.L206: - vmovsd %xmm4, (%r9,%rcx,8) - vmovsd %xmm4, (%r15,%rcx,8) - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - incq %rcx - cmpl %ecx, %r12d - jl .L45 -.L25: - leaq 1(%rcx), %rsi - vmovsd %xmm4, (%r9,%rcx,8) - leaq 2(%rcx), %r10 - vmovsd %xmm4, 
(%r15,%rcx,8) - leaq 3(%rcx), %r11 - movq $0x000000000, (%rax,%rcx,8) - movq $0x000000000, (%rdi,%rcx,8) - vmovsd %xmm4, (%r9,%rsi,8) - vmovsd %xmm4, (%r15,%rsi,8) - movq $0x000000000, (%rax,%rsi,8) - movq $0x000000000, (%rdi,%rsi,8) - leaq 4(%rcx), %rsi - vmovsd %xmm4, (%r9,%r10,8) - vmovsd %xmm4, (%r15,%r10,8) - movq $0x000000000, (%rax,%r10,8) - movq $0x000000000, (%rdi,%r10,8) - leaq 5(%rcx), %r10 - vmovsd %xmm4, (%r9,%r11,8) - vmovsd %xmm4, (%r15,%r11,8) - movq $0x000000000, (%rax,%r11,8) - movq $0x000000000, (%rdi,%r11,8) - leaq 6(%rcx), %r11 - vmovsd %xmm4, (%r9,%rsi,8) - vmovsd %xmm4, (%r15,%rsi,8) - movq $0x000000000, (%rax,%rsi,8) - movq $0x000000000, (%rdi,%rsi,8) - leaq 7(%rcx), %rsi - addq $8, %rcx - vmovsd %xmm4, (%r9,%r10,8) - vmovsd %xmm4, (%r15,%r10,8) - movq $0x000000000, (%rax,%r10,8) - movq $0x000000000, (%rdi,%r10,8) - vmovsd %xmm4, (%r9,%r11,8) - vmovsd %xmm4, (%r15,%r11,8) - movq $0x000000000, (%rax,%r11,8) - movq $0x000000000, (%rdi,%r11,8) - vmovsd %xmm4, (%r9,%rsi,8) - vmovsd %xmm4, (%r15,%rsi,8) - movq $0x000000000, (%rax,%rsi,8) - movq $0x000000000, (%rdi,%rsi,8) - cmpl %ecx, %r12d - jge .L25 -.L45: - incl %edx - jmp .L10 -.L282: - movl $.LC1, %edi - xorl %eax, %eax - call _gfortran_runtime_error -.L38: - leal -1(%r12), %r8d - xorl %edx, %edx - movl %r8d, 32(%rsp) - leal -2(%r12), %r8d - jmp .L10 -.L283: - movl $.LC2, %edi - call _gfortran_os_error - .cfi_endproc -.LFE0: - .size MAIN__, .-MAIN__ - .section .text.startup,"ax",@progbits - .p2align 4 - .globl main - .type main, @function -main: -.LFB1: - .cfi_startproc - subq $8, %rsp - .cfi_def_cfa_offset 16 - call _gfortran_set_args - movl $options.9.4008, %esi - movl $7, %edi - call _gfortran_set_options - call MAIN__ - .cfi_endproc -.LFE1: - .size main, .-main - .section .rodata - .align 16 - .type options.9.4008, @object - .size options.9.4008, 28 -options.9.4008: - .long 2116 - .long 4095 - .long 0 - .long 1 - .long 1 - .long 0 - .long 31 - .section .rodata.cst16,"aM",@progbits,16 - 
.align 16 -.LC4: - .long 0 - .long 1072693248 - .long 0 - .long 1072693248 - .section .rodata.cst8,"aM",@progbits,8 - .align 8 -.LC5: - .long 0 - .long 1072693248 - .align 8 -.LC6: - .long 0 - .long 1070596096 - .align 8 -.LC7: - .long 2576980378 - .long 1070176665 - .align 8 -.LC10: - .long 2696277389 - .long 1051772663 - .ident "GCC: (GNU) 9.1.0" - .section .note.GNU-stack,"",@progbits + # OSACA-END