mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-15 16:40:05 +01:00
646 lines
18 KiB
ArmAsm
646 lines
18 KiB
ArmAsm
.text
|
|
.file "triad.c"
|
|
// Read-only pool of 8-byte constants. Each .xword is the bit pattern of the
// double given in its trailing comment. In this listing they are loaded only
// by triad (via adrp + ldr :lo12:).
.section .rodata.cst8,"aM",@progbits,8
|
|
.p2align 3 // -- Begin function triad
|
|
// Elapsed-time threshold: triad's timing loops repeat while the measured
// interval compares below this value.
.LCPI0_0:
|
|
.xword 4596373779694328218 // double 0.20000000000000001
|
|
// Divisor used in triad's report code (size column once, rate column twice).
.LCPI0_1:
|
|
.xword 4652007308841189376 // double 1000
|
|
// Scale factor multiplied into the 2*n*repeat term in triad's report code.
.LCPI0_2:
|
|
.xword 4517329193108106637 // double 9.9999999999999995E-7
|
|
// Multiplied with repeat*n in triad's report code (the banner string in this
// file says "32 byte/it").
.LCPI0_3:
|
|
.xword 4629700416936869888 // double 32
|
|
// Applied twice to the byte-rate term in triad's report code.
.LCPI0_4:
|
|
.xword 4562146422526312448 // double 9.765625E-4
|
|
.text
|
|
// triad -- benchmark driver for one problem size.
//   In:     w0 = element count n (copied to w19)
//   Calls:  aligned_alloc (x4), timing, dummy, printf, free (x4)
//   Frame:  224 bytes; spills d8, x19-x28, x29, x30; x29 = sp + 208
//   Buffers (each n*8 bytes, requested with alignment 64):
//     x20       = 1st allocation (the array the compute loops store to)
//     [sp, #88] = 2nd allocation (the addend in the compute loops)
//     x22, x23  = 3rd / 4th allocation (the two factors in the compute loops)
// NOTE(review): none of the four aligned_alloc results is tested for NULL
// before being dereferenced.
.globl triad
|
|
.p2align 6
|
|
.type triad,@function
|
|
triad: // @triad
|
|
.cfi_startproc
|
|
// %bb.0:
|
|
sub sp, sp, #224 // =224
|
|
str d8, [sp, #112] // 8-byte Folded Spill
|
|
stp x28, x27, [sp, #128] // 16-byte Folded Spill
|
|
stp x26, x25, [sp, #144] // 16-byte Folded Spill
|
|
stp x24, x23, [sp, #160] // 16-byte Folded Spill
|
|
stp x22, x21, [sp, #176] // 16-byte Folded Spill
|
|
stp x20, x19, [sp, #192] // 16-byte Folded Spill
|
|
stp x29, x30, [sp, #208] // 16-byte Folded Spill
|
|
add x29, sp, #208 // =208
|
|
// CFA = x29 + 16 = entry sp; the offsets below mirror the spill slots above.
.cfi_def_cfa w29, 16
|
|
.cfi_offset w30, -8
|
|
.cfi_offset w29, -16
|
|
.cfi_offset w19, -24
|
|
.cfi_offset w20, -32
|
|
.cfi_offset w21, -40
|
|
.cfi_offset w22, -48
|
|
.cfi_offset w23, -56
|
|
.cfi_offset w24, -64
|
|
.cfi_offset w25, -72
|
|
.cfi_offset w26, -80
|
|
.cfi_offset w27, -88
|
|
.cfi_offset w28, -96
|
|
.cfi_offset b8, -112
|
|
mov w19, w0 // w19 = n, live for the whole function
|
|
orr w0, wzr, #0x40 // first argument = 64
|
|
sbfiz x23, x19, #3, #32 // x23 = sign-extended n * 8 (byte size)
|
|
mov x1, x23
|
|
bl aligned_alloc
|
|
mov x20, x0 // 1st buffer
|
|
orr w0, wzr, #0x40
|
|
mov x1, x23
|
|
bl aligned_alloc
|
|
str x0, [sp, #88] // 8-byte Folded Spill (2nd buffer)
|
|
orr w0, wzr, #0x40
|
|
mov x1, x23
|
|
bl aligned_alloc
|
|
mov x22, x0 // 3rd buffer
|
|
orr w0, wzr, #0x40
|
|
mov x1, x23
|
|
bl aligned_alloc
|
|
mov x23, x0 // 4th buffer (x23 no longer holds the byte size)
|
|
cmp w19, #0 // =0
|
|
b.le .LBB0_3 // n <= 0: skip initialisation and array work
|
|
// %bb.1:
|
|
mov w24, w19 // x24 = n zero-extended to 64 bits
|
|
cmp w19, #7 // =7
|
|
b.hi .LBB0_9 // n > 7: vectorised initialisation
|
|
// %bb.2:
|
|
mov x8, xzr // n in 1..7: scalar initialisation from index 0
|
|
b .LBB0_17
|
|
// Timing loop taken when n <= 0: there is no array work, so each pass only
// performs the guarded dummy() call w25 times. w25 doubles after every pass
// and passes repeat while the measured interval is below the value in d8.
// Timestamps: start at [x29, #-88], end at [sp, #104]; [sp, #96] is the
// second argument handed to timing() and is not read here.
.LBB0_3:
|
|
adrp x8, .LCPI0_0
|
|
orr w25, wzr, #0x1 // repeat count starts at 1
|
|
ldr d8, [x8, :lo12:.LCPI0_0] // d8 = 0.2 threshold
|
|
.p2align 6
|
|
.LBB0_4: // =>This Loop Header: Depth=1
|
|
// Child Loop BB0_5 Depth 2
|
|
sub x0, x29, #88 // =88
|
|
add x1, sp, #96 // =96
|
|
bl timing
|
|
mov w21, w25 // w21 = remaining inner iterations
|
|
cbz w25, .LBB0_8 // repeat count of 0: skip the inner loop
|
|
.p2align 6
|
|
.LBB0_5: // Parent Loop BB0_4 Depth=1
|
|
// => This Inner Loop Header: Depth=2
|
|
ldr d0, [x20] // first element of the 1st buffer
|
|
fcmp d0, #0.0
|
|
b.le .LBB0_7 // call dummy only when the element is > 0
|
|
// %bb.6: // in Loop: Header=BB0_5 Depth=2
|
|
mov x0, x20
|
|
bl dummy
|
|
.LBB0_7: // in Loop: Header=BB0_5 Depth=2
|
|
subs w21, w21, #1 // =1
|
|
b.ne .LBB0_5
|
|
.LBB0_8: // in Loop: Header=BB0_4 Depth=1
|
|
add x0, sp, #104 // =104
|
|
add x1, sp, #96 // =96
|
|
bl timing
|
|
ldr d0, [sp, #104]
|
|
ldur d1, [x29, #-88]
|
|
fsub d1, d0, d1 // d1 = end - start
|
|
lsl w25, w25, #1 // double the repeat count for the next pass
|
|
fcmp d1, d8
|
|
b.mi .LBB0_4 // interval below threshold: run another pass
|
|
b .LBB0_38 // otherwise report (d1 and w25 are consumed there)
|
|
// Buffer initialisation: every element of all four buffers is set to the
// 64-bit pattern 0x3E80867A71B56E7E (assembled with mov/movk below).
// Three stages:
//   .LBB0_12  32 elements per buffer per iteration (256 bytes each)
//   .LBB0_15   8 elements per buffer per iteration (64 bytes each)
//   .LBB0_18   1 element per buffer per iteration (scalar tail)
// On entry at .LBB0_9: x24 = n (n > 7). On entry at .LBB0_17: x8 = first
// element index still to be written.
.LBB0_9:
|
|
and x8, x24, #0xfffffff8 // x8 = n rounded down to a multiple of 8
|
|
sub x10, x8, #8 // =8
|
|
lsr x11, x10, #3 // x11 = (number of 8-element groups) - 1
|
|
add w9, w11, #1 // =1
|
|
and x9, x9, #0x3 // x9 = groups mod 4, left for .LBB0_15
|
|
cmp x10, #24 // =24
|
|
b.hs .LBB0_11 // at least 4 groups: use the 32-element loop
|
|
// %bb.10:
|
|
orr w13, wzr, #0x20 // byte offset 32: .LBB0_15 stores at [p-32, p+32)
|
|
cbnz x9, .LBB0_14
|
|
b .LBB0_16
|
|
.LBB0_11:
|
|
mov x16, #28286
|
|
movk x16, #29109, lsl #16
|
|
ldr x15, [sp, #88] // 8-byte Folded Reload
|
|
movk x16, #34426, lsl #32
|
|
movk x16, #16000, lsl #48
|
|
dup v0.2d, x16 // both lanes = fill pattern
|
|
mvn x11, x11
|
|
mov x10, xzr // x10 = elements written so far
|
|
add x11, x9, x11 // x11 = -(groups - groups mod 4), counts up to 0
|
|
// Pointers are biased by +128 so the stores below span [-128, +128).
add x12, x23, #128 // =128
|
|
add x13, x20, #128 // =128
|
|
add x14, x22, #128 // =128
|
|
add x15, x15, #128 // =128
|
|
.p2align 6
|
|
.LBB0_12: // =>This Inner Loop Header: Depth=1
|
|
stp q0, q0, [x12]
|
|
stp q0, q0, [x12, #-128]
|
|
stp q0, q0, [x12, #32]
|
|
stp q0, q0, [x12, #-96]
|
|
stp q0, q0, [x14]
|
|
add x10, x10, #32 // =32
|
|
stp q0, q0, [x14, #-128]
|
|
stp q0, q0, [x14, #32]
|
|
stp q0, q0, [x14, #-96]
|
|
stp q0, q0, [x15]
|
|
stp q0, q0, [x15, #-128]
|
|
stp q0, q0, [x15, #32]
|
|
stp q0, q0, [x15, #-96]
|
|
stp q0, q0, [x13]
|
|
stp q0, q0, [x13, #-128]
|
|
stp q0, q0, [x13, #32]
|
|
stp q0, q0, [x13, #-96]
|
|
stp q0, q0, [x12, #64]
|
|
stp q0, q0, [x12, #-64]
|
|
stp q0, q0, [x12, #96]
|
|
stp q0, q0, [x12, #-32]
|
|
add x12, x12, #256 // =256
|
|
stp q0, q0, [x14, #64]
|
|
stp q0, q0, [x14, #-64]
|
|
stp q0, q0, [x14, #96]
|
|
stp q0, q0, [x14, #-32]
|
|
add x14, x14, #256 // =256
|
|
stp q0, q0, [x15, #64]
|
|
stp q0, q0, [x15, #-64]
|
|
stp q0, q0, [x15, #96]
|
|
stp q0, q0, [x15, #-32]
|
|
add x15, x15, #256 // =256
|
|
stp q0, q0, [x13, #64]
|
|
stp q0, q0, [x13, #-64]
|
|
stp q0, q0, [x13, #96]
|
|
stp q0, q0, [x13, #-32]
|
|
add x13, x13, #256 // =256
|
|
adds x11, x11, #4 // =4
|
|
b.ne .LBB0_12
|
|
// %bb.13:
|
|
lsl x10, x10, #3 // elements written -> byte offset
|
|
orr x13, x10, #0x20 // byte offset + 32 (x10 is a multiple of 256 here)
|
|
cbz x9, .LBB0_16 // no leftover 8-element groups
|
|
.LBB0_14:
|
|
ldr x14, [sp, #88] // 8-byte Folded Reload
|
|
add x10, x23, x13
|
|
add x11, x22, x13
|
|
add x12, x20, x13
|
|
add x13, x14, x13
|
|
mov x14, #28286
|
|
movk x14, #29109, lsl #16
|
|
movk x14, #34426, lsl #32
|
|
movk x14, #16000, lsl #48
|
|
dup v0.2d, x14 // rebuild the fill vector (this block is also entered from %bb.10)
|
|
neg x9, x9 // negative group count, counts up to 0
|
|
.p2align 6
|
|
.LBB0_15: // =>This Inner Loop Header: Depth=1
|
|
stp q0, q0, [x10]
|
|
stp q0, q0, [x11]
|
|
stp q0, q0, [x10, #-32]
|
|
stp q0, q0, [x13]
|
|
stp q0, q0, [x11, #-32]
|
|
add x10, x10, #64 // =64
|
|
stp q0, q0, [x12]
|
|
stp q0, q0, [x13, #-32]
|
|
add x11, x11, #64 // =64
|
|
stp q0, q0, [x12, #-32]
|
|
add x12, x12, #64 // =64
|
|
add x13, x13, #64 // =64
|
|
adds x9, x9, #1 // =1
|
|
b.ne .LBB0_15
|
|
.LBB0_16:
|
|
cmp x8, x24
|
|
b.eq .LBB0_19 // n is a multiple of 8: no scalar tail
|
|
.LBB0_17:
|
|
ldr x10, [sp, #88] // 8-byte Folded Reload
|
|
mov x13, #28286
|
|
movk x13, #29109, lsl #16
|
|
lsl x12, x8, #3 // byte offset of the first remaining element
|
|
movk x13, #34426, lsl #32
|
|
add x9, x20, x12
|
|
movk x13, #16000, lsl #48
|
|
add x10, x10, x12
|
|
add x11, x22, x12
|
|
add x12, x23, x12
|
|
sub x8, x24, x8 // x8 = number of remaining elements
|
|
.p2align 6
|
|
.LBB0_18: // =>This Inner Loop Header: Depth=1
|
|
str x13, [x12], #8
|
|
str x13, [x11], #8
|
|
str x13, [x10], #8
|
|
str x13, [x9], #8
|
|
subs x8, x8, #1 // =1
|
|
b.ne .LBB0_18
|
|
// Loop-invariant setup for the timed compute loops. The values are spilled
// because the compute loops use x8-x13 as scratch.
// Stack slots written here:
//   [sp, #8]   negative count of 8-element groups handled by the 64-element kernel
//   [sp, #16]  2nd buffer + 256      [sp, #24]  x22 buffer + 256
//   [sp, #32]  x23 buffer + 256      [sp, #40]  x20 buffer + 256
//   [sp, #48]  x23 buffer + 32       [sp, #56]  x20 buffer + 32
//   [sp, #64]  x22 buffer + 32       [sp, #72]  2nd buffer + 32
//   [sp, #80]  -(groups mod 8), counter for the 8-element remainder loop
// Registers set here: x26 = n rounded down to a multiple of 8, x27 = x26 - 8,
// x28 = groups mod 8, w25 = repeat count (1), d8 = 0.2 threshold.
.LBB0_19:
|
|
ldr x10, [sp, #88] // 8-byte Folded Reload
|
|
add x8, x20, #256 // =256
|
|
and x26, x24, #0xfffffff8
|
|
str x8, [sp, #40] // 8-byte Folded Spill
|
|
add x8, x23, #256 // =256
|
|
sub x27, x26, #8 // =8
|
|
str x8, [sp, #32] // 8-byte Folded Spill
|
|
add x8, x22, #256 // =256
|
|
orr w25, wzr, #0x1
|
|
str x8, [sp, #24] // 8-byte Folded Spill
|
|
add x8, x10, #256 // =256
|
|
str x8, [sp, #16] // 8-byte Folded Spill
|
|
lsr x8, x27, #3 // x8 = (number of 8-element groups) - 1
|
|
add w9, w8, #1 // =1
|
|
mvn x8, x8
|
|
and x28, x9, #0x7
|
|
add x8, x28, x8 // = -(groups - groups mod 8)
|
|
str x8, [sp, #8] // 8-byte Folded Spill
|
|
neg x8, x28
|
|
str x8, [sp, #80] // 8-byte Folded Spill
|
|
add x8, x10, #32 // =32
|
|
str x8, [sp, #72] // 8-byte Folded Spill
|
|
add x8, x22, #32 // =32
|
|
str x8, [sp, #64] // 8-byte Folded Spill
|
|
add x8, x20, #32 // =32
|
|
str x8, [sp, #56] // 8-byte Folded Spill
|
|
add x8, x23, #32 // =32
|
|
str x8, [sp, #48] // 8-byte Folded Spill
|
|
adrp x8, .LCPI0_0
|
|
ldr d8, [x8, :lo12:.LCPI0_0]
|
|
// Headers of the timed loop nest.
//   .LBB0_20 (depth 1): takes the start timestamp; one pass per value of w25.
//   .LBB0_22 (depth 2): runs w25 times per pass (w21 counts up from 0); each
//                       iteration performs one full sweep over the arrays.
// Per sweep the code selects:
//   n <= 7                      -> scalar loop only (.LBB0_34, from index 0)
//   fewer than 8 groups of 8    -> 8-element loop (.LBB0_31) if x28 != 0
//   otherwise                   -> 64-element kernel (.LBB0_29) first
.p2align 6
|
|
.LBB0_20: // =>This Loop Header: Depth=1
|
|
// Child Loop BB0_22 Depth 2
|
|
// Child Loop BB0_29 Depth 3
|
|
// Child Loop BB0_32 Depth 3
|
|
// Child Loop BB0_35 Depth 3
|
|
sub x0, x29, #88 // =88
|
|
add x1, sp, #96 // =96
|
|
bl timing
|
|
cbz w25, .LBB0_37 // repeat count of 0: no sweeps in this pass
|
|
// %bb.21: // in Loop: Header=BB0_20 Depth=1
|
|
mov w21, wzr // w21 = sweeps completed in this pass
|
|
.p2align 6
|
|
.LBB0_22: // Parent Loop BB0_20 Depth=1
|
|
// => This Loop Header: Depth=2
|
|
// Child Loop BB0_29 Depth 3
|
|
// Child Loop BB0_32 Depth 3
|
|
// Child Loop BB0_35 Depth 3
|
|
ldr d0, [x20] // first element of the destination buffer
|
|
fcmp d0, #0.0
|
|
b.le .LBB0_24 // call dummy only when the element is > 0
|
|
// %bb.23: // in Loop: Header=BB0_22 Depth=2
|
|
mov x0, x20
|
|
bl dummy
|
|
.LBB0_24: // in Loop: Header=BB0_22 Depth=2
|
|
cmp w19, #7 // =7
|
|
b.hi .LBB0_26
|
|
// %bb.25: // in Loop: Header=BB0_22 Depth=2
|
|
mov x12, xzr // scalar loop starts at element 0
|
|
b .LBB0_34
|
|
.p2align 6
|
|
.LBB0_26: // in Loop: Header=BB0_22 Depth=2
|
|
cmp x27, #56 // =56
|
|
b.hs .LBB0_28 // at least 8 groups of 8: run the 64-element kernel
|
|
// %bb.27: // in Loop: Header=BB0_22 Depth=2
|
|
mov x8, xzr // no elements done yet
|
|
cbnz x28, .LBB0_31
|
|
b .LBB0_33
|
|
.p2align 6
|
|
.LBB0_28: // in Loop: Header=BB0_22 Depth=2
|
|
// Reload the +256-biased pointers and the negative group counter.
ldp x9, x10, [sp, #16] // 8-byte Folded Reload
|
|
ldp x11, x12, [sp, #32] // 8-byte Folded Reload
|
|
ldr x13, [sp, #8] // 8-byte Folded Reload
|
|
mov x8, xzr // x8 = elements processed by the kernel so far
|
|
// 64-element kernel, the region delimited by the OSACA START/END markers.
// Per iteration and per 128-bit lane pair: dst = addend + factor1 * factor2.
//   x9  = addend pointer  (2nd buffer, biased +256)
//   x10 = factor pointer  (x22 buffer, biased +256)
//   x11 = factor pointer  (x23 buffer, biased +256)
//   x12 = destination     (x20 buffer, biased +256)
//   x13 = negative count of 8-element groups, advanced by 8 per iteration
//   x8  = elements processed, advanced by 64 per iteration
// Each iteration touches byte offsets -256..+255 of every pointer, i.e.
// 512 bytes (64 doubles) per array, then advances all four pointers by 512.
// The markers overwrite x1; x1 is set again before the next call that uses it.
// Because the marker (a 4-byte mov plus 4 raw bytes) follows the .p2align 6,
// the loop header .LBB0_29 lies 8 bytes past the 64-byte boundary.
.p2align 6
|
|
mov x1, #111 // OSACA START
|
|
.byte 213,3,32,31 // OSACA START
|
|
.LBB0_29: // Parent Loop BB0_20 Depth=1
|
|
// Parent Loop BB0_22 Depth=2
|
|
// => This Inner Loop Header: Depth=3
|
|
ldp q2, q5, [x10, #-256]
|
|
ldp q6, q7, [x10, #-224]
|
|
ldp q16, q17, [x11, #-256]
|
|
ldp q18, q19, [x11, #-224]
|
|
fmul v2.2d, v2.2d, v16.2d
|
|
fmul v5.2d, v5.2d, v17.2d
|
|
fmul v6.2d, v6.2d, v18.2d
|
|
ldp q0, q1, [x9, #-256]
|
|
ldp q3, q4, [x9, #-224]
|
|
fmul v7.2d, v7.2d, v19.2d
|
|
fadd v0.2d, v0.2d, v2.2d
|
|
fadd v2.2d, v1.2d, v5.2d
|
|
stp q0, q2, [x12, #-256]
|
|
fadd v1.2d, v3.2d, v6.2d
|
|
ldp q6, q17, [x10, #-192]
|
|
ldp q18, q19, [x10, #-160]
|
|
ldp q20, q21, [x11, #-192]
|
|
ldp q22, q23, [x11, #-160]
|
|
fmul v6.2d, v6.2d, v20.2d
|
|
fmul v17.2d, v17.2d, v21.2d
|
|
fmul v18.2d, v18.2d, v22.2d
|
|
fadd v3.2d, v4.2d, v7.2d
|
|
stp q1, q3, [x12, #-224]
|
|
ldp q4, q5, [x9, #-192]
|
|
ldp q7, q16, [x9, #-160]
|
|
fmul v19.2d, v19.2d, v23.2d
|
|
fadd v4.2d, v4.2d, v6.2d
|
|
fadd v6.2d, v5.2d, v17.2d
|
|
stp q4, q6, [x12, #-192]
|
|
fadd v5.2d, v7.2d, v18.2d
|
|
ldp q18, q21, [x10, #-128]
|
|
ldp q22, q23, [x10, #-96]
|
|
ldp q24, q25, [x11, #-128]
|
|
ldp q26, q27, [x11, #-96]
|
|
fmul v18.2d, v18.2d, v24.2d
|
|
fmul v21.2d, v21.2d, v25.2d
|
|
fmul v22.2d, v22.2d, v26.2d
|
|
fadd v7.2d, v16.2d, v19.2d
|
|
stp q5, q7, [x12, #-160]
|
|
ldp q16, q17, [x9, #-128]
|
|
ldp q19, q20, [x9, #-96]
|
|
fadd v16.2d, v16.2d, v18.2d
|
|
fadd v18.2d, v17.2d, v21.2d
|
|
stp q16, q18, [x12, #-128]
|
|
fadd v17.2d, v19.2d, v22.2d
|
|
ldp q22, q25, [x10, #-64]
|
|
ldp q28, q29, [x11, #-64]
|
|
fmul v23.2d, v23.2d, v27.2d
|
|
ldp q26, q27, [x10, #-32]
|
|
fmul v22.2d, v22.2d, v28.2d
|
|
fmul v25.2d, v25.2d, v29.2d
|
|
ldp q28, q29, [x11, #-32]
|
|
fmul v26.2d, v26.2d, v28.2d
|
|
fmul v27.2d, v27.2d, v29.2d
|
|
fadd v19.2d, v20.2d, v23.2d
|
|
stp q17, q19, [x12, #-96]
|
|
ldp q20, q21, [x9, #-64]
|
|
ldp q23, q24, [x9, #-32]
|
|
fadd v20.2d, v20.2d, v22.2d
|
|
fadd v22.2d, v21.2d, v25.2d
|
|
stp q20, q22, [x12, #-64]
|
|
fadd v21.2d, v23.2d, v26.2d
|
|
fadd v23.2d, v24.2d, v27.2d
|
|
stp q21, q23, [x12, #-32]
|
|
ldp q24, q25, [x10]
|
|
ldp q28, q29, [x11]
|
|
ldp q26, q27, [x10, #32]
|
|
fmul v24.2d, v24.2d, v28.2d
|
|
fmul v25.2d, v25.2d, v29.2d
|
|
ldp q28, q29, [x11, #32]
|
|
fmul v26.2d, v26.2d, v28.2d
|
|
fmul v27.2d, v27.2d, v29.2d
|
|
ldp q28, q29, [x9]
|
|
fadd v24.2d, v28.2d, v24.2d
|
|
fadd v25.2d, v29.2d, v25.2d
|
|
stp q24, q25, [x12]
|
|
ldp q28, q29, [x9, #32]
|
|
fadd v26.2d, v28.2d, v26.2d
|
|
fadd v27.2d, v29.2d, v27.2d
|
|
stp q26, q27, [x12, #32]
|
|
ldp q24, q25, [x10, #64]
|
|
ldp q28, q29, [x11, #64]
|
|
ldp q26, q27, [x10, #96]
|
|
fmul v24.2d, v24.2d, v28.2d
|
|
fmul v25.2d, v25.2d, v29.2d
|
|
ldp q28, q29, [x11, #96]
|
|
fmul v26.2d, v26.2d, v28.2d
|
|
fmul v27.2d, v27.2d, v29.2d
|
|
ldp q28, q29, [x9, #64]
|
|
fadd v24.2d, v28.2d, v24.2d
|
|
fadd v25.2d, v29.2d, v25.2d
|
|
stp q24, q25, [x12, #64]
|
|
ldp q28, q29, [x9, #96]
|
|
fadd v26.2d, v28.2d, v26.2d
|
|
fadd v27.2d, v29.2d, v27.2d
|
|
stp q26, q27, [x12, #96]
|
|
ldp q24, q25, [x10, #128]
|
|
ldp q28, q29, [x11, #128]
|
|
ldp q26, q27, [x10, #160]
|
|
fmul v24.2d, v24.2d, v28.2d
|
|
fmul v25.2d, v25.2d, v29.2d
|
|
ldp q28, q29, [x11, #160]
|
|
fmul v26.2d, v26.2d, v28.2d
|
|
fmul v27.2d, v27.2d, v29.2d
|
|
ldp q28, q29, [x9, #128]
|
|
fadd v24.2d, v28.2d, v24.2d
|
|
fadd v25.2d, v29.2d, v25.2d
|
|
stp q24, q25, [x12, #128]
|
|
ldp q28, q29, [x9, #160]
|
|
fadd v26.2d, v28.2d, v26.2d
|
|
fadd v27.2d, v29.2d, v27.2d
|
|
stp q26, q27, [x12, #160]
|
|
ldp q24, q25, [x10, #192]
|
|
ldp q26, q27, [x11, #192]
|
|
fmul v24.2d, v24.2d, v26.2d
|
|
ldp q26, q28, [x10, #224]
|
|
fmul v25.2d, v25.2d, v27.2d
|
|
ldp q27, q0, [x11, #224]
|
|
fmul v2.2d, v26.2d, v27.2d
|
|
fmul v0.2d, v28.2d, v0.2d
|
|
ldp q1, q3, [x9, #192]
|
|
ldp q4, q5, [x9, #224]
|
|
fadd v1.2d, v1.2d, v24.2d
|
|
fadd v3.2d, v3.2d, v25.2d
|
|
stp q1, q3, [x12, #192]
|
|
fadd v2.2d, v4.2d, v2.2d
|
|
fadd v0.2d, v5.2d, v0.2d
|
|
stp q2, q0, [x12, #224]
|
|
// Advance: 64 elements done, all pointers move 512 bytes, 8 groups consumed.
add x8, x8, #64 // =64
|
|
add x12, x12, #512 // =512
|
|
add x11, x11, #512 // =512
|
|
add x10, x10, #512 // =512
|
|
add x9, x9, #512 // =512
|
|
adds x13, x13, #8 // =8
|
|
b.ne .LBB0_29 // loop until the negative counter reaches 0
|
|
mov x1, #222 // OSACA END
|
|
.byte 213,3,32,31 // OSACA END
|
|
// 8-element vector remainder: handles the x28 groups of 8 elements left over
// after the 64-element kernel (or all groups when the kernel was skipped).
// On entry x8 = number of elements already processed in this sweep.
// Same arithmetic as the kernel: dst = addend + factor1 * factor2.
// %bb.30: // in Loop: Header=BB0_22 Depth=2
|
|
cbz x28, .LBB0_33 // no leftover groups
|
|
.LBB0_31: // in Loop: Header=BB0_22 Depth=2
|
|
lsl x11, x8, #3 // byte offset of the first unprocessed element
|
|
// Reload the +32-biased bases: x9 = x22 buffer, x8 = 2nd buffer,
// x12 = x23 buffer, x10 = x20 buffer.
ldp x9, x8, [sp, #64] // 8-byte Folded Reload
|
|
ldp x12, x10, [sp, #48] // 8-byte Folded Reload
|
|
add x8, x8, x11 // addend pointer
|
|
add x9, x9, x11 // factor pointer
|
|
add x10, x10, x11 // destination pointer
|
|
add x11, x12, x11 // factor pointer
|
|
ldr x12, [sp, #80] // 8-byte Folded Reload
|
|
.p2align 6
|
|
.LBB0_32: // Parent Loop BB0_20 Depth=1
|
|
// Parent Loop BB0_22 Depth=2
|
|
// => This Inner Loop Header: Depth=3
|
|
ldp q4, q5, [x9, #-32]
|
|
ldp q6, q7, [x9], #64 // post-increment: pointer advances 64 bytes
|
|
ldp q16, q17, [x11, #-32]
|
|
ldp q18, q19, [x11], #64
|
|
fmul v4.2d, v4.2d, v16.2d
|
|
fmul v5.2d, v5.2d, v17.2d
|
|
fmul v6.2d, v6.2d, v18.2d
|
|
fmul v7.2d, v7.2d, v19.2d
|
|
ldp q0, q1, [x8, #-32]
|
|
ldp q2, q3, [x8], #64
|
|
fadd v0.2d, v0.2d, v4.2d
|
|
fadd v1.2d, v1.2d, v5.2d
|
|
stp q0, q1, [x10, #-32]
|
|
fadd v2.2d, v2.2d, v6.2d
|
|
fadd v3.2d, v3.2d, v7.2d
|
|
stp q2, q3, [x10]
|
|
add x10, x10, #64 // =64
|
|
adds x12, x12, #1 // =1
|
|
b.ne .LBB0_32 // x12 counts up from -(groups mod 8) to 0
|
|
// Scalar tail and loop closing.
//   .LBB0_33: vector paths end here; x26 = first index not covered by them.
//   .LBB0_34: entry with x12 = first element index still to process.
//   .LBB0_35: one element per iteration: dst[i] = addend[i] + f1[i] * f2[i].
//   .LBB0_36: next sweep while w21 != w25.
//   .LBB0_37: end timestamp; double w25 and start another pass while the
//             measured interval is below the threshold in d8.
.LBB0_33: // in Loop: Header=BB0_22 Depth=2
|
|
mov x12, x26
|
|
cmp x26, x24
|
|
b.eq .LBB0_36 // n is a multiple of 8: nothing left
|
|
.LBB0_34: // in Loop: Header=BB0_22 Depth=2
|
|
ldr x8, [sp, #88] // 8-byte Folded Reload
|
|
lsl x11, x12, #3 // byte offset of the first remaining element
|
|
sub x12, x24, x12 // x12 = number of remaining elements
|
|
add x8, x8, x11 // addend pointer (2nd buffer)
|
|
add x9, x22, x11 // factor pointer
|
|
add x10, x23, x11 // factor pointer
|
|
add x11, x20, x11 // destination pointer
|
|
.p2align 6
|
|
.LBB0_35: // Parent Loop BB0_20 Depth=1
|
|
// Parent Loop BB0_22 Depth=2
|
|
// => This Inner Loop Header: Depth=3
|
|
ldr d0, [x8], #8
|
|
ldr d1, [x9], #8
|
|
ldr d2, [x10], #8
|
|
fmul d1, d1, d2
|
|
fadd d0, d0, d1
|
|
str d0, [x11], #8
|
|
subs x12, x12, #1 // =1
|
|
b.ne .LBB0_35
|
|
.LBB0_36: // in Loop: Header=BB0_22 Depth=2
|
|
add w21, w21, #1 // =1
|
|
cmp w21, w25
|
|
b.ne .LBB0_22
|
|
.LBB0_37: // in Loop: Header=BB0_20 Depth=1
|
|
add x0, sp, #104 // =104
|
|
add x1, sp, #96 // =96
|
|
bl timing
|
|
ldr d0, [sp, #104]
|
|
ldur d1, [x29, #-88]
|
|
fsub d1, d0, d1 // d1 = end - start
|
|
lsl w25, w25, #1 // double the repeat count for the next pass
|
|
fcmp d1, d8
|
|
b.mi .LBB0_20 // interval below threshold: run another pass
|
|
// Report and teardown. On entry: d1 = interval measured by the last pass,
// w25 = repeat count already doubled once past the last pass, w19 = n.
// Arguments built for printf(.L.str, ...):
//   d0 = n * 8 / 1000
//   d1 = measured interval (passed through unchanged)
//   d2 = 2 * n * repeat * 1e-6 / interval
//   d3 = 32 * repeat * n / interval / 1024 / 1024
//   d4 = n * repeat / interval / 1000 / 1000
//   w1 = repeat (w25 >> 1), w2 = n
// Afterwards the four buffers are freed and the callee-saved registers
// spilled in the prologue are restored.
.LBB0_38:
|
|
scvtf d4, w19 // d4 = (double)n
|
|
lsr w1, w25, #1 // undo the final doubling of the repeat count
|
|
adrp x8, .LCPI0_1
|
|
scvtf d6, w1 // d6 = (double)repeat
|
|
fadd d2, d4, d4 // 2 * n
|
|
ldr d5, [x8, :lo12:.LCPI0_1] // d5 = 1000
|
|
adrp x8, .LCPI0_2
|
|
fmov d0, #8.00000000
|
|
fmul d2, d2, d6
|
|
ldr d3, [x8, :lo12:.LCPI0_2] // 1e-6
|
|
adrp x8, .LCPI0_3
|
|
adrp x0, .L.str
|
|
fmul d2, d2, d3
|
|
ldr d3, [x8, :lo12:.LCPI0_3] // 32
|
|
adrp x8, .LCPI0_4
|
|
add x0, x0, :lo12:.L.str // x0 = format string
|
|
fmul d3, d6, d3
|
|
fmul d0, d4, d0
|
|
fmul d3, d3, d4
|
|
fmul d4, d4, d6
|
|
fdiv d3, d3, d1
|
|
fdiv d4, d4, d1
|
|
fdiv d4, d4, d5
|
|
fdiv d0, d0, d5
|
|
fdiv d2, d2, d1
|
|
ldr d7, [x8, :lo12:.LCPI0_4] // 1/1024
|
|
fmul d3, d3, d7
|
|
fdiv d4, d4, d5
|
|
fmul d3, d3, d7
|
|
mov w2, w19
|
|
bl printf
|
|
mov x0, x20
|
|
bl free
|
|
ldr x0, [sp, #88] // 8-byte Folded Reload
|
|
bl free
|
|
mov x0, x22
|
|
bl free
|
|
mov x0, x23
|
|
bl free
|
|
ldp x29, x30, [sp, #208] // 16-byte Folded Reload
|
|
ldp x20, x19, [sp, #192] // 16-byte Folded Reload
|
|
ldp x22, x21, [sp, #176] // 16-byte Folded Reload
|
|
ldp x24, x23, [sp, #160] // 16-byte Folded Reload
|
|
ldp x26, x25, [sp, #144] // 16-byte Folded Reload
|
|
ldp x28, x27, [sp, #128] // 16-byte Folded Reload
|
|
ldr d8, [sp, #112] // 8-byte Folded Reload
|
|
add sp, sp, #224 // =224
|
|
ret
|
|
.Lfunc_end0:
|
|
.size triad, .Lfunc_end0-triad
|
|
.cfi_endproc
|
|
// -- End function
|
|
// main -- prints the two header strings (.Lstr, .Lstr.3) with puts, then
// calls triad once for each of 16 hard-coded element counts in increasing
// order (190 .. 9727), and returns 0 in w0.
// Frame: 16 bytes holding x29/x30; no other callee-saved registers are used.
.globl main // -- Begin function main
|
|
.p2align 6
|
|
.type main,@function
|
|
main: // @main
|
|
.cfi_startproc
|
|
// %bb.0:
|
|
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
|
|
mov x29, sp
|
|
.cfi_def_cfa w29, 16
|
|
.cfi_offset w30, -8
|
|
.cfi_offset w29, -16
|
|
adrp x0, .Lstr
|
|
add x0, x0, :lo12:.Lstr
|
|
bl puts // benchmark description line
|
|
adrp x0, .Lstr.3
|
|
add x0, x0, :lo12:.Lstr.3
|
|
bl puts // column header line
|
|
// One triad call per problem size; w0 carries the element count.
mov w0, #190
|
|
bl triad
|
|
mov w0, #247
|
|
bl triad
|
|
mov w0, #321
|
|
bl triad
|
|
mov w0, #417
|
|
bl triad
|
|
mov w0, #542
|
|
bl triad
|
|
mov w0, #705
|
|
bl triad
|
|
mov w0, #917
|
|
bl triad
|
|
mov w0, #1192
|
|
bl triad
|
|
mov w0, #1550
|
|
bl triad
|
|
mov w0, #2015
|
|
bl triad
|
|
mov w0, #2619
|
|
bl triad
|
|
mov w0, #3405
|
|
bl triad
|
|
mov w0, #4427
|
|
bl triad
|
|
mov w0, #5756
|
|
bl triad
|
|
mov w0, #7482
|
|
bl triad
|
|
mov w0, #9727
|
|
bl triad
|
|
mov w0, wzr // return value 0
|
|
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
|
|
ret
|
|
.Lfunc_end1:
|
|
.size main, .Lfunc_end1-main
|
|
.cfi_endproc
|
|
// String literals and trailer directives.
//   .L.str   printf format used by triad: five floating-point fields followed
//            by two integer fields.
//   .Lstr    description line printed by main.
//   .Lstr.3  column header line printed by main.
.type .L.str,@object // @.str
|
|
.section .rodata.str1.1,"aMS",@progbits,1
|
|
.L.str:
|
|
.asciz "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
|
|
.size .L.str, 53
|
|
.type .Lstr,@object // @str
|
|
.section .rodata.str1.16,"aMS",@progbits,1
|
|
.p2align 4
|
|
.Lstr:
|
|
.asciz "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
|
|
.size .Lstr, 51
|
|
.type .Lstr.3,@object // @str.3
|
|
.p2align 4
|
|
.Lstr.3:
|
|
.asciz "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
|
|
// NOTE(review): the literal above is 64 characters plus the NUL as shown
// here, but .size declares 74. Runs of padding spaces may have been collapsed
// in this copy of the file -- confirm against the upstream source.
.size .Lstr.3, 74
|
|
.ident "Arm C/C++/Fortran Compiler version 19.0 (build number 69) (based on LLVM 7.0.2)"
|
|
.section ".note.GNU-stack","",@progbits
|
|
.addrsig
|