# Operand presets to pick from in the user-input section below.
reg8 = Register('al')
reg16 = Register('ax')
reg32 = Register('eax')
reg64 = Register('rax')
xmm = Register('xmm0')
ymm = Register('ymm0')
zmm = Register('zmm0')
mem0 = MemAddr('(%rax, %esi, 4)')
imd1 = Parameter('IMD')


# -----------------------------------------------
# - USER INPUT ----------------------------------
# -----------------------------------------------
# Mnemonic of the instruction to benchmark.
mnemonic = 'vxorpd'

# Operands of the instruction; set an unused slot to None.
dst = xmm
op1 = xmm
op2 = xmm

# Number of instructions per loop, handed to Testcase as a string.
# NOTE(review): the inherited comment names 12 as the default; the Testcase
# default is not visible from this script - confirm.
per_loop = '128'

# -----------------------------------------------
# -----------------------------------------------

# Drop the unused operand slots, announce the run, then generate the testcase.
operands = list(filter(lambda candidate: candidate is not None,
                       (dst, op1, op2)))
opListStr = ', '.join(map(str, operands))
print('Create Testcase for {} {}'.format(mnemonic, opListStr), end='')
tc = Testcase(mnemonic, operands, per_loop)
tc.write_testcase()
print(' --------> SUCCEEDED')
+jl-lbl,0.0,0.0,"((5,),)" +jnge-lbl,0.0,0.0,"((5,),)" +jge-lbl,0.0,0.0,"((5,),)" +jnl-lbl,0.0,0.0,"((5,),)" +jle-lbl,0.0,0.0,"((5,),)" +jng-lbl,0.0,0.0,"((5,),)" +jg-lbl,0.0,0.0,"((5,),)" +jnle-lbl,0.0,0.0,"((5,),)" +jp-lbl,0.0,0.0,"((5,),)" +jpe-lbl,0.0,0.0,"((5,),)" +jnp-lbl,0.0,0.0,"((5,),)" +jpo-lbl,0.0,0.0,"((5,),)" +jcxz-lbl,0.0,0.0,"((5,),)" +jecxz-lbl,0.0,0.0,"((5,),)" +jo-lbl,0.0,0.0,"((5,),)" +jno-lbl,0.0,0.0,"((5,),)" +js-lbl,0.0,0.0,"((5,),)" +jns-lbl,0.0,0.0,"((5,),)" +lea-r64_mem,1.0,1.0,"((2,),(3,))" +lea-r32_mem,1.0,1.0,"((2,),(3,))" +vcvtsi2ss-xmm_xmm_r64,1.0,3.0,"((0,1),(1,5))" +vcvtsi2ss-xmm_xmm_r32,1.0,3.0,"((1,5),(0,1))" +vmulss-xmm_xmm_xmm,1.0,5.0,"((0,),)" +vaddss-xmm_xmm_mem,1.0,3.0,"((1,),)" +vaddss-xmm_xmm_xmm,1.0,3.0,"((1,),)" +vxorps-xmm_xmm_xmm,0.3333333333333333,1.0,"((0,),(1,),(5,))" +vmovss-xmm_mem,0.5,1.0,"((2,),(3,))" +vmovss-mem_xmm,1.0,1.0,"((2,4),(3,4))" +inc-r32,0.3333333333333333,1.0,"((0,),(1,),(5,))" +inc-r64,0.3333333333333333,1.0,"((0,),(1,),(5,))" +cmp-r64_imd,0.3333333333333333,1.0,"((0,),(1,),(5,))" +cmp-r32_mem,0.5,1.0,"((0,2),(0,3),(1,2),(1,3),(2,5),(3,5))" +cmp-r32_r32,0.3333333333333333,1.0,"((0,),(1,),(5,))" diff --git a/osaca/data/ivb_data_old.csv b/osaca/data/ivb_data_old.csv new file mode 100644 index 0000000..334b377 --- /dev/null +++ b/osaca/data/ivb_data_old.csv @@ -0,0 +1,92 @@ +instr,TP,LT +jmp-lbl,0.0,-1.0 +jo-lbl,0.0,-1.0 +jno-lbl,0.0,-1.0 +js-lbl,0.0,-1.0 +jns-lbl,0.0,-1.0 +je-lbl,0.0,-1.0 +jz-lbl,0.0,-1.0 +jne-lbl,0.0,-1.0 +jnz-lbl,0.0,-1.0 +jb-lbl,0.0,-1.0 +jnae-lbl,0.0,-1.0 +jc-lbl,0.0,-1.0 +jnb-lbl,0.0,-1.0 +jae-lbl,0.0,-1.0 +jnc-lbl,0.0,-1.0 +jbe-lbl,0.0,-1.0 +jna-lbl,0.0,-1.0 +ja-lbl,0.0,-1.0 +jnbe-lbl,0.0,-1.0 +jl-lbl,0.0,-1.0 +jnge-lbl,0.0,-1.0 +jge-lbl,0.0,-1.0 +jnl-lbl,0.0,-1.0 +jle-lbl,0.0,-1.0 +jng-lbl,0.0,-1.0 +jg-lbl,0.0,-1.0 +jnle-lbl,0.0,-1.0 +jp-lbl,0.0,-1.0 +jpe-lbl,0.0,-1.0 +jnp-lbl,0.0,-1.0 +jpo-lbl,0.0,-1.0 +jcxz-lbl,0.0,-1.0 +jecxz-lbl,0.0,-1.0 +jo-lbl,0.0,-1.0 +jno-lbl,0.0,-1.0 
+js-lbl,0.0,-1.0 +jns-lbl,0.0,-1.0 +vmulss-xmm_xmm_xmm,1.0,-1.0 +vaddss-xmm_xmm_xmm,1.0,-1.0 +vxorps-xmm_xmm_xmm,0.25,-1.0 +inc-r64,0.3333333333333333,-1.0 +xor-r32_r32,0.3333333333333333,-1.0 +vcvtsi2ss-xmm_xmm_r32,1.0,-1.0 +vaddss-xmm_xmm_mem,1.0,-1.0 +vmovupd-load-avx,1.0,-1.0 +lea-r32_mem,1.0,-1.0 +vmovss-xmm_mem,0.5,-1.0 +vmovss-mem_xmm,1.0,-1.0 +vmovupd-store-avx,2.0,-1.0 +lea-r64_mem,1.0,-1.0 +movslq-r64_mem,0.5,-1.0 +mov-r64_mem,0.5,-1.0 +vaddpd-ymm_ymm_ymm,1.0,-1.0 +cmp-r32_r32,0.3333333333333333,-1.0 +vmovsd-xmm_xmm_xmm,1.0,-1.0 +vmulsd-xmm_xmm_mem,1.0,-1.0 +vmovsd-mem_xmm,1.0,-1.0 +vmovhpd-xmm_xmm_mem,1.0,-1.0 +vsubpd-ymm_ymm_ymm,1.0,-1.0 +vmovq-xmm_r64,1.0,-1.0 +vunpckhpd-xmm_xmm_xmm,1.0,-1.0 +vmulpd-ymm_ymm_mem,1.0,-1.0 +mov-mem_r64,1.0,-1.0 +movzbl-r32_r8,0.29600000000000004,-1.0 +vmulsd-xmm_xmm_xmm,1.0,-1.0 +vaddsd-xmm_xmm_mem,1.0,-1.0 +vmovq-r64_xmm,1.0,-1.0 +vmulpd-ymm_ymm_ymm,1.0,-1.0 +mov-r32_mem,0.5,-1.0 +cmp-r32_mem,0.5,-1.0 +vaddpd-xmm_xmm_xmm,1.0,-1.0 +mov-mem_r32,1.0,-1.0 +vmovsd-xmm_mem,0.5,-1.0 +vsubsd-xmm_xmm_xmm,1.0,-1.0 +vmovaps-xmm_xmm,0.845,-1.0 +vaddsd-xmm_xmm_xmm,1.0,-1.0 +add-r32_mem,0.5,-1.0 +vmovupd-xmm_mem,0.5,-1.0 +test-r32_r32,0.3333333333333333,-1.0 +add-r64_r64,0.3333333333333333,-1.0 +dec-r32,0.3333333333333333,-1.0 +movslq-r64_r32,0.3333333333333333,-1.0 +vxorpd-ymm_ymm_ymm,0.25,-1.0 +sub-r32_r32,0.3333333333333333,-1.0 +inc-r32,0.3333333333333333,-1.0 +neg-r32,0.3333333333333333,-1.0 +cmp-r64_imd,0.3333333333333333,-1.0 +vxorpd-xmm_xmm_xmm,0.25,-1.0 +vmovapd-ymm_ymm,0.856,-1.0 +vmovapd-xmm_xmm,0.855,-1.0 +mov-r32_r32,0.3333333333333333,-1.0 diff --git a/osaca/data/res_ivb.dat b/osaca/data/res_ivb.dat new file mode 100644 index 0000000..8305692 --- /dev/null +++ b/osaca/data/res_ivb.dat @@ -0,0 +1,107 @@ +Using frequency 2.20GHz. 
+vmovsd-xmm_mem: 0.503 (clock cycles) [DEBUG - result: 3.141590] +lea-r64_mem-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000] +vmovupd-load-avx-TP: 1.004 (clock cycles) [DEBUG - result: 3.141590] +movslq-r64_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +lea-r32_mem-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000] +cmp-r32_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000] +sub-r32_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] +test-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +vaddss-xmm_xmm_xmm-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000] +vsubsd-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: -1.000000] +vunpckhpd-xmm_xmm_xmm: 1.017 (clock cycles) [DEBUG - result: 1.000000] +movzbl-r32_r8: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vaddss-xmm_xmm_mem: 3.005 (clock cycles) [DEBUG - result: 2.000002] +dec-r32: 1.003 (clock cycles) [DEBUG - result: 1.000000] +vxorpd-ymm_ymm_ymm: 0.517 (clock cycles) [DEBUG - result: inf] +vaddpd-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: inf] +cmp-r64_imd-TP: 0.341 (clock cycles) [DEBUG - result: 1.000000] +cmp-r64_imd: 0.341 (clock cycles) [DEBUG - result: 1.000000] +vaddsd-xmm_xmm_xmm: 3.004 (clock cycles) [DEBUG - result: inf] +vmovapd-ymm_ymm-TP: 0.864 (clock cycles) [DEBUG - result: 1.000000] +vmovaps-xmm_xmm: 0.681 (clock cycles) [DEBUG - result: 2.000000] +vmovq-xmm_r64: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vxorpd-xmm_xmm_xmm: 0.517 (clock cycles) [DEBUG - result: inf] +vmovq-r64_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vcvtsi2ss-xmm_xmm_r32-TP: 1.033 (clock cycles) [DEBUG - result: 1.000000] +inc-r64: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmovsd-mem_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vaddpd-ymm_ymm_ymm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000] +add-r32_mem: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmulsd-xmm_xmm_mem: 5.007 (clock cycles) [DEBUG - result: inf] +lea-r64_mem: 1.015 (clock 
cycles) [DEBUG - result: 1.000000] +vcvtsi2ss-xmm_xmm_r32: 3.005 (clock cycles) [DEBUG - result: 2.000000] +movslq-r64_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000] +lea-r32_mem: 1.015 (clock cycles) [DEBUG - result: 1.000000] +cmp-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +vxorpd-xmm_xmm_xmm-TP: 0.261 (clock cycles) [DEBUG - result: 1.000000] +vmovsd-xmm_xmm_xmm-TP: 1.018 (clock cycles) [DEBUG - result: 1.000000] +vmovapd-ymm_ymm: 0.681 (clock cycles) [DEBUG - result: 2.000000] +vaddss-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: 2.000000] +vmulsd-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vmovsd-mem_xmm-TP: 1.003 (clock cycles) [DEBUG - result: 1.000000] +mov-r32_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmulss-xmm_xmm_xmm: 5.012 (clock cycles) [DEBUG - result: 2.000000] +vmovhpd-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vsubpd-ymm_ymm_ymm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000] +vmovss-xmm_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmovq-xmm_r64-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vunpckhpd-xmm_xmm_xmm-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000] +add-r64_r64-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +inc-r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] +mov-r64_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmulpd-ymm_ymm_mem-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000] +mov-mem_r64-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmovupd-xmm_mem: 0.503 (clock cycles) [DEBUG - result: 3.141590] +movzbl-r32_r8-TP: 0.286 (clock cycles) [DEBUG - result: 1.000000] +dec-r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +mov-r32_r32-TP: 0.287 (clock cycles) [DEBUG - result: 1.000000] +vmulpd-ymm_ymm_mem: 5.007 (clock cycles) [DEBUG - result: inf] +vaddpd-ymm_ymm_ymm: 3.005 (clock cycles) [DEBUG - result: inf] +movslq-r64_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] 
+vxorpd-ymm_ymm_ymm-TP: 0.258 (clock cycles) [DEBUG - result: 1.000000] +cmp-r32_r32: 0.344 (clock cycles) [DEBUG - result: 1.000000] +vmulsd-xmm_xmm_xmm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000] +mov-r32_r32: 0.668 (clock cycles) [DEBUG - result: 1.000000] +vxorps-xmm_xmm_xmm-TP: 0.258 (clock cycles) [DEBUG - result: 1.000000] +neg-r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vaddsd-xmm_xmm_mem-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000] +vmovq-r64_xmm-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmulpd-ymm_ymm_ymm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000] +vmovss-mem_xmm-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000] +mov-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmulpd-ymm_ymm_ymm: 5.007 (clock cycles) [DEBUG - result: inf] +test-r32_r32: 0.346 (clock cycles) [DEBUG - result: 1.000000] +xor-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +vmovupd-store-avx-TP: 2.005 (clock cycles) [DEBUG - result: 0.000000] +cmp-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +mov-r64_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmovapd-xmm_xmm: 0.681 (clock cycles) [DEBUG - result: 2.000000] +vaddpd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000] +sub-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000] +vmovss-xmm_mem: 0.516 (clock cycles) [DEBUG - result: 0.000000] +add-r64_r64: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmulsd-xmm_xmm_xmm: 5.007 (clock cycles) [DEBUG - result: inf] +vmulss-xmm_xmm_xmm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000] +mov-mem_r32-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000] +mov-mem_r64: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmovsd-xmm_mem-TP: 0.507 (clock cycles) [DEBUG - result: 1.000000] +vaddss-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vsubsd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000] +vmovaps-xmm_xmm-TP: 0.860 (clock cycles) [DEBUG - result: 
1.000000] +movslq-r64_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] +vmovss-mem_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000] +inc-r32-TP: 0.344 (clock cycles) [DEBUG - result: 1.000000] +vmovapd-xmm_xmm-TP: 0.856 (clock cycles) [DEBUG - result: 1.000000] +vaddsd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000] +vmovhpd-xmm_xmm_mem: 1.017 (clock cycles) [DEBUG - result: 2.000000] +vxorps-xmm_xmm_xmm: 0.517 (clock cycles) [DEBUG - result: inf] +vmovsd-xmm_xmm_xmm: 1.017 (clock cycles) [DEBUG - result: 1.000000] +vaddsd-xmm_xmm_mem: 3.005 (clock cycles) [DEBUG - result: 201061760.000000] +add-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000] +vmovupd-xmm_mem-TP: 0.509 (clock cycles) [DEBUG - result: 1.000000] +mov-mem_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] +inc-r64-TP: 0.355 (clock cycles) [DEBUG - result: 1.000000] +neg-r32-TP: 0.344 (clock cycles) [DEBUG - result: 1.000000] +vsubpd-ymm_ymm_ymm: 3.004 (clock cycles) [DEBUG - result: -1.000000] +xor-r32_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000] diff --git a/osaca/data/res_test.dat b/osaca/data/res_test.dat new file mode 100644 index 0000000..e93f019 --- /dev/null +++ b/osaca/data/res_test.dat @@ -0,0 +1,4 @@ +Using frequency 2.20GHz. 
+lea-r64_mem-TP: 1.003 (clock cycles) [DEBUG - result: 3.141590] +jan-xmm_xmm-TP: 0.995 (clock cycles) [DEBUG - result: 3.141590] +jan-xmm_xmm: 2.037 (clock cycles) [DEBUG - result: 3.141590] diff --git a/osaca/data/skl_data.csv b/osaca/data/skl_data.csv new file mode 100644 index 0000000..f31169f --- /dev/null +++ b/osaca/data/skl_data.csv @@ -0,0 +1,53 @@ +instr,TP,LT,ports +jmp-lbl,0.0,0.0,"((5,),)" +jo-lbl,0.0,0.0,"((5,),)" +jno-lbl,0.0,0.0,"((5,),)" +js-lbl,0.0,0.0,"((5,),)" +jns-lbl,0.0,0.0,"((5,),)" +je-lbl,0.0,0.0,"((5,),)" +jz-lbl,0.0,0.0,"((5,),)" +jne-lbl,0.0,0.0,"((5,),)" +jnz-lbl,0.0,0.0,"((5,),)" +jb-lbl,0.0,0.0,"((5,),)" +jnae-lbl,0.0,0.0,"((5,),)" +jc-lbl,0.0,0.0,"((5,),)" +jnb-lbl,0.0,0.0,"((5,),)" +jae-lbl,0.0,0.0,"((5,),)" +jnc-lbl,0.0,0.0,"((5,),)" +jbe-lbl,0.0,0.0,"((5,),)" +jna-lbl,0.0,0.0,"((5,),)" +ja-lbl,0.0,0.0,"((5,),)" +jnbe-lbl,0.0,0.0,"((5,),)" +jl-lbl,0.0,0.0,"((5,),)" +jnge-lbl,0.0,0.0,"((5,),)" +jge-lbl,0.0,0.0,"((5,),)" +jnl-lbl,0.0,0.0,"((5,),)" +jle-lbl,0.0,0.0,"((5,),)" +jng-lbl,0.0,0.0,"((5,),)" +jg-lbl,0.0,0.0,"((5,),)" +jnle-lbl,0.0,0.0,"((5,),)" +jp-lbl,0.0,0.0,"((5,),)" +jpe-lbl,0.0,0.0,"((5,),)" +jnp-lbl,0.0,0.0,"((5,),)" +jpo-lbl,0.0,0.0,"((5,),)" +jcxz-lbl,0.0,0.0,"((5,),)" +jecxz-lbl,0.0,0.0,"((5,),)" +jo-lbl,0.0,0.0,"((5,),)" +jno-lbl,0.0,0.0,"((5,),)" +js-lbl,0.0,0.0,"((5,),)" +jns-lbl,0.0,0.0,"((5,),)" +lea-r64_mem,1.0,1.0,"((2,),(3,))" +lea-r32_mem,1.0,1.0,"((2,),(3,))" +vcvtsi2ss-xmm_xmm_r64,1.0,3.0,"((0,1),(1,5))" +vcvtsi2ss-xmm_xmm_r32,1.0,3.0,"((-1,))" +vmulss-xmm_xmm_xmm,1.0,5.0,"((0,),)" +vaddss-xmm_xmm_mem,1.0,3.0,"((1,),)" +vaddss-xmm_xmm_xmm,1.0,3.0,"((1,),)" +vxorps-xmm_xmm_xmm,0.3333333333333333,1.0,"((0,),(1,),(5,))" +vmovss-xmm_mem,0.5,1.0,"((2,),(3,))" +vmovss-mem_xmm,1.0,1.0,"((2,4),(3,4))" +inc-r32,0.3333333333333333,1.0,"((0,),(1,),(5,))" +inc-r64,0.3333333333333333,1.0,"((0,),(1,),(5,))" +cmp-r64_imd,0.3333333333333333,1.0,"((0,),(1,),(5,))" +cmp-r32_mem,0.5,1.0,"((0,),(1,),(5,))" 
+cmp-r32_r32,0.3333333333333333,1.0,"((0,),(1,),(5,))" diff --git a/osaca/data/tmp_ivb_throughput.csv b/osaca/data/tmp_ivb_throughput.csv new file mode 100644 index 0000000..c49a513 --- /dev/null +++ b/osaca/data/tmp_ivb_throughput.csv @@ -0,0 +1,92 @@ +instr,clock_cycles +jmp-lbl-TP,0.0 +jo-lbl-TP,0.0 +jno-lbl-TP,0.0 +js-lbl-TP,0.0 +jns-lbl-TP,0.0 +je-lbl-TP,0.0 +jz-lbl-TP,0.0 +jne-lbl-TP,0.0 +jnz-lbl-TP,0.0 +jb-lbl-TP,0.0 +jnae-lbl-TP,0.0 +jc-lbl-TP,0.0 +jnb-lbl-TP,0.0 +jae-lbl-TP,0.0 +jnc-lbl-TP,0.0 +jbe-lbl-TP,0.0 +jna-lbl-TP,0.0 +ja-lbl-TP,0.0 +jnbe-lbl-TP,0.0 +jl-lbl-TP,0.0 +jnge-lbl-TP,0.0 +jge-lbl-TP,0.0 +jnl-lbl-TP,0.0 +jle-lbl-TP,0.0 +jng-lbl-TP,0.0 +jg-lbl-TP,0.0 +jnle-lbl-TP,0.0 +jp-lbl-TP,0.0 +jpe-lbl-TP,0.0 +jnp-lbl-TP,0.0 +jpo-lbl-TP,0.0 +jcxz-lbl-TP,0.0 +jecxz-lbl-TP,0.0 +jo-lbl-TP,0.0 +jno-lbl-TP,0.0 +js-lbl-TP,0.0 +jns-lbl-TP,0.0 +vmulss-xmm_xmm_xmm-TP,1.0 +vaddss-xmm_xmm_xmm-TP,1.0 +vxorps-xmm_xmm_xmm-TP,0.25 +inc-r64-TP,0.3333333333333333 +xor-r32_r32-TP,0.3333333333333333 +vcvtsi2ss-xmm_xmm_r32-TP,1.0 +vaddss-xmm_xmm_mem-TP,1.0 +vmovupd-load-avx-TP,1.0 +lea-r32_mem-TP,1.0 +vmovss-xmm_mem-TP,0.5 +vmovss-mem_xmm-TP,1.0 +vmovupd-store-avx-TP,2.0 +lea-r64_mem-TP,1.0 +movslq-r64_mem-TP,0.5 +mov-r64_mem-TP,0.5 +vaddpd-ymm_ymm_ymm-TP,1.0 +cmp-r32_r32-TP,0.3333333333333333 +vmovsd-xmm_xmm_xmm-TP,1.0 +vmulsd-xmm_xmm_mem-TP,1.0 +vmovsd-mem_xmm-TP,1.0 +vmovhpd-xmm_xmm_mem-TP,1.0 +vsubpd-ymm_ymm_ymm-TP,1.0 +vmovq-xmm_r64-TP,1.0 +vunpckhpd-xmm_xmm_xmm-TP,1.0 +vmulpd-ymm_ymm_mem-TP,1.0 +mov-mem_r64-TP,1.0 +movzbl-r32_r8-TP,0.29600000000000004 +vmulsd-xmm_xmm_xmm-TP,1.0 +vaddsd-xmm_xmm_mem-TP,1.0 +vmovq-r64_xmm-TP,1.0 +vmulpd-ymm_ymm_ymm-TP,1.0 +mov-r32_mem-TP,0.5 +cmp-r32_mem-TP,0.5 +vaddpd-xmm_xmm_xmm-TP,1.0 +mov-mem_r32-TP,1.0 +vmovsd-xmm_mem-TP,0.5 +vsubsd-xmm_xmm_xmm-TP,1.0 +vmovaps-xmm_xmm-TP,0.845 +vaddsd-xmm_xmm_xmm-TP,1.0 +add-r32_mem-TP,0.5 +vmovupd-xmm_mem-TP,0.5 +test-r32_r32-TP,0.3333333333333333 +add-r64_r64-TP,0.3333333333333333 
class Scheduler(object):
    """Map a list of instruction forms onto execution ports.

    Throughput, latency and port data are read from
    ``data/<arch>_data.csv`` next to this module. The ``ports`` column is
    parsed with ``ast.literal_eval`` and holds a tuple of alternative port
    combinations, e.g. ``((2,), (3,))``.
    """

    # Number of execution ports per supported micro-architecture
    arch_dict = {'SNB': 6, 'IVB': 6, 'HSW': 8, 'BDW': 8, 'SKL': 8}
    ports = None        # type: int
    # Each item: [mnemonic, operand, ..., operand, printable instruction form]
    instrList = None    # type: list
    df = None           # type: DataFrame

    def __init__(self, arch, instructionList):
        '''
        Selects the port count for ``arch`` and loads its data file.

        Parameters
        ----------
        arch : str
            Architecture key (case-insensitive), one of ``arch_dict``
        instructionList : [[str, Parameter, ..., Parameter, str], ...]
            Instruction forms to schedule

        Raises
        ------
        SystemExit
            If the architecture is not listed in ``arch_dict``
        '''
        arch = arch.upper()
        try:
            self.ports = self.arch_dict[arch]
        except KeyError:
            print('Architecture not supportet for EU scheduling.')
            # Non-zero status: this is an error exit, not a regular one.
            sys.exit(1)
        self.instrList = instructionList
        # Resolve the data directory from the module location instead of
        # cutting a fixed number of characters off the module path.
        dataDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
        self.df = pd.read_csv(os.path.join(dataDir, arch.lower() + '_data.csv'),
                              quotechar='"',
                              converters={'ports': ast.literal_eval})

    @staticmethod
    def _is_unknown_binding(tup):
        '''
        Tells whether a parsed ``ports`` value carries no usable port data.

        Recognised as "unknown": an empty tuple, the marker ``((-1,),)``, a
        single empty combination ``((),)`` and the flat form ``(-1,)``, which
        is what ``ast.literal_eval`` yields for the text ``"((-1,))"``.

        Parameters
        ----------
        tup : tuple
            Parsed content of the ``ports`` column

        Returns
        -------
        bool
            True if no port binding is stored, False otherwise
        '''
        if len(tup) == 0:
            return True
        if len(tup) != 1:
            return False
        first = tup[0]
        if isinstance(first, tuple):
            return len(first) == 0 or first[0] == -1
        return first == -1

    def _lookup(self, instrForm, first_col):
        '''
        Fetches the data row and the port combinations of an instruction form.

        Parameters
        ----------
        instrForm : [str, Parameter, ..., Parameter, str]
            Instruction Form data structure
        first_col : str
            First column of the returned slice (``'TP'`` or ``'LT'``); the
            slice always ends with ``'ports'``

        Returns
        -------
        (DataFrame, tuple) or None
            Matching rows and the port combinations of the first match, or
            None if the form is not in the database or has no port data
        '''
        searchString = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
        entry = self.df.loc[self.df.instr == searchString, first_col:'ports']
        if len(entry.index) == 0:
            return None
        tup = entry.ports.values[0]
        if self._is_unknown_binding(tup):
            return None
        return entry, tup

    def schedule(self):
        '''
        Schedules Instruction Form list and calculates port bindings.

        Instruction forms without port data are listed with a leading ``*``
        and add nothing to the port bindings.

        Returns
        -------
        (str, [int, ...])
            A tuple containing the graphic output of the schedule as string and
            the port bindings as list of ints.
        '''
        lines = [self.get_head()]
        # One independent row of port occupation per instruction form
        occ_ports = [[0] * self.ports for _ in self.instrList]
        port_bndgs = [0] * self.ports
        for i, instrForm in enumerate(self.instrList):
            found = self._lookup(instrForm, 'TP')
            occuranceGroups = []
            if found is not None:
                entry, tup = found
                occuranceGroups = self.get_occurance_groups(
                    self.get_port_occurances(tup))
            if not occuranceGroups:
                # Instruction form not in CSV or without usable port data
                lines.append(self.get_line(occ_ports[i], '* ' + instrForm[-1]))
                continue
            # Port dependent throughput, scaled by the smallest group
            TPGes = entry.TP.values[0] * len(occuranceGroups[0])
            for occGroup in occuranceGroups:
                share = TPGes / len(occGroup)
                for port in occGroup:
                    occ_ports[i][port] = share
            lines.append(self.get_line(occ_ports[i], instrForm[-1]))
            # Add throughput to total port binding
            port_bndgs = [total + new for total, new in zip(port_bndgs, occ_ports[i])]
        return (''.join(lines), port_bndgs)

    def schedule_FCFS(self):
        '''
        Schedules Instruction Form list for a single run with latencies.

        Each form is issued on the first of its port combinations whose ports
        are all free; while none is free, the occupation is stepped by one
        cycle.

        Returns
        -------
        (str, int)
            A tuple containing the graphic output as string and the total throughput time as int.
        '''
        lines = [self.get_head()]
        total = 0
        occ_ports = [0] * self.ports
        for instrForm in self.instrList:
            found = self._lookup(instrForm, 'LT')
            if found is None:
                # Instruction form not in CSV
                lines.append(self.get_line([0] * self.ports, '* ' + instrForm[-1]))
                continue
            entry, tup = found
            latency = entry.LT.values[0]
            issued = False
            while not issued:
                for portOcc in tup:
                    if self.test_ports_FCFS(occ_ports, portOcc):
                        issued = True
                        good = [latency if (j in portOcc) else 0
                                for j in range(self.ports)]
                        lines.append(self.get_line(good, instrForm[-1]))
                        # Add new occupation
                        occ_ports = [busy + new for busy, new in zip(occ_ports, good)]
                        break
                # Step one cycle; clamp at zero so that a fractional latency
                # cannot leave a port with a negative, never-free occupation.
                occ_ports = [max(busy - 1, 0) for busy in occ_ports]
                if latency != 0:
                    total += 1
        # Cycles still pending on the busiest port after the last issue
        total += max(occ_ports)
        return (''.join(lines), total)

    def get_occurance_groups(self, portOccurances):
        '''
        Groups ports in groups by the number of their occurance and sorts
        groups by cardinality

        Ports that never occur are left out. Groups exist only for occurance
        counts that are actually present, so gaps between counts and lists
        without unused ports are handled.

        Parameters
        ----------
        portOccurances : [int, ...]
            List with the length of ports containing the number of occurances
            of each port

        Returns
        -------
        [[int, ...], ...]
            List of lists with all occurance groups sorted by cardinality
            (smallest group first); empty if no port occurs at all
        '''
        byCount = {}
        for port, occurs in enumerate(portOccurances):
            if occurs > 0:
                byCount.setdefault(occurs, []).append(port)
        # Ascending occurance count first, then a stable sort by cardinality
        groups = [byCount[count] for count in sorted(byCount)]
        groups.sort(key=len)
        return groups

    def get_port_occurances(self, tups):
        '''
        Returns the number of each port occurance for the possible port
        occupations

        Parameters
        ----------
        tups : ((int, ...), ...)
            Tuple of tuples of possible port occupations

        Returns
        -------
        [int, ...]
            List in the length of the number of ports for the current architecture,
            containing the amount of occurances for each port
        '''
        ports = [0] * self.ports
        for tup in tups:
            for elem in tup:
                ports[elem] += 1
        return ports

    def test_ports_FCFS(self, occ_ports, needed_ports):
        '''
        Test if current configuration of ports is possible and returns boolean

        Parameters
        ----------
        occ_ports : [int]
            Current occupation of every port
        needed_ports : (int)
            Tuple with needed port(s) for particular instruction form

        Returns
        -------
        bool
            True if needed ports can get scheduled on current port occupation
            False if not
        '''
        return all(occ_ports[port] == 0 for port in needed_ports)

    def get_report_info(self):
        '''
        Creates Report information including all needed annotations.

        Returns
        -------
        str
            String containing the report information
        '''
        analysis = 'Throughput Analysis Report\n' + ('-' * 26) + '\n'
        annotations = ('* - No information for this instruction in database\n'
                       '\n')
        return analysis + annotations

    def get_head(self):
        '''
        Creates right heading for CPU architecture.

        Returns
        -------
        str
            String containing the header: caption, port numbers and a rule
        '''
        horizLine = '-' * 7 * self.ports + '-\n'
        caption = 'Ports Pressure in cycles'
        # Pad the caption on both sides up to the length of the rule
        padding = (len(horizLine) - 24) / 2
        portAnno = ' ' * math.floor(padding) + caption + ' ' * math.ceil(padding) + '\n'
        portLine = ''.join('| {} '.format(i) for i in range(self.ports)) + '|\n'
        return portAnno + portLine + horizLine

    def get_line(self, occ_ports, instrName):
        '''
        Create line with port occupation for output.

        Parameters
        ----------
        occ_ports : (int, ...)
            Occupation per port; a value of 0 is printed as a blank cell
        instrName : str
            Name of instruction form for output

        Returns
        -------
        str
            String for output containing port scheduling for instrName
        '''
        cells = []
        for occupation in occ_ports:
            cycles = ' ' if (occupation == 0) else '%.2f' % float(occupation)
            cells.append('| ' + cycles + ' ')
        return ''.join(cells) + '| ' + instrName + '\n'

    def get_port_binding(self, port_bndg):
        '''
        Creates port binding out of scheduling result.

        Parameters
        ----------
        port_bndg : [int, ...]
            Integer list containing port bindings

        Returns
        -------
        str
            String containing the port binding graphical output
        '''
        header = 'Port Binding in Cycles Per Iteration:\n'
        horizLine = '-' * 10 + '-' * 6 * self.ports + '\n'
        portLine = '| Port |' + ''.join(' {} |'.format(i) for i in range(self.ports)) + '\n'
        cycLine = ('| Cycles |'
                   + ''.join(' {} |'.format(round(cycles, 2)) for cycles in port_bndg)
                   + '\n')
        return header + horizLine + portLine + horizLine + cycLine + horizLine

    def get_operand_suffix(self, instrForm):
        '''
        Creates operand suffix out of list of Parameters.

        General purpose registers become ``r<size>``, memory addresses become
        ``mem``, every other operand its lower-cased string form.

        Parameters
        ----------
        instrForm : [str, Parameter, ..., Parameter, str]
            Instruction Form data structure

        Returns
        -------
        str
            Operand suffix for searching in database
        '''
        opExt = []
        # First item is the mnemonic, last item the printable form
        for operand in instrForm[1:len(instrForm) - 1]:
            if isinstance(operand, Register) and operand.reg_type == 'GPR':
                opExt.append('r' + str(operand.size))
            elif isinstance(operand, MemAddr):
                opExt.append('mem')
            else:
                opExt.append(str(operand).lower())
        return '_'.join(opExt)


if __name__ == '__main__':
    # Small demo: schedule a fixed loop body for Ivy Bridge and print the report
    data = [
        ['lea',Register('RAX'),MemAddr('%edx,(%rax,%rax,1)'),'lea 0x1(%rax,%rax,1),%edx'],
        ['vcvtsi2ss',Register('XMM0'),Register('XMM0'),Register('RAX'),'vcvtsi2ss %edx,%xmm2,%xmm2'],
        ['vmulss',Register('XMM0'),Register('XMM0'),Register('XMM0'),'vmulss %xmm2,%xmm0, %xmm3'],
        ['lea',Register('RAX'),MemAddr('%edx,(%rax,%rax,1)'),'lea 0x2(%rax,%rax,1),%ecx'],
        ['vaddss',Register('XMM0'),Register('XMM0'),Register('XMM0'),'vaddss %xmm3,%xmm1,%xmm4'],
        ['vxorps',Register('XMM0'),Register('XMM0'),Register('XMM0'),'vxorps %xmm1, %xmm1,%xmm1'],
        ['vcvtsi2ss',Register('XMM0'),Register('XMM0'),Register('RAX'),'vcvtsi2ss %ecx,%xmm1, %xmm1'],
        ['vmulss',Register('XMM0'),Register('XMM0'),Register('XMM0'),'vmulss %xmm1,%xmm0,%xmm5'],
        ['vmovss',MemAddr('%edx,(%rax,%rax,1)'),Register('XMM0'),'vmovss %xmm4,0x4(%rsp,%rax,8)'],
        ['vaddss',Register('XMM0'),Register('XMM0'),Register('XMM0'),'vaddss %xmm5,%xmm4,%xmm1'],
        ['vmovss',MemAddr('%edx,(%rax,%rax,1)'),Register('XMM0'),'vmovss %xmm1,0x8(%rsp,%rax,8)'],
        ['inc',Register('RAX'),'inc %rax'],
        ['cmp',Register('RAX'),Parameter('IMD'),'cmp $0x1f3,%rax'],
        ['jb',Parameter('LBL'),'jb 400bc2 ']
    ]

    sched = Scheduler('ivb', data)
    output, binding = sched.schedule()
    print(sched.get_port_binding(binding))
    print(sched.get_report_info(), end='')
    print(output)
    print('Block Throughput: {}'.format(round(max(binding), 2)))
a/osaca/get_instr.py b/osaca/get_instr.py new file mode 100755 index 0000000..f40db87 --- /dev/null +++ b/osaca/get_instr.py @@ -0,0 +1,247 @@ +#!/apps/python/3.5-anaconda/bin/python +import sys +import re +from testcase import * +from param import * + +marker = r'//STARTLOOP' +asm_line = re.compile(r'\s[0-9a-f]+[:]') +numSeps = 0 +sem = 0 +db = {} +sorted_db = [] +lncnt = 1 +#cnt=0 +fname = "" +cntChar = '' +first = True + +def extract_instr(asmFile): + global once + global lncnt + global fname + fname = asmFile +#Check if parameter is in the correct file format + if(asmFile[-4:] != ".log"): + print("Invalid argument") + sys.exit() +#Open file + try: + f=open(asmFile, "r") + except IOError: + print("IOError: File not found") +#Analyse code line by line and check the instructions + lncnt = 1 + for line in f: + check_line(line) + lncnt += 1 + f.close() + + +def check_line(line): + global numSeps + global sem + global first +#Check if marker is in line and count the number of whitespaces if so + if(marker in line): +#But first, check if high level code ist indented with whitespaces or tabs + if(first): + set_counter_char(line) + first = False + numSeps = (re.split(marker,line)[0]).count(cntChar) + sem = 2; + elif(sem > 0): +#We're in the marked code snipped +#Check if the line is ASM code and - if not - check if we're still in the loop + match = re.search(asm_line, line) + if(match): +#Further analysis of instructions +# print("".join(re.split(r'\t',line)[-1:]),end="") +#Check if there are commetns in line + if(r'//' in line): + return + check_instr("".join(re.split(r'\t',line)[-1:])) + elif((re.split(r'\S',line)[0]).count(cntChar) <= numSeps): +#Not in the loop anymore - or yet - so we decrement the semaphore + sem = sem-1 + +#Check if seperator is either tabulator or whitespace +def set_counter_char(line): + global cntChar + numSpaces = (re.split(marker,line)[0]).count(" ") + numTabs = (re.split(marker,line)[0]).count("\t") + if(numSpaces != 0 and numTabs == 0): + 
cntChar = ' ' + elif(numSpaces == 0 and numTabs != 0): + cntChar = '\t' + else: + raise NotImplementedError("Indentation of code is only supported for whitespaces and tabs.") + + +def check_instr(instr): + global db + global lncnt + global cnt + global fname +#Check for strange clang padding bytes + while(instr.startswith("data32")): + instr = instr[7:] +#Seperate mnemonic and operands + mnemonic = instr.split()[0] + params = "".join(instr.split()[1:]) +#Check if line is not only a byte + empty_byte = re.compile(r'[0-9a-f]{2}') + if(re.match(empty_byte, mnemonic) and len(mnemonic) == 2): + return +#Check if there's one or more operand and store all in a list + param_list = flatten(separate_params(params)) + opList = list(param_list) +#Check operands and seperate them by IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or LABEL (LBL) + for i in range(len(param_list)): + op = param_list[i] + if(len(op) <= 0): + op = Parameter("NONE") + elif(op[0] == '$'): + op = Parameter("IMD") + elif(op[0] == '%' and '(' not in op): + j = len(op) + opmask = False + if('{' in op): + j = op.index('{') + opmask = True + op = Register(op[1:j], opmask) + elif('<' in op): + op = Parameter("LBL") + else: + op = MemAddr(op) + param_list[i] = str(op) + opList[i] = op +#Join mnemonic and operand(s) to an instruction form + if(len(mnemonic) > 7): + tabs = "\t" + else: + tabs = "\t\t" + instr_form = mnemonic+tabs+(" ".join(param_list)) +#Check in database for instruction form and increment the counter + if(instr_form in db): + db[instr_form] = db[instr_form]+1 + else: + db[instr_form] = 1 +#Create testcase for instruction form, since it is the first appearance of it +#But (as far as now) only for instr forms with only registers as operands +# is_Reg = True +# for par in opList: +# print(par.print()+" is Register: "+str(isinstance(par, Register))) +# if(not isinstance(par, Register)): +# is_Reg = False +# if(is_Reg): + #print(mnemonic) +# print("create testcase for "+mnemonic+" with params:") +# 
for p in opList: +# print(p.print(),end=", ") +# print() + + +#Only create benchmark if no label (LBL) is part of the operands + do_bench = True + for par in opList: + if(str(par) == 'LBL' or str(par) == ''): + do_bench = False + if(do_bench): +#Create testcase with reversed param list, due to the fact its intel syntax! +# create_testcase(mnemonic, list(reversed(opList))) +# print('menmonic: '+mnemonic+' ops: '+str(list(reversed(opList)))) + tc = Testcase(mnemonic, list(reversed(opList)), '64') + tc.write_testcase() +# print("-----------") + +def separate_params(params): + param_list = [params] + if(',' in params): + if(')' in params): + if(params.index(')') < len(params)-1 and params[params.index(')')+1] == ','): + i = params.index(')')+1 + elif(params.index('(') < params.index(',')): + return param_list + else: + i = params.index(',') + else: + i = params.index(',') + param_list = [params[:i],separate_params(params[i+1:])] + elif('#' in params): + i = params.index('#') + param_list = [params[:i]] + return param_list + + +def sort_db(): + global sorted_db + sorted_db=sorted(db.items(), key=lambda x:x[1], reverse=True) + + +def print_sorted_db(): + sort_db() + sum = 0 + print("Number of\tmnemonic") + print("calls\n") + for i in range(len(sorted_db)): + print(str(sorted_db[i][1])+"\t\t"+sorted_db[i][0]) + sum += sorted_db[i][1] + print("\nCumulated number of instructions: "+str(sum)) + + +def save_db(): + global db + file = open(".cnt_asm_ops.db","w") + for i in db.items(): + file.write(i[0]+"\t"+str(i[1])+"\n") + file.close() + + +def load_db(): + global db + try: + file = open(".cnt_asm_ops.db", "r") + except FileNotFoundError: + print("no database found in current directory") + return + for line in file: + mnemonic = line.split('\t')[0] +#Join mnemonic and operand(s) to an instruction form + if(len(mnemonic) > 7): + tabs = "\t" + params = line.split('\t')[1] + numCalls = line.split("\t")[2][:-1] + else: + tabs = "\t\t" + params = line.split('\t')[2] + numCalls = 
def flatten(l):
    """
    Flatten an arbitrarily nested list into a flat list.

    Only elements that are instances of ``list`` are expanded; every other
    element is copied to the result in the order it is encountered.

    The traversal is iterative, so neither the length nor the nesting depth
    of the input is bound by the interpreter's recursion limit.

    Parameters
    ----------
    l : list
        (Nested) list

    Returns
    -------
    list
        New list containing all non-list elements of ``l``
    """
    flat = []
    # Stack of iterators, the innermost list currently walked is on top
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend and resume the outer iterator afterwards
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Iterator on top is exhausted
            pending.pop()
    return flat
= _filepath + self.instrForms = [] + + +##-------------------main functions depending on arguments---------------------- + def include_ibench(self): + """ + Reads ibench output and includes it in the architecture specific csv + file. + """ +# Check args and exit program if something's wrong + if(not self.check_arch()): + print('Invalid microarchitecture.') + sys.exit() + if(not self.check_file()): + print('Invalid file path or file format.') + sys.exit() +# Check for database for the chosen architecture + self.df = self.read_csv() +# Create sequence of numbers and their reciprokals for validate the measurements + cycList,reciList = self.create_sequences() + print('Everything seems fine! Let\'s start!') + newData = [] + addedValues = 0 + for line in self.srcCode: + if('Using frequency' in line or len(line) == 0): + continue + clmn = 'LT' + instr = line.split()[0][:-1] + if('TP' in line): +# We found a command with a throughput value. Get instruction and the number of +# clock cycles and remove the '-TP' suffix. + clmn = 'TP' + instr = instr[:-3] +# Otherwise it is a latency value. Nothing to do. 
def inspect_binary(self):
    """
    Main function of OSACA. Inspect binary file and create analysis.

    Validates the architecture and the input file, loads the architecture
    specific database into attribute df, runs check_line() on every line of
    srcCode and prints the output created by create_output().
    The program exits if the architecture or the file is rejected.
    """
#   Check args and exit program if something's wrong
    if not self.check_arch():
        print('Invalid microarchitecture.')
        sys.exit()
    if not self.check_elffile():
        print('Invalid file path or file format.')
        sys.exit()
#   Finally check for database for the chosen architecture and keep it, the
#   same way include_ibench() does, so the throughput lookup can use it
    self.df = self.read_csv()

    print('Everything seems fine! Let\'s start checking!')
    for i, line in enumerate(self.srcCode):
#       Only the very first line is flagged as first appearance
        self.check_line(line, i == 0)
    output = self.create_output()
    print(output)
def store_srcCode(self, iacaFlag=False):
    """
    Load arbitrary file in class attribute srcCode.

    If the file cannot be opened, a message is printed and srcCode is set to
    the empty content ('' or ['']) instead of failing on an unbound file
    handle.

    Parameters
    ----------
    iacaFlag : bool
        store file data as a string in attribute srcCode if True,
        store it as a list of strings (lines) if False (default False)
    """
    try:
#       The context manager closes the file even if reading fails
        with open(self.filepath, 'r') as f:
            content = f.read()
    except IOError:
        print('IOError: file \'{}\' not found'.format(self.filepath))
        content = ''
    self.srcCode = content if iacaFlag else content.split('\n')
def validate_val(self, clkC, instr, isTP, cycList, reciList):
    """
    Snap a measured clock cycle value to the closest expected value.

    The measurement is accepted if it lies strictly within 5% of an entry of
    cycList or - for throughput values only - within 5% of the entry of
    reciList at the same position. Candidates are tried position by
    position, the cycList entry before the reciList entry. If nothing fits,
    a warning is printed and the measurement is handed back untouched.

    Parameters
    ----------
    clkC : float
        Clock cycle to validate (anything float() accepts)
    instr : str
        Instruction for warning output
    isTP : bool
        True if a throughput value is to check, False for a latency value
    cycList : [int]
        Cycle list for validating
    reciList : [float]
        Reciprocal cycle list for validating

    Returns
    -------
    float
        Matching entry of cycList or reciList, or the given clkC parameter
        if no entry matches
    """
    for pos, cycles in enumerate(cycList):
        measured = float(clkC)
        if cycles*0.95 < measured < cycles*1.05:
#           Close enough to an integer number of cycles
            return cycles
        if isTP:
#           Reciprocals are valid for throughput values only
            reciprocal = reciList[pos]
            if reciprocal*0.95 < measured < reciprocal*1.05:
                return reciprocal
#   Neither an integer nor a reciprocal is close to the measurement
    column = 'TP' if isTP else 'LT'
    print('Your measurement for {} ({}) is probably wrong. Please inspect your benchmark!'.format(instr, column))
    print('The program will continue with the given value')
    return clkC
def get_indent_chars(self, line):
    """
    Determine whether the text in front of the marker uses blanks or tabs.

    Parameters
    ----------
    line : str
        Line with start marker in it

    Returns
    -------
    str
        ' ' if only blanks, '\\t' if only tabulators precede the marker

    Raises
    ------
    NotImplementedError
        If the text in front of the marker contains both or none of the two
        characters
    """
#   Everything in front of the first occurrence of the marker
    prefix = re.split(self.marker, line)[0]
    hasSpaces = ' ' in prefix
    hasTabs = '\t' in prefix
    if hasSpaces and not hasTabs:
        return ' '
    if hasTabs and not hasSpaces:
        return '\t'
    raise NotImplementedError('Indentation of code is only supported for whitespaces and tabs.')
def iaca_asm(self):
    """
    Extract instruction forms out of assembly file using IACA markers.

    Expects srcCode to hold the whole file as one string. Every line between
    the start marker (IACA_SM) and the end marker (IACA_EM) is stripped of
    its '#' comment and leading whitespace and passed to check_instr(),
    except for empty lines, lines containing '//' and lines starting with
    '..'.

    Raises
    ------
    IndexError
        If the text runs out before a marker is found
    """
    remaining = self.srcCode
#   Drop leading lines until the text starts with the start marker
    while not re.match(self.IACA_SM, remaining):
        remaining = remaining.split('\n', 1)[1]
#   Skip the marker: cut everything up to the first '144' and the rest of
#   that line
    remaining = remaining.split('144', 1)[1].split('\n', 1)[1]
#   Collect lines until the text starts with the end marker
    kernel = []
    while not re.match(self.IACA_EM, remaining):
        pieces = remaining.split('\n', 1)
        kernel.append(pieces[0])
        remaining = pieces[1]
#   Go on like with OSACA markers
    for raw in kernel:
        stripped = raw.split('#')[0].lstrip()
        if not stripped or '//' in stripped or stripped.startswith('..'):
            continue
        self.check_instr(stripped)
def separate_params(self, params):
    """
    Delete comments, separates parameters and return them as a list.

    Parameters are separated at every comma that is not enclosed in
    parentheses, so a memory reference such as ``(%rax,%rsi,4)`` stays in one
    piece regardless of its position in the parameter list.

    Parameters
    ----------
    params : str
        Splitted line after mnemonic

    Returns
    -------
    [[...[str]]]
        Nested list of strings of the form ``[op1, [op2, [op3]]]``. The
        number of nest levels depend on the number of parameters given.
    """
#   Everything behind '#' is a comment. It has to go first, otherwise a comma
#   inside the comment would be taken for a parameter separator.
    params = params.split('#', 1)[0]
    operands = []
    depth = 0
    start = 0
    for pos, char in enumerate(params):
        if char == '(':
            depth += 1
        elif char == ')':
#           Never drop below zero on unbalanced input
            depth = max(depth - 1, 0)
        elif char == ',' and depth == 0:
            operands.append(params[start:pos])
            start = pos + 1
    operands.append(params[start:])
#   Build the nested representation from the innermost (last) parameter
    param_list = [operands[-1]]
    for operand in reversed(operands[:-1]):
        param_list = [operand, param_list]
    return param_list
def create_TP_list(self, horizLine):
    """
    Create list of instruction forms with the proper throughput value.

    For every entry of instrForms the operands are translated into a database
    key ('<mnemonic>-<op>_<op>...') and the TP column of attribute df is
    looked up. If the exact form is unknown, memory operands are replaced by
    a register operand of the same instruction and the lookup is repeated.
    Forms without any value are listed with throughput 0 and marked by '*'.

    Parameter
    ---------
    horizLine : str
        Calculated horizontal line for nice alignement

    Returns
    -------
    str
        Throughput list output for printing
    """
    import warnings
#   str.contains() interprets the key as regular expression; silence pandas'
#   hint about match groups once instead of once per instruction form
    warnings.filterwarnings("ignore", 'This pattern has match groups')
    warning = False
    ws = ' '*(len(horizLine)-23)

    output = ('\n| INSTRUCTION'+ws+'CLOCK CYCLES\n'
              '| '+horizLine+'\n|\n')
#   Check for the throughput data in CSV
    for elem in self.instrForms:
#       elem is [mnemonic, operand, ..., instruction string]
        opExt = []
        for operand in elem[1:-1]:
            if isinstance(operand, Register) and operand.reg_type == 'GPR':
                opExt.append('r'+str(operand.size))
            elif isinstance(operand, MemAddr):
                opExt.append('mem')
            else:
                opExt.append(str(operand).lower())
        operands = '_'.join(opExt)
#       Now look up the value in the dataframe
#       NOTE(review): contains() is a substring/regex test while the value
#       is fetched by equality, so a partial hit ends in the IndexError
#       branch below - confirm whether an exact comparison is intended.
        series = self.df['instr'].str.contains(elem[0]+'-'+operands)
        if True in series.values:
#           It's a match!
            notFound = False
            try:
                tp = self.df[self.df.instr == elem[0]+'-'+operands].TP.values[0]
            except IndexError:
#               Something went wrong
                print('Error while fetching data from database')
                continue
#       Did not found the exact instruction form.
#       Try to find the instruction form for register operands only
        else:
            opExtRegs = []
            for operand in opExt:
                try:
#                   Constructing the register is the validity check
                    Register(operand)
                    opExtRegs.append(True)
                except KeyError:
                    opExtRegs.append(False)
            if True not in opExtRegs:
#               No register in whole instruction form. How can I find out
#               what regsize we need?
                print('Feature not included yet: ', end='')
                print(elem[0]+' for '+operands)
                tp = 0
                warning = True

                numWhitespaces = self.longestInstr-len(elem[-1])
                ws = ' '*numWhitespaces+'| '
                n_f = ' '*(5-len(str(tp)))+'*'
                output += '| '+elem[-1]+ws+str(tp)+n_f+'\n'
                continue
            if not opExtRegs[0]:
#               Instruction stores result in memory. Check for storing in
#               register instead.
                if len(opExt) > 1:
                    if opExtRegs[1]:
                        opExt[0] = opExt[1]
                    elif len(opExt) > 2:
                        if opExtRegs[2]:
                            opExt[0] = opExt[2]
            if len(opExtRegs) == 2 and not opExtRegs[1]:
#               Instruction loads value from memory and has only two
#               operands. Check for loading from register instead
                if opExtRegs[0]:
                    opExt[1] = opExt[0]
            if len(opExtRegs) == 3 and not opExtRegs[2]:
#               Instruction loads value from memory and has three operands.
#               Check for loading from register instead
                opExt[2] = opExt[0]
            operands = '_'.join(opExt)
#           Check for register equivalent instruction
            series = self.df['instr'].str.contains(elem[0]+'-'+operands)
            if True in series.values:
#               It's a match!
                notFound = False
                try:
                    tp = self.df[self.df.instr == elem[0]+'-'+operands].TP.values[0]
                except IndexError:
#                   Something went wrong
                    print('Error while fetching data from database')
                    continue
#           Did not found the register instruction form. Set warning and go
#           on with throughput 0
            else:
                tp = 0
                notFound = True
                warning = True
#       Check the alignement again
        numWhitespaces = self.longestInstr-len(elem[-1])
        ws = ' '*numWhitespaces+'| '
        n_f = ''
        if notFound:
            n_f = ' '*(5-len(str(tp)))+'*'
        output += '| '+elem[-1]+ws+'{:3.2f}'.format(tp)+n_f+'\n'
#   Finally end the list of throughput values
    output += '| '+horizLine+'\n'
    if warning:
        output += ('\n\n* There was no throughput value found '
                   'for the specific instruction form.'
                   '\n  Please create a testcase via the create_testcase-method '
                   'or add a value manually.')
    return output
class Parameter(object):
    """
    Generic instruction operand, identified by its upper-cased type name.

    The type has to be one of the entries of type_list, otherwise the
    constructor raises a NameError.
    """
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name="NONE"):
        # The argument name is accepted but not evaluated here
        normalized = ptype.upper()
        self.ptype = normalized
        if normalized not in self.type_list:
            raise NameError("Type not supported: "+ptype)

    def __str__(self):
        '''returns string representation'''
        # The placeholder type is rendered as an empty string
        return "" if self.ptype == "NONE" else self.ptype
class Register(Parameter):
    """
    Register operand with its size in bits and its register class.

    The class attribute sizes maps an upper-case register name to a tuple
    (size in bits, register type). Besides concrete registers it contains
    generic names for a whole register class ('R8', 'R16', 'R32', 'R64',
    'FPU', 'MMX', 'XMM', 'YMM', 'ZMM', 'K', 'BND').
    """
    sizes = {}
#   General Purpose Registers (incl. segment registers, flags and instruction
#   pointers, which are listed as type GPR as well)
    sizes.update({reg: (8, "GPR") for reg in (
        "AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
        "BPL", "SIL", "DIL", "SPL")})
#   "BC" is kept for backward compatibility; "BX" is the actual register name
    sizes.update({reg: (16, "GPR") for reg in (
        "AX", "BC", "BX", "CX", "DX", "BP", "SI", "DI", "SP",
        "CS", "DS", "SS", "ES", "FS", "GS")})
    sizes.update({reg: (32, "GPR") for reg in (
        "EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP",
        "EFLAGS", "EIP")})
    sizes.update({reg: (64, "GPR") for reg in (
        "RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP",
        "RFLAGS", "RIP")})
#   R8-R15 and their 8 bit (L/B), 16 bit (W) and 32 bit (D) parts
    sizes.update({"R{}{}".format(num, suffix): (bits, "GPR")
                  for suffix, bits in (("L", 8), ("B", 8), ("W", 16),
                                       ("D", 32), ("", 64))
                  for num in range(8, 16)})
#   FPU Registers
    sizes.update({"ST{}".format(num): (80, "FPU") for num in range(8)})
#   MMX Registers
    sizes.update({"MM{}".format(num): (64, "MMX") for num in range(8)})
#   XMM, YMM and ZMM Registers
    sizes.update({"{}{}".format(kind, num): (bits, kind)
                  for kind, bits in (("XMM", 128), ("YMM", 256), ("ZMM", 512))
                  for num in range(32)})
#   Opmask Register
    sizes.update({"K{}".format(num): (64, "K") for num in range(8)})
#   Bounds Registers
    sizes.update({"BND{}".format(num): (128, "BND") for num in range(4)})
#   Registers in general
#   NOTE(review): the generic 8 bit name "R8" replaces the entry of the
#   64 bit register R8 added above, exactly as in the original literal.
    sizes.update({
        "R8": (8, "GPR"), "R16": (16, "GPR"), "R32": (32, "GPR"),
        "R64": (64, "GPR"), "FPU": (80, "FPU"), "MMX": (64, "MMX"),
        "XMM": (128, "XMM"), "YMM": (256, "YMM"), "ZMM": (512, "ZMM"),
        "K": (64, "K"), "BND": (128, "BND"),
    })

    def __init__(self, name, mask=False):
        """
        Look up size and type of the register called name.

        Parameters
        ----------
        name : str
            Register name without '%' prefix, case insensitive
        mask : bool
            True if the register is used with an opmask (default False)

        Raises
        ------
        KeyError
            If the register name is not part of sizes
        """
        self.name = name.upper()
        self.mask = mask
        try:
            self.size, self.reg_type = self.sizes[self.name]
        except KeyError:
#           An unknown name has always surfaced as KeyError; keep the type
#           and add a readable message
            raise KeyError("Register name not in dictionary: "+self.name) from None

    def __str__(self):
        '''returns string representation'''
        opmask = ""
        if(self.mask):
            opmask = "{opmask}"
        return(self.reg_type+opmask)
'r14l', 'r15l'] + fpus = ['st0', 'st1', 'st2', 'st3', 'st4', 'st5', 'st6', 'st7'] + mmxs = ['mm0', 'mm1', 'mm2', 'mm3', 'mm4', 'mm5', 'mm6', 'mm7'] + ks = ['k0', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7'] + bnds = ['bnd0', 'bnd1', 'bnd2', 'bnd3', 'bnd4', 'bnd5', 'bnd6', 'bnd7'] + xmms = ['xmm0', 'xmm1', 'xmm2', 'xmm3', 'xmm4', 'xmm5', 'xmm6', 'xmm7', 'xmm8', 'xmm9', + 'xmm10', 'xmm11', 'xmm12', 'xmm13', 'xmm14', 'xmm15'] + ymms = ['ymm0', 'ymm1', 'ymm2', 'ymm3', 'ymm4', 'ymm5', 'ymm6', 'ymm7', 'ymm8', 'ymm9', + 'ymm10', 'ymm11', 'ymm12', 'ymm13', 'ymm14', 'ymm15'] + zmms = ['zmm0', 'zmm1', 'zmm2', 'zmm3', 'zmm4', 'zmm5', 'zmm6', 'zmm7', 'zmm8', 'zmm9', + 'zmm10', 'zmm11', 'zmm12', 'zmm13', 'zmm14', 'zmm15'] +# Lookup table for memory + mems = ['[rip+PI]','[rip+PI]','[rip+PI]','[rip+PI]','[rip+PI]','[rip+PI]','[rip+PI]','[rip+PI]'] +# Lookup table for immediates + imds = ['1', '2', '13', '22', '8', '78', '159', '222', '3', '9', '5', '55', '173', '317', '254', '255'] +# TODO Differentiate between AVX512 (with additional xmm16-31) and the rest +# ... +# ... 
+# end TODO + + ops = {'gpr64':gprs64, 'gpr32':gprs32, 'gpr16':gprs16, 'gpr8':gprs8, 'fpu':fpus, 'mmx':mmxs, 'k':ks, 'bnd':bnds, 'xmm':xmms, 'ymm':ymms, 'zmm':zmms, 'mem':mems, 'imd':imds} + +# Create Single Precision 1.0 + sp1 = '\t\t# create SP 1.0\n' + sp1 += '\t\tvpcmpeqw xmm0, xmm0, xmm0\n' + sp1 += '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n' + sp1 += '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n' + sp1 += '\t\t# copy SP 1.0\n' +# Create Double Precision 1.0 + dp1 = '\t\t# create DP 1.0\n' + dp1 += '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n' + dp1 += '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n' + dp1 += '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n' +# Create epilogue + done = ('done:\n' + '\t\tmov\trsp, rbp\n' + '\t\tpop\trbp\n' + '\t\tret\n' + '.size latency, .-latency') +##---------------------------------------------------------------- + +# Constructor + def __init__(self, _mnemonic, _param_list, _num_instr='32'): + self.instr = _mnemonic.lower() + self.param_list = _param_list +# num_instr must be an even number + self.num_instr = str(ceil(int(_num_instr)/2)*2) +# Check for the number of operands and initialise the GPRs if necessary + self.op_a, self.op_b, self.op_c, self.gprPush, self.gprPop, self.zeroGPR, self.copy = self.__define_operands() + self.num_operands = len(self.param_list) + +# Create asm header + self.def_instr, self.ninstr, self.init, self.expand = self.__define_header() +# Create latency and throughput loop + self.loop_lat = self.__define_loop_lat() + self.loop_thrpt = self.__define_loop_thrpt() +# Create extension for testcase name + sep1 = '_' if (self.num_operands > 1) else '' + sep2 = '_' if (self.num_operands > 2) else '' + self.extension = ('-'+(self.op_a if ('gpr' not in self.op_a) else 'r' + self.op_a[3:]) + sep1 + (self.op_b if ('gpr' not in 
self.op_b) else 'r'+self.op_b[3:]) + sep2 + (self.op_c if ('gpr' not in self.op_c) else 'r'+self.op_c[3:])) + + + def write_testcase(self, TP=True, LT=True): + """ + Write testcase for class attributes in a file. + + Parameters + ---------- + TP : bool + Controls if throughput testcase should be written + (default True) + + LT : bool + Controls if latency testcase should be written + (default True) + """ + if(LT): +# Write latency file + call(['mkdir', '-p', os.path.dirname(__file__)+'/../testcases']) + f = open(os.path.dirname(__file__)+'/../testcases/'+self.instr+self.extension+'.S', 'w') + data = (self.def_instr+self.ninstr+self.init+self.dp1+self.expand+self.gprPush+self.zeroGPR+self.copy+self.loop_lat+self.gprPop+self.done) + f.write(data) + f.close() + if(TP): +# Write throughput file + f = open(os.path.dirname(__file__)+'/../testcases/'+self.instr+self.extension+'-TP.S', 'w') + data = (self.def_instr+self.ninstr+self.init+self.dp1+self.expand+self.gprPush+self.zeroGPR+self.copy+self.loop_thrpt+self.gprPop+self.done) + f.write(data) + f.close() + + +# Check operands + def __define_operands(self): + """ + Check for the number of operands and initialise the GPRs if necessary. + + Returns + ------- + (str, str, str, str, str, str) + String tuple containing types of operands and if needed push/pop operations, the + initialisation of general purpose regs and the copy if registers. 
+ """ + oprnds = self.param_list + op_a, op_b, op_c = ('', '', '') + gprPush, gprPop, zeroGPR = ('', '', '') + if(isinstance(oprnds[0], Register)): + op_a = oprnds[0].reg_type.lower() + elif(isinstance(oprnds[0], MemAddr)): + op_a = 'mem' + elif(isinstance(oprnds[0], Parameter) and str(oprnds[0]) == 'IMD'): + op_a = 'imd' + if(op_a == 'gpr'): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + op_a += str(oprnds[0].size) + if(len(oprnds) > 1): + if(isinstance(oprnds[1], Register)): + op_b = oprnds[1].reg_type.lower() + elif(isinstance(oprnds[1], MemAddr)): + op_b = 'mem' + elif(isinstance(oprnds[1], Parameter) and str(oprnds[1]) == 'IMD'): + op_b = 'imd' + if(op_b == 'gpr'): + op_b += str(oprnds[1].size) + if('gpr' not in op_a): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + if(len(oprnds) == 3): + if(isinstance(oprnds[2], Register)): + op_c = oprnds[2].reg_type.lower() + elif(isinstance(oprnds[2], MemAddr)): + op_c = 'mem' + elif(isinstance(oprnds[2], Parameter) and str(oprnds[2]) == 'IMD'): + op_c = 'imd' + if(op_c == 'gpr'): + op_c += str(oprnds[2].size) + if(('gpr' not in op_a) and ('gpr'not in op_b)): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + if(len(oprnds) == 1 and isinstance(oprnds[0], Register)): + copy = self.__copy_regs(oprnds[0]) + elif(len(oprnds) > 1 and isinstance(oprnds[1], Register)): + copy = self.__copy_regs(oprnds[1]) + elif(len(oprnds) > 2 and isinstance(oprnds[2], Register)): + copy = self.__copy_regs(oprnds[1]) + else: + copy = '' + return (op_a, op_b, op_c, gprPush, gprPop, zeroGPR, copy) + + + def __initialise_gprs(self): + """ + Initialise eleven general purpose registers and set them to zero. 
+ + Returns + ------- + (str, str, str) + String tuple for push, pop and initalisation operations + """ + + gprPush = '' + gprPop = '' + zeroGPR = '' + for reg in self.gprs64: + gprPush += '\t\tpush {}\n'.format(reg) + for reg in reversed(self.gprs64): + gprPop += '\t\tpop {}\n'.format(reg) + for reg in self.gprs64: + zeroGPR += '\t\txor {}, {}\n'.format(reg, reg) + return (gprPush, gprPop, zeroGPR) + + +# Copy created values in specific register + def __copy_regs(self, reg): + """ + Copy created values in specific register. + + Parameters + ---------- + reg : Register + Register for copying the value + + Returns + ------- + str + String containing the copy instructions + """ + copy = '\t\t# copy DP 1.0\n' +# Different handling for GPR, MMX and SSE/AVX registers + if(reg.reg_type == 'GPR'): + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0]) + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][1]) + copy += '\t\t# Create DP 2.0\n' + copy += '\t\tadd {}, {}\n'.format(self.ops['gpr64'][1], self.ops['gpr64'][0]) + copy += '\t\t# Create DP 0.5\n' + copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0]) + copy += '\t\tmovq {}, {}\n'.format(self.ops['gpr64'][2], self.ops['gpr64'][0]) + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0]) + elif(reg.reg_type == 'MMX'): + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][0]) + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][1]) + copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0]) + copy += '\t\t# Create DP 2.0\n' + copy += '\t\tadd {}, {}\n'.format(ops['mmx'][1], ops['mmx'][0]) + copy += '\t\t# Create DP 0.5\n' + copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0]) + copy += '\t\tmovq {}, {}\n'.format(self.ops['mmx'][2], self.ops['gpr64'][0]) + elif(reg.reg_type == 'XMM' or reg.reg_type == 'YMM' or reg.reg_type == 'ZMM'): + key = reg.reg_type.lower() + copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][0], self.ops[key][0]) + copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][1], 
self.ops[key][0]) + copy += '\t\t# Create DP 2.0\n' + copy += '\t\tvaddpd {}, {}, {}\n'.format(self.ops[key][1], self.ops[key][1], self.ops[key][1]) + copy += '\t\t# Create DP 0.5\n' + copy += '\t\tvdivpd {}, {}, {}\n'.format(self.ops[key][2], self.ops[key][0], self.ops[key][1]) + else: + copy = '' + return copy + + + def __define_header(self): + """ + Define header. + + Returns + ------- + (str, str, str, str) + String tuple containing the header, value initalisations and extensions + """ + def_instr = '#define INSTR '+self.instr+'\n' + ninstr = '#define NINST '+self.num_instr+'\n' + pi = ('PI:\n' + '.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, ' #128 bit + '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, ' #256 bit + '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, ' #384 bit + '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9\n') #512 bit + init = ('#define N edi\n' \ + '#define i r8d\n\n\n' + '.intel_syntax noprefix\n' + '.globl ninst\n' + '.data\n' + 'ninst:\n' + '.long NINST\n' + '.align 32\n' + +pi+ + '.text\n' + '.globl latency\n' + '.type latency, @function\n' + '.align 32\n' + 'latency:\n' + '\t\tpush rbp\n' + '\t\tmov rbp, rsp\n' + '\t\txor i, i\n' + '\t\ttest N, N\n' + '\t\tjle done\n') +# Expand to AVX(512) if necessary + expand = '' + if(self.op_a == 'ymm' or self.op_b == 'ymm' or self.op_c == 'ymm'): + expand = ('\t\t# expand from SSE to AVX\n' + '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n') + if(self.op_a == 'zmm' or self.op_b == 'zmm' or self.op_c == 'zmm'): + expand = ('\t\t# expand from SSE to AVX\n' + '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n' + '\t\t# expand from AVX to AVX512\n' + '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n') + return (def_instr, ninstr, init, expand) + + + def __define_loop_lat(self): + """ + Create latency loop. 
+ + Returns + ------- + str + Latency loop as string + """ + loop_lat = ('loop:\n' + '\t\tinc i\n') + if(self.num_operands == 1): + for i in range(0, int(self.num_instr)): + loop_lat += '\t\tINSTR {}\n'.format(self.ops[self.op_a][0]) + elif(self.num_operands == 2 and self.op_a == self.op_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.op_a][0], self.ops[self.op_b][1]) + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.op_b][1], self.ops[self.op_b][0]) + elif(self.num_operands == 2 and self.op_a != self.op_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.op_a][0], self.ops[self.op_b][0]) + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.op_a][0], self.ops[self.op_b][0]) + elif(self.num_operands == 3 and self.op_a == self.op_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.op_a][0], self.ops[self.op_b][1], self.ops[self.op_c][0]) + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.op_a][1], self.ops[self.op_b][0], self.ops[self.op_c][0]) + elif(self.num_operands == 3 and self.op_a == self.op_c): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.op_a][0], self.ops[self.op_b][0], self.ops[self.op_c][0]) + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.op_a][1], self.ops[self.op_b][0], self.ops[self.op_c][0]) + loop_lat += ('\t\tcmp i, N\n' + '\t\tjl loop\n') + return loop_lat + + + def __define_loop_thrpt(self): + """ + Create throughput loop. 
+ + Returns + ------- + str + Throughput loop as string + """ + loop_thrpt = ('loop:\n' + '\t\tinc i\n') + ext = '' + ext1 = False + ext2 = False + if(self.num_operands == 2): + ext1 = True + if(self.num_operands == 3): + ext1 = True + ext2 = True + for i in range(0, int(self.num_instr)): + if(ext1): + ext = ', {}'.format(self.ops[self.op_b][i%3]) + if(ext2): + ext += ', {}'.format(self.ops[self.op_c][i%3]) + regNum = (i%(len(self.ops[self.op_a])-3))+3 + loop_thrpt += '\t\tINSTR {}{}\n'.format(self.ops[self.op_a][regNum], ext) + loop_thrpt += ('\t\tcmp i, N\n' + '\t\tjl loop\n') + return loop_thrpt + + + def __is_in_dir(self): + """ + Check if testcases with the same name already exist in testcase + directory. + + Returns + ------- + (bool, bool) + True if file is in directory + False if file is not in directory + While the first value stands for the throughput testcase + and the second value stands for the latency testcase + """ + TP = False + LT = False + name = self.instr+self.extension + for root, dirs, files in os.walk(os.path.dirname(__file__)+'/testcases'): + if((name+'-TP.S') in files): + TP = True + if name+'.S' in files: + LT = True + return (TP,LT) diff --git a/testcases/TaxCalc/add-r32_mem-TP.S b/testcases/TaxCalc/add-r32_mem-TP.S new file mode 100644 index 0000000..64fc02f --- /dev/null +++ b/testcases/TaxCalc/add-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq 
xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + 
ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/add-r32_mem.S b/testcases/TaxCalc/add-r32_mem.S new file mode 100644 index 0000000..7c94bcc --- /dev/null +++ b/testcases/TaxCalc/add-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] 
+ INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/add-r64_r64-TP.S b/testcases/TaxCalc/add-r64_r64-TP.S new file mode 100644 index 0000000..d475743 --- /dev/null +++ b/testcases/TaxCalc/add-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor 
r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/add-r64_r64.S b/testcases/TaxCalc/add-r64_r64.S new file mode 100644 index 0000000..a64dc7c --- /dev/null +++ b/testcases/TaxCalc/add-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + 
pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r32_mem-TP.S b/testcases/TaxCalc/cmp-r32_mem-TP.S new file mode 100644 index 0000000..88baf8d --- /dev/null +++ b/testcases/TaxCalc/cmp-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, 
[rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r32_mem.S b/testcases/TaxCalc/cmp-r32_mem.S new file mode 100644 index 0000000..12b88d1 --- /dev/null +++ b/testcases/TaxCalc/cmp-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push 
rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r32_r32-TP.S b/testcases/TaxCalc/cmp-r32_r32-TP.S new file mode 
100644 index 0000000..c359fe8 --- /dev/null +++ b/testcases/TaxCalc/cmp-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + 
INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r32_r32.S b/testcases/TaxCalc/cmp-r32_r32.S new file mode 100644 index 0000000..99b4b20 --- /dev/null +++ b/testcases/TaxCalc/cmp-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, 
ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r64_imd-TP.S b/testcases/TaxCalc/cmp-r64_imd-TP.S new file mode 100644 index 0000000..a24dfd0 --- /dev/null +++ b/testcases/TaxCalc/cmp-r64_imd-TP.S @@ -0,0 +1,170 @@ +#define INSTR cmp +#define NINST 100 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all 
ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + cmp i, N + 
jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/cmp-r64_imd.S b/testcases/TaxCalc/cmp-r64_imd.S new file mode 100644 index 0000000..00198e9 --- /dev/null +++ b/testcases/TaxCalc/cmp-r64_imd.S @@ -0,0 +1,170 @@ +#define INSTR cmp +#define NINST 100 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR 
rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/dec-r32-TP.S b/testcases/TaxCalc/dec-r32-TP.S new file mode 100644 index 0000000..f886ad1 --- /dev/null +++ b/testcases/TaxCalc/dec-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading 
mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/dec-r32.S b/testcases/TaxCalc/dec-r32.S new file mode 100644 index 0000000..7c18fd9 --- /dev/null +++ b/testcases/TaxCalc/dec-r32.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/inc-r32-TP.S b/testcases/TaxCalc/inc-r32-TP.S new file mode 100644 index 0000000..34f98ff --- /dev/null +++ 
b/testcases/TaxCalc/inc-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR 
r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/inc-r32.S b/testcases/TaxCalc/inc-r32.S new file mode 100644 index 0000000..84f2a8c --- /dev/null +++ b/testcases/TaxCalc/inc-r32.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax 
+ INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/lea-r32_mem-TP.S b/testcases/TaxCalc/lea-r32_mem-TP.S new file mode 100644 index 0000000..9ab76b8 --- /dev/null +++ b/testcases/TaxCalc/lea-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, 
[rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/lea-r32_mem.S b/testcases/TaxCalc/lea-r32_mem.S new file mode 100644 index 0000000..0516e8d --- /dev/null +++ b/testcases/TaxCalc/lea-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl 
latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + 
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/lea-r64_mem-TP.S b/testcases/TaxCalc/lea-r64_mem-TP.S new file mode 100644 index 0000000..e31ca30 --- /dev/null +++ b/testcases/TaxCalc/lea-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR 
rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/lea-r64_mem.S b/testcases/TaxCalc/lea-r64_mem.S new file mode 100644 index 0000000..aad963e --- /dev/null +++ b/testcases/TaxCalc/lea-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + 
push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-mem_r32-TP.S b/testcases/TaxCalc/mov-mem_r32-TP.S 
new file mode 100644 index 0000000..18142e2 --- /dev/null +++ b/testcases/TaxCalc/mov-mem_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR 
[rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-mem_r32.S b/testcases/TaxCalc/mov-mem_r32.S new file mode 100644 index 0000000..427caf4 --- /dev/null +++ b/testcases/TaxCalc/mov-mem_r32.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 
+ xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-mem_r64-TP.S b/testcases/TaxCalc/mov-mem_r64-TP.S new file mode 100644 index 0000000..b4a7f6a --- 
/dev/null +++ b/testcases/TaxCalc/mov-mem_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR 
[rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-mem_r64.S b/testcases/TaxCalc/mov-mem_r64.S new file mode 100644 index 0000000..c1c6012 --- /dev/null +++ b/testcases/TaxCalc/mov-mem_r64.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 
+ xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-r32_mem-TP.S b/testcases/TaxCalc/mov-r32_mem-TP.S new file mode 100644 index 0000000..69c76ec --- /dev/null +++ 
b/testcases/TaxCalc/mov-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, 
[rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-r32_mem.S b/testcases/TaxCalc/mov-r32_mem.S new file mode 100644 index 0000000..e4e7313 --- /dev/null +++ b/testcases/TaxCalc/mov-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR 
eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-r32_r32-TP.S b/testcases/TaxCalc/mov-r32_r32-TP.S new file mode 100644 index 0000000..ce489c3 --- /dev/null +++ b/testcases/TaxCalc/mov-r32_r32-TP.S @@ -0,0 +1,207 @@ +#define INSTR mov +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + 
INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-r32_r32.S b/testcases/TaxCalc/mov-r32_r32.S new file mode 100644 index 0000000..71e767e --- /dev/null +++ b/testcases/TaxCalc/mov-r32_r32.S @@ -0,0 +1,207 @@ +#define INSTR mov +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, 
@function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + 
INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/mov-r64_mem-TP.S b/testcases/TaxCalc/mov-r64_mem-TP.S new file mode 100644 index 0000000..97984a3 --- /dev/null +++ b/testcases/TaxCalc/mov-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # 
logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/TaxCalc/mov-r64_mem.S b/testcases/TaxCalc/mov-r64_mem.S new file mode 100644 index 0000000..7095f31 --- /dev/null +++ b/testcases/TaxCalc/mov-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + 
INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movslq-r64_mem-TP.S b/testcases/TaxCalc/movslq-r64_mem-TP.S new file mode 100644 index 0000000..e4ba19f --- /dev/null +++ b/testcases/TaxCalc/movslq-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + 
xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movslq-r64_mem.S b/testcases/TaxCalc/movslq-r64_mem.S new file mode 100644 index 0000000..50c48ed --- /dev/null +++ b/testcases/TaxCalc/movslq-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 
32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] 
+ INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movslq-r64_r32-TP.S b/testcases/TaxCalc/movslq-r64_r32-TP.S new file mode 100644 index 0000000..9b12cc4 --- /dev/null +++ b/testcases/TaxCalc/movslq-r64_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx 
+ INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx + INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx + INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movslq-r64_r32.S b/testcases/TaxCalc/movslq-r64_r32.S new file mode 100644 index 0000000..bf6f2bd --- /dev/null +++ b/testcases/TaxCalc/movslq-r64_r32.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical 
right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movzbl-r32_r8-TP.S b/testcases/TaxCalc/movzbl-r32_r8-TP.S new file mode 100644 index 0000000..c7de3ab --- /dev/null +++ 
b/testcases/TaxCalc/movzbl-r32_r8-TP.S @@ -0,0 +1,207 @@ +#define INSTR movzbl +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + 
INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/movzbl-r32_r8.S b/testcases/TaxCalc/movzbl-r32_r8.S new file mode 100644 index 0000000..2f0ab2f --- /dev/null +++ b/testcases/TaxCalc/movzbl-r32_r8.S @@ -0,0 +1,207 @@ +#define INSTR movzbl +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al 
+ INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/neg-r32-TP.S b/testcases/TaxCalc/neg-r32-TP.S new file mode 100644 index 0000000..e60f4a2 --- /dev/null +++ b/testcases/TaxCalc/neg-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) 
+ vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/neg-r32.S b/testcases/TaxCalc/neg-r32.S new file mode 100644 index 0000000..c25e69c --- /dev/null +++ b/testcases/TaxCalc/neg-r32.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/sub-r32_r32-TP.S b/testcases/TaxCalc/sub-r32_r32-TP.S new file mode 100644 
index 0000000..2f45769 --- /dev/null +++ b/testcases/TaxCalc/sub-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR 
r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/sub-r32_r32.S b/testcases/TaxCalc/sub-r32_r32.S new file mode 100644 index 0000000..91a7610 --- /dev/null +++ b/testcases/TaxCalc/sub-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + 
INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/test-r32_r32-TP.S b/testcases/TaxCalc/test-r32_r32-TP.S new file mode 100644 index 0000000..5403390 --- /dev/null +++ b/testcases/TaxCalc/test-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all 
ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/TaxCalc/test-r32_r32.S b/testcases/TaxCalc/test-r32_r32.S new file mode 100644 index 0000000..8c7e48d --- /dev/null +++ b/testcases/TaxCalc/test-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + 
INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vaddpd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vaddpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..7bf13a5 --- /dev/null +++ b/testcases/TaxCalc/vaddpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR 
xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vaddpd-xmm_xmm_xmm.S b/testcases/TaxCalc/vaddpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..a4bf29b --- /dev/null +++ b/testcases/TaxCalc/vaddpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + 
INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vaddpd-ymm_ymm_ymm-TP.S b/testcases/TaxCalc/vaddpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..268aafe --- /dev/null +++ b/testcases/TaxCalc/vaddpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + 
INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vaddpd-ymm_ymm_ymm.S b/testcases/TaxCalc/vaddpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..0edbbbe --- /dev/null +++ b/testcases/TaxCalc/vaddpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 
(54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size 
latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_mem-TP.S b/testcases/TaxCalc/vaddsd-xmm_xmm_mem-TP.S similarity index 63% rename from testcases/vaddss-xmm_xmm_mem-TP.S rename to testcases/TaxCalc/vaddsd-xmm_xmm_mem-TP.S index 7b00116..902cf3a 100644 --- a/testcases/vaddss-xmm_xmm_mem-TP.S +++ b/testcases/TaxCalc/vaddsd-xmm_xmm_mem-TP.S @@ -1,5 +1,5 @@ -#define INSTR vaddss -#define NINST 32 +#define INSTR vaddsd +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm6, xmm2, [rip+PI] INSTR xmm7, xmm0, [rip+PI] INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] cmp i, N jl loop done: diff --git a/testcases/vaddss-xmm_xmm_mem.S b/testcases/TaxCalc/vaddsd-xmm_xmm_mem.S similarity index 64% rename from testcases/vaddss-xmm_xmm_mem.S rename to testcases/TaxCalc/vaddsd-xmm_xmm_mem.S index 5a7d8b1..8a4bc84 100644 --- a/testcases/vaddss-xmm_xmm_mem.S +++ b/testcases/TaxCalc/vaddsd-xmm_xmm_mem.S @@ -1,5 +1,5 @@ -#define INSTR vaddss -#define NINST 32 +#define INSTR vaddsd +#define NINST 64 #define N edi 
#define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm1, xmm0, [rip+PI] INSTR xmm0, xmm1, [rip+PI] INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] cmp i, N jl loop done: diff --git a/testcases/TaxCalc/vaddsd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vaddsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..274e201 --- /dev/null +++ b/testcases/TaxCalc/vaddsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 
+ # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vaddsd-xmm_xmm_xmm.S b/testcases/TaxCalc/vaddsd-xmm_xmm_xmm.S new 
file mode 100644 index 0000000..d071892 --- /dev/null +++ b/testcases/TaxCalc/vaddsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, 
xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovapd-xmm_xmm-TP.S b/testcases/TaxCalc/vmovapd-xmm_xmm-TP.S new file mode 100644 index 0000000..b6583d8 --- /dev/null +++ b/testcases/TaxCalc/vmovapd-xmm_xmm-TP.S @@ -0,0 +1,172 @@ +#define INSTR vmovapd +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + 
INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, 
xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovapd-xmm_xmm.S b/testcases/TaxCalc/vmovapd-xmm_xmm.S new file mode 100644 index 0000000..dcdb49f --- /dev/null +++ b/testcases/TaxCalc/vmovapd-xmm_xmm.S @@ -0,0 +1,172 @@ +#define INSTR vmovapd +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + 
INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at 
end of file diff --git a/testcases/TaxCalc/vmovapd-ymm_ymm-TP.S b/testcases/TaxCalc/vmovapd-ymm_ymm-TP.S new file mode 100644 index 0000000..dc26fe0 --- /dev/null +++ b/testcases/TaxCalc/vmovapd-ymm_ymm-TP.S @@ -0,0 +1,174 @@ +#define INSTR vmovapd +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + INSTR ymm15, ymm1 + INSTR ymm3, ymm2 + INSTR ymm4, ymm0 + INSTR ymm5, ymm1 + INSTR ymm6, ymm2 + INSTR ymm7, ymm0 + INSTR ymm8, ymm1 + INSTR ymm9, ymm2 + INSTR ymm10, ymm0 + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + INSTR ymm13, ymm0 + INSTR ymm14, ymm1 + INSTR ymm15, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR 
ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + INSTR ymm15, ymm1 + INSTR ymm3, ymm2 + INSTR ymm4, ymm0 + INSTR ymm5, ymm1 + INSTR ymm6, ymm2 + INSTR ymm7, ymm0 + INSTR ymm8, ymm1 + INSTR ymm9, ymm2 + INSTR ymm10, ymm0 + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + INSTR ymm13, ymm0 + INSTR ymm14, ymm1 + INSTR ymm15, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + INSTR ymm15, ymm1 + INSTR ymm3, ymm2 + INSTR ymm4, ymm0 + INSTR ymm5, ymm1 + INSTR ymm6, ymm2 + INSTR ymm7, ymm0 + INSTR ymm8, ymm1 + INSTR ymm9, ymm2 + INSTR ymm10, ymm0 + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + INSTR ymm13, ymm0 + INSTR ymm14, ymm1 + INSTR ymm15, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovapd-ymm_ymm.S b/testcases/TaxCalc/vmovapd-ymm_ymm.S new file mode 100644 index 0000000..05d0539 --- /dev/null +++ b/testcases/TaxCalc/vmovapd-ymm_ymm.S @@ -0,0 +1,174 @@ +#define INSTR vmovapd +#define 
NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, 
ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovaps-xmm_xmm-TP.S b/testcases/TaxCalc/vmovaps-xmm_xmm-TP.S new file mode 100644 index 0000000..af1fd01 --- /dev/null +++ b/testcases/TaxCalc/vmovaps-xmm_xmm-TP.S @@ -0,0 +1,172 @@ +#define INSTR vmovaps +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + 
INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovaps-xmm_xmm.S b/testcases/TaxCalc/vmovaps-xmm_xmm.S new file mode 100644 index 0000000..d743c98 --- /dev/null +++ b/testcases/TaxCalc/vmovaps-xmm_xmm.S @@ -0,0 +1,172 @@ +#define INSTR vmovaps +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical 
right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, 
xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovhpd-xmm_xmm_mem-TP.S b/testcases/TaxCalc/vmovhpd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..11cbaf0 --- /dev/null +++ b/testcases/TaxCalc/vmovhpd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, 
[rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovhpd-xmm_xmm_mem.S b/testcases/TaxCalc/vmovhpd-xmm_xmm_mem.S new file mode 100644 index 
0000000..b423e4a --- /dev/null +++ b/testcases/TaxCalc/vmovhpd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, 
[rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovq-r64_xmm-TP.S b/testcases/TaxCalc/vmovq-r64_xmm-TP.S new file mode 100644 index 0000000..b80c773 --- /dev/null +++ b/testcases/TaxCalc/vmovq-r64_xmm-TP.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + 
xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 + INSTR r11, xmm1 + INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 + INSTR r11, xmm1 + INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovq-r64_xmm.S b/testcases/TaxCalc/vmovq-r64_xmm.S new file mode 100644 index 0000000..029ebc3 --- /dev/null +++ b/testcases/TaxCalc/vmovq-r64_xmm.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST 
+.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 
+ INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovq-xmm_r64-TP.S b/testcases/TaxCalc/vmovq-xmm_r64-TP.S new file mode 100644 index 0000000..fc7da5a --- /dev/null +++ b/testcases/TaxCalc/vmovq-xmm_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm3, rbx 
+ INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + INSTR xmm15, rbx + INSTR xmm3, rcx + INSTR xmm4, rax + INSTR xmm5, rbx + INSTR xmm6, rcx + INSTR xmm7, rax + INSTR xmm8, rbx + INSTR xmm9, rcx + INSTR xmm10, rax + INSTR xmm11, rbx + INSTR xmm12, rcx + INSTR xmm13, rax + INSTR xmm14, rbx + INSTR xmm15, rcx + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm3, rbx + INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovq-xmm_r64.S b/testcases/TaxCalc/vmovq-xmm_r64.S new file mode 100644 index 0000000..6a89af7 --- /dev/null +++ b/testcases/TaxCalc/vmovq-xmm_r64.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right 
shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovsd-mem_xmm-TP.S b/testcases/TaxCalc/vmovsd-mem_xmm-TP.S new file mode 100644 index 
0000000..14a1cb6 --- /dev/null +++ b/testcases/TaxCalc/vmovsd-mem_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + 
INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovsd-mem_xmm.S b/testcases/TaxCalc/vmovsd-mem_xmm.S new file mode 100644 index 0000000..4f1bfbb --- /dev/null +++ b/testcases/TaxCalc/vmovsd-mem_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR 
[rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-xmm_mem-TP.S b/testcases/TaxCalc/vmovsd-xmm_mem-TP.S similarity index 63% rename from testcases/vmovss-xmm_mem-TP.S rename to testcases/TaxCalc/vmovsd-xmm_mem-TP.S index 74d3d83..74f7da2 100644 --- a/testcases/vmovss-xmm_mem-TP.S +++ b/testcases/TaxCalc/vmovsd-xmm_mem-TP.S @@ -1,5 +1,5 @@ -#define INSTR vmovss -#define NINST 32 +#define INSTR vmovsd +#define NINST 64 #define N edi #define i r8d @@ -60,6 +60,38 @@ loop: INSTR xmm6, [rip+PI] INSTR xmm7, [rip+PI] INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, 
[rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] cmp i, N jl loop done: diff --git a/testcases/vmovss-xmm_mem.S b/testcases/TaxCalc/vmovsd-xmm_mem.S similarity index 64% rename from testcases/vmovss-xmm_mem.S rename to testcases/TaxCalc/vmovsd-xmm_mem.S index f553695..6447ff8 100644 --- a/testcases/vmovss-xmm_mem.S +++ b/testcases/TaxCalc/vmovsd-xmm_mem.S @@ -1,5 +1,5 @@ -#define INSTR vmovss -#define NINST 32 +#define INSTR vmovsd +#define NINST 64 #define N edi #define i r8d @@ -60,6 +60,38 @@ loop: INSTR xmm0, [rip+PI] INSTR xmm0, [rip+PI] INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] cmp i, N jl loop done: diff --git a/testcases/TaxCalc/vmovsd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vmovsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..1c847dd --- /dev/null +++ b/testcases/TaxCalc/vmovsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR 
vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + 
INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovsd-xmm_xmm_xmm.S b/testcases/TaxCalc/vmovsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..d31c45a --- /dev/null +++ b/testcases/TaxCalc/vmovsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, 
xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovupd-xmm_mem-TP.S b/testcases/TaxCalc/vmovupd-xmm_mem-TP.S new file mode 100644 index 0000000..9c5d7a0 --- /dev/null +++ b/testcases/TaxCalc/vmovupd-xmm_mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type 
latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmovupd-xmm_mem.S 
b/testcases/TaxCalc/vmovupd-xmm_mem.S new file mode 100644 index 0000000..b5cc153 --- /dev/null +++ b/testcases/TaxCalc/vmovupd-xmm_mem.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, 
[rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulpd-ymm_ymm_mem-TP.S b/testcases/TaxCalc/vmulpd-ymm_ymm_mem-TP.S new file mode 100644 index 0000000..bdbd111 --- /dev/null +++ b/testcases/TaxCalc/vmulpd-ymm_ymm_mem-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR ymm15, ymm0, 
[rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + INSTR ymm15, ymm1, [rip+PI] + INSTR ymm3, ymm2, [rip+PI] + INSTR ymm4, ymm0, [rip+PI] + INSTR ymm5, ymm1, [rip+PI] + INSTR ymm6, ymm2, [rip+PI] + INSTR ymm7, ymm0, [rip+PI] + INSTR ymm8, ymm1, [rip+PI] + INSTR ymm9, ymm2, [rip+PI] + INSTR ymm10, ymm0, [rip+PI] + INSTR ymm11, ymm1, [rip+PI] + INSTR ymm12, ymm2, [rip+PI] + INSTR ymm13, ymm0, [rip+PI] + INSTR ymm14, ymm1, [rip+PI] + INSTR ymm15, ymm2, [rip+PI] + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR ymm15, ymm0, [rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulpd-ymm_ymm_mem.S b/testcases/TaxCalc/vmulpd-ymm_ymm_mem.S new file mode 100644 index 0000000..3193575 --- /dev/null +++ b/testcases/TaxCalc/vmulpd-ymm_ymm_mem.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: 
+.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR 
ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulpd-ymm_ymm_ymm-TP.S b/testcases/TaxCalc/vmulpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..029acd9 --- /dev/null +++ b/testcases/TaxCalc/vmulpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + 
INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulpd-ymm_ymm_ymm.S b/testcases/TaxCalc/vmulpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..830c26d --- /dev/null +++ b/testcases/TaxCalc/vmulpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST 
+.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR 
ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulsd-xmm_xmm_mem-TP.S b/testcases/TaxCalc/vmulsd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..5a0359f --- /dev/null +++ b/testcases/TaxCalc/vmulsd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, 
[rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulsd-xmm_xmm_mem.S b/testcases/TaxCalc/vmulsd-xmm_xmm_mem.S new file mode 100644 index 0000000..4b70252 --- /dev/null +++ b/testcases/TaxCalc/vmulsd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, 
xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulsd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vmulsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..c2dc870 --- /dev/null +++ b/testcases/TaxCalc/vmulsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR 
xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vmulsd-xmm_xmm_xmm.S b/testcases/TaxCalc/vmulsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..97d4bac --- /dev/null +++ b/testcases/TaxCalc/vmulsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + 
INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vsubpd-ymm_ymm_ymm-TP.S b/testcases/TaxCalc/vsubpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..2eca166 --- /dev/null +++ b/testcases/TaxCalc/vsubpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + 
INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vsubpd-ymm_ymm_ymm.S b/testcases/TaxCalc/vsubpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..96d3fe9 --- /dev/null +++ b/testcases/TaxCalc/vsubpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 
(54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size 
latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vsubsd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vsubsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..ceb9507 --- /dev/null +++ b/testcases/TaxCalc/vsubsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 
+ INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vsubsd-xmm_xmm_xmm.S b/testcases/TaxCalc/vsubsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..b7429a4 --- /dev/null +++ b/testcases/TaxCalc/vsubsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, 
xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..1d99838 --- /dev/null +++ b/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax 
noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + 
INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm.S b/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..8807655 --- /dev/null +++ b/testcases/TaxCalc/vunpckhpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR 
xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vxorpd-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vxorpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..a7a81f7 --- /dev/null +++ b/testcases/TaxCalc/vxorpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,172 @@ +#define INSTR vxorpd +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, 
rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + 
INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vxorpd-xmm_xmm_xmm.S b/testcases/TaxCalc/vxorpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..4c56abc --- /dev/null +++ b/testcases/TaxCalc/vxorpd-xmm_xmm_xmm.S @@ -0,0 
+1,172 @@ +#define INSTR vxorpd +#define NINST 128 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, 
xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, 
xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vxorpd-ymm_ymm_ymm-TP.S b/testcases/TaxCalc/vxorpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..3a7e7fe --- /dev/null +++ b/testcases/TaxCalc/vxorpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, 
ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vxorpd-ymm_ymm_ymm.S b/testcases/TaxCalc/vxorpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..8ab0f92 --- /dev/null +++ b/testcases/TaxCalc/vxorpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa 
bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/TaxCalc/vxorps-xmm_xmm_xmm-TP.S b/testcases/TaxCalc/vxorps-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..77475af --- /dev/null +++ b/testcases/TaxCalc/vxorps-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vxorps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + 
INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/vxorps-xmm_xmm_xmm.S b/testcases/TaxCalc/vxorps-xmm_xmm_xmm.S new file mode 100644 index 0000000..f1a1a8c --- /dev/null +++ b/testcases/TaxCalc/vxorps-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vxorps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, 
xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/xor-r32_r32-TP.S b/testcases/TaxCalc/xor-r32_r32-TP.S new file mode 100644 index 0000000..bf5757b --- /dev/null +++ b/testcases/TaxCalc/xor-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR xor +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, 
eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/TaxCalc/xor-r32_r32.S b/testcases/TaxCalc/xor-r32_r32.S new file mode 100644 index 0000000..652a935 --- /dev/null +++ b/testcases/TaxCalc/xor-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR xor +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + 
INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r32_imd-TP.S b/testcases/add-r32_imd-TP.S new file mode 100644 index 0000000..26cfee8 --- /dev/null +++ b/testcases/add-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax 
+ xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r32_imd.S b/testcases/add-r32_imd.S new file mode 100644 index 0000000..3e57ea7 --- /dev/null +++ b/testcases/add-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r32_mem-TP.S b/testcases/add-r32_mem-TP.S new file mode 100644 index 0000000..64fc02f --- /dev/null +++ b/testcases/add-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N 
edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + 
INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r32_mem.S b/testcases/add-r32_mem.S new file mode 100644 index 0000000..7c94bcc --- /dev/null +++ b/testcases/add-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + 
INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r64_imd-TP.S b/testcases/add-r64_imd-TP.S new file mode 100644 index 0000000..b4e6897 --- /dev/null +++ b/testcases/add-r64_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, 
@function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r64_imd.S b/testcases/add-r64_imd.S new file mode 100644 index 0000000..79aabb2 --- /dev/null +++ b/testcases/add-r64_imd.S @@ -0,0 +1,134 @@ +#define INSTR add 
+#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop 
r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r64_r64-TP.S b/testcases/add-r64_r64-TP.S new file mode 100644 index 0000000..d475743 --- /dev/null +++ b/testcases/add-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + 
INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-r64_r64.S b/testcases/add-r64_r64.S new file mode 100644 index 0000000..a64dc7c --- /dev/null +++ b/testcases/add-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR add +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 
+ # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/addl-mem_imd-TP.S b/testcases/addl-mem_imd-TP.S new file mode 100644 index 0000000..3987eb3 --- /dev/null +++ b/testcases/addl-mem_imd-TP.S @@ -0,0 +1,101 @@ +#define INSTR addl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, 
@function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/addl-mem_imd.S b/testcases/addl-mem_imd.S new file mode 100644 index 0000000..4693ece --- /dev/null +++ b/testcases/addl-mem_imd.S @@ -0,0 +1,101 @@ +#define INSTR addl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax 
noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ 
No newline at end of file diff --git a/testcases/and-r32_imd-TP.S b/testcases/and-r32_imd-TP.S new file mode 100644 index 0000000..0d48a75 --- /dev/null +++ b/testcases/and-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR and +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR 
r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/and-r32_imd.S b/testcases/and-r32_imd.S new file mode 100644 index 0000000..99deb6a --- /dev/null +++ b/testcases/and-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR and +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 
+ INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_imd-TP.S b/testcases/cmp-r32_imd-TP.S new file mode 100644 index 0000000..34d0509 --- /dev/null +++ b/testcases/cmp-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 
+ INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_imd.S b/testcases/cmp-r32_imd.S new file mode 100644 index 0000000..5f412ad --- /dev/null +++ b/testcases/cmp-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 
11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_mem-TP.S b/testcases/cmp-r32_mem-TP.S new file mode 100644 index 0000000..88baf8d --- /dev/null +++ b/testcases/cmp-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 
0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, 
[rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_mem.S b/testcases/cmp-r32_mem.S new file mode 100644 index 0000000..12b88d1 --- /dev/null +++ b/testcases/cmp-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, 
[rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_r32-TP.S b/testcases/cmp-r32_r32-TP.S new file mode 100644 index 0000000..c359fe8 --- /dev/null +++ b/testcases/cmp-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, 
xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r32_r32.S 
b/testcases/cmp-r32_r32.S new file mode 100644 index 0000000..99b4b20 --- /dev/null +++ b/testcases/cmp-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + 
INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r64_imd-TP.S b/testcases/cmp-r64_imd-TP.S new file mode 100644 index 0000000..9c76e1b --- /dev/null +++ b/testcases/cmp-r64_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 
+ INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + INSTR rdx, 2 + INSTR r9, 13 + INSTR r10, 1 + INSTR r11, 2 + INSTR r12, 13 + INSTR r13, 1 + INSTR r14, 2 + INSTR r15, 13 + INSTR rdx, 1 + INSTR r9, 2 + INSTR r10, 13 + INSTR r11, 1 + INSTR r12, 2 + INSTR r13, 13 + INSTR r14, 1 + INSTR r15, 2 + INSTR rdx, 13 + INSTR r9, 1 + INSTR r10, 2 + INSTR r11, 13 + INSTR r12, 1 + INSTR r13, 2 + INSTR r14, 13 + INSTR r15, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r64_imd.S b/testcases/cmp-r64_imd.S new file mode 100644 index 0000000..54e5a3c --- /dev/null +++ b/testcases/cmp-r64_imd.S @@ -0,0 +1,134 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, 
rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + INSTR rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r64_r64-TP.S b/testcases/cmp-r64_r64-TP.S new file mode 100644 index 0000000..45eaa89 --- /dev/null +++ b/testcases/cmp-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + 
vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-r64_r64.S 
b/testcases/cmp-r64_r64.S new file mode 100644 index 0000000..1e80d5d --- /dev/null +++ b/testcases/cmp-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR cmp +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + 
INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/dec-r32-TP.S b/testcases/dec-r32-TP.S new file mode 100644 index 0000000..f886ad1 --- /dev/null +++ b/testcases/dec-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + 
INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/dec-r32.S b/testcases/dec-r32.S new file mode 100644 index 0000000..7c18fd9 --- /dev/null +++ b/testcases/dec-r32.S @@ -0,0 +1,143 @@ +#define INSTR dec +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 
+ xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/imul-r64_r64_imd-TP.S b/testcases/imul-r64_r64_imd-TP.S new file mode 100644 index 0000000..ba6292c --- /dev/null +++ b/testcases/imul-r64_r64_imd-TP.S @@ -0,0 +1,143 @@ +#define INSTR imul +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # 
logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + INSTR rdx, rbx, 2 + INSTR r9, rcx, 13 + INSTR r10, rax, 1 + INSTR r11, rbx, 2 + INSTR r12, rcx, 13 + INSTR r13, rax, 1 + INSTR r14, rbx, 2 + INSTR r15, rcx, 13 + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + INSTR rdx, rbx, 2 + INSTR r9, rcx, 13 + INSTR r10, rax, 1 + INSTR r11, rbx, 2 + INSTR r12, rcx, 13 + INSTR r13, rax, 1 + INSTR r14, rbx, 2 + INSTR r15, rcx, 13 + INSTR rdx, rax, 1 + INSTR r9, rbx, 2 + INSTR r10, rcx, 13 + INSTR r11, rax, 1 + INSTR r12, rbx, 2 + INSTR r13, rcx, 13 + INSTR r14, rax, 1 + INSTR r15, rbx, 2 + INSTR rdx, rcx, 13 + INSTR r9, rax, 1 + INSTR r10, rbx, 2 + INSTR r11, rcx, 13 + INSTR r12, rax, 1 + INSTR r13, rbx, 2 + INSTR r14, rcx, 13 + INSTR r15, rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + 
pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/imul-r64_r64_imd.S b/testcases/imul-r64_r64_imd.S new file mode 100644 index 0000000..f0fac52 --- /dev/null +++ b/testcases/imul-r64_r64_imd.S @@ -0,0 +1,143 @@ +#define INSTR imul +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR 
rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + INSTR rax, rbx, 1 + INSTR rbx, rax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r32-TP.S b/testcases/inc-r32-TP.S new file mode 100644 index 0000000..34f98ff --- /dev/null +++ b/testcases/inc-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + 
xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r32.S b/testcases/inc-r32.S new file mode 100644 index 0000000..84f2a8c --- /dev/null +++ b/testcases/inc-r32.S @@ -0,0 +1,143 @@ +#define INSTR inc +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, 
xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r64-TP.S b/testcases/inc-r64-TP.S index 62b649d..a9273f0 100644 --- a/testcases/inc-r64-TP.S +++ b/testcases/inc-r64-TP.S @@ -1,5 +1,5 @@ #define INSTR inc -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -91,6 +91,38 @@ loop: INSTR r13 INSTR r14 INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + 
INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 cmp i, N jl loop pop r15 diff --git a/testcases/inc-r64.S b/testcases/inc-r64.S index 095248a..1c15147 100644 --- a/testcases/inc-r64.S +++ b/testcases/inc-r64.S @@ -1,5 +1,5 @@ #define INSTR inc -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -91,6 +91,38 @@ loop: INSTR rax INSTR rax INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax cmp i, N jl loop pop r15 diff --git a/testcases/lea-r32_mem-TP.S b/testcases/lea-r32_mem-TP.S index c5becb3..9ab76b8 100644 --- a/testcases/lea-r32_mem-TP.S +++ b/testcases/lea-r32_mem-TP.S @@ -1,5 +1,5 @@ #define INSTR lea -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -82,6 +82,38 @@ loop: INSTR r13d, [rip+PI] INSTR r14d, [rip+PI] INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, 
[rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] cmp i, N jl loop pop r15 diff --git a/testcases/lea-r32_mem.S b/testcases/lea-r32_mem.S index 1a7bf5e..0516e8d 100644 --- a/testcases/lea-r32_mem.S +++ b/testcases/lea-r32_mem.S @@ -1,5 +1,5 @@ #define INSTR lea -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -82,6 +82,38 @@ loop: INSTR eax, [rip+PI] INSTR eax, [rip+PI] INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] cmp i, N jl loop pop r15 diff --git a/testcases/lea-r64_mem-TP.S b/testcases/lea-r64_mem-TP.S new file mode 100644 index 0000000..e31ca30 --- /dev/null +++ b/testcases/lea-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right 
shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/lea-r64_mem.S b/testcases/lea-r64_mem.S new file mode 100644 index 0000000..aad963e --- /dev/null +++ b/testcases/lea-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR lea +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + 
INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-mem_r32-TP.S b/testcases/mov-mem_r32-TP.S new file mode 100644 index 0000000..18142e2 --- /dev/null +++ b/testcases/mov-mem_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq 
rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + INSTR [rip+PI], ebx + INSTR [rip+PI], ecx + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-mem_r32.S b/testcases/mov-mem_r32.S new file mode 100644 index 0000000..427caf4 --- /dev/null +++ b/testcases/mov-mem_r32.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + 
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax 
+ INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + INSTR [rip+PI], eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-mem_r64-TP.S b/testcases/mov-mem_r64-TP.S new file mode 100644 index 0000000..b4a7f6a --- /dev/null +++ b/testcases/mov-mem_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + 
vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + INSTR [rip+PI], rbx + INSTR [rip+PI], rcx + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-mem_r64.S b/testcases/mov-mem_r64.S new file mode 100644 index 0000000..c1c6012 --- /dev/null +++ b/testcases/mov-mem_r64.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax 
+ INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + INSTR [rip+PI], rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_imd-TP.S b/testcases/mov-r32_imd-TP.S new file mode 100644 index 0000000..d81bbac --- /dev/null +++ b/testcases/mov-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR 
r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_imd.S b/testcases/mov-r32_imd.S new file mode 100644 index 0000000..b1f4bda --- /dev/null +++ b/testcases/mov-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + 
push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_mem-TP.S b/testcases/mov-r32_mem-TP.S new file mode 100644 index 0000000..69c76ec --- /dev/null +++ b/testcases/mov-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] 
+ INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_mem.S b/testcases/mov-r32_mem.S new file mode 100644 index 0000000..e4e7313 --- /dev/null +++ b/testcases/mov-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, 
[rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_r32-TP.S b/testcases/mov-r32_r32-TP.S new file mode 100644 index 0000000..5bdcdf6 --- /dev/null +++ b/testcases/mov-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push 
r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r32_r32.S b/testcases/mov-r32_r32.S new file mode 100644 index 0000000..d896ce7 --- /dev/null +++ b/testcases/mov-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + 
+.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + 
INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r64_mem-TP.S b/testcases/mov-r64_mem-TP.S new file mode 100644 index 0000000..97984a3 --- /dev/null +++ b/testcases/mov-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, 
[rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/mov-r64_mem.S b/testcases/mov-r64_mem.S new file mode 100644 index 0000000..7095f31 --- /dev/null +++ b/testcases/mov-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR mov +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # 
logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff 
--git a/testcases/movl-mem_imd-TP.S b/testcases/movl-mem_imd-TP.S new file mode 100644 index 0000000..2be91ea --- /dev/null +++ b/testcases/movl-mem_imd-TP.S @@ -0,0 +1,101 @@ +#define INSTR movl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR 
[rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + INSTR [rip+PI], 2 + INSTR [rip+PI], 13 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movl-mem_imd.S b/testcases/movl-mem_imd.S new file mode 100644 index 0000000..acf1961 --- /dev/null +++ b/testcases/movl-mem_imd.S @@ -0,0 +1,101 @@ +#define INSTR movl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + 
INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + INSTR [rip+PI], 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movsbl-r32_mem-TP.S b/testcases/movsbl-r32_mem-TP.S new file mode 100644 index 0000000..68c4967 --- /dev/null +++ b/testcases/movsbl-r32_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR movsbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, 
[rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + INSTR edx, [rip+PI] + INSTR r9d, [rip+PI] + INSTR r10d, [rip+PI] + INSTR r11d, [rip+PI] + INSTR r12d, [rip+PI] + INSTR r13d, [rip+PI] + INSTR r14d, [rip+PI] + INSTR r15d, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movsbl-r32_mem.S b/testcases/movsbl-r32_mem.S new file mode 100644 index 0000000..d24ac60 --- /dev/null +++ b/testcases/movsbl-r32_mem.S @@ -0,0 +1,134 @@ +#define INSTR movsbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text 
+.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + INSTR eax, 
[rip+PI] + INSTR eax, [rip+PI] + INSTR eax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-r64_mem-TP.S b/testcases/movslq-r64_mem-TP.S new file mode 100644 index 0000000..e4ba19f --- /dev/null +++ b/testcases/movslq-r64_mem-TP.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR 
rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + INSTR rdx, [rip+PI] + INSTR r9, [rip+PI] + INSTR r10, [rip+PI] + INSTR r11, [rip+PI] + INSTR r12, [rip+PI] + INSTR r13, [rip+PI] + INSTR r14, [rip+PI] + INSTR r15, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-r64_mem.S b/testcases/movslq-r64_mem.S new file mode 100644 index 0000000..50c48ed --- /dev/null +++ b/testcases/movslq-r64_mem.S @@ -0,0 +1,134 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + 
push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + INSTR rax, [rip+PI] + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-r64_r32-TP.S b/testcases/movslq-r64_r32-TP.S new file mode 100644 
index 0000000..9b12cc4 --- /dev/null +++ b/testcases/movslq-r64_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx + INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rdx, ebx + INSTR r9, ecx + INSTR r10, eax + INSTR r11, ebx + INSTR r12, ecx 
+ INSTR r13, eax + INSTR r14, ebx + INSTR r15, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-r64_r32.S b/testcases/movslq-r64_r32.S new file mode 100644 index 0000000..bf6f2bd --- /dev/null +++ b/testcases/movslq-r64_r32.S @@ -0,0 +1,143 @@ +#define INSTR movslq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR 
rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-r32_r8-TP.S b/testcases/movzbl-r32_r8-TP.S new file mode 100644 index 0000000..95020b8 --- /dev/null +++ b/testcases/movzbl-r32_r8-TP.S @@ -0,0 +1,143 @@ +#define INSTR movzbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, 
xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR edx, bl + INSTR r9d, cl + INSTR r10d, al + INSTR r11d, bl + INSTR r12d, cl + INSTR r13d, al + INSTR r14d, bl + INSTR r15d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-r32_r8.S b/testcases/movzbl-r32_r8.S new file mode 100644 index 0000000..d67a693 --- /dev/null +++ b/testcases/movzbl-r32_r8.S @@ -0,0 +1,143 @@ +#define 
INSTR movzbl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR 
eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r32-TP.S b/testcases/neg-r32-TP.S new file mode 100644 index 0000000..e60f4a2 --- /dev/null +++ b/testcases/neg-r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + 
INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r32.S b/testcases/neg-r32.S new file mode 100644 index 0000000..c25e69c --- /dev/null +++ b/testcases/neg-r32.S @@ -0,0 +1,143 @@ +#define INSTR neg +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + 
movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/nopl-mem-TP.S b/testcases/nopl-mem-TP.S new file mode 100644 index 0000000..9519631 --- /dev/null +++ b/testcases/nopl-mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR nopl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR 
[rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/nopl-mem.S b/testcases/nopl-mem.S new file mode 100644 index 0000000..9519631 --- /dev/null +++ b/testcases/nopl-mem.S @@ -0,0 +1,101 @@ +#define INSTR nopl +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + 
inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/nopw-mem-TP.S b/testcases/nopw-mem-TP.S new file mode 100644 index 0000000..36786a2 --- /dev/null +++ b/testcases/nopw-mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR nopw +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq 
xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/nopw-mem.S b/testcases/nopw-mem.S new file mode 100644 index 0000000..36786a2 --- /dev/null +++ b/testcases/nopw-mem.S @@ -0,0 +1,101 @@ +#define INSTR nopw +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + 
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + INSTR [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r64-TP.S b/testcases/pop-r64-TP.S new file mode 100644 index 0000000..29c5434 --- /dev/null +++ b/testcases/pop-r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR pop +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, 
i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r64.S b/testcases/pop-r64.S new file mode 100644 index 0000000..c1f6ccf --- /dev/null +++ b/testcases/pop-r64.S @@ -0,0 +1,143 @@ +#define INSTR pop +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/pushq-imd-TP.S b/testcases/pushq-imd-TP.S new file mode 100644 index 0000000..ec56071 --- /dev/null +++ b/testcases/pushq-imd-TP.S @@ -0,0 +1,101 @@ +#define INSTR pushq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR 22 + INSTR 8 + INSTR 78 + INSTR 159 + INSTR 222 + INSTR 3 + INSTR 9 + INSTR 5 + INSTR 55 + INSTR 173 + INSTR 317 + INSTR 254 + INSTR 255 + INSTR 22 + INSTR 8 + INSTR 78 + INSTR 159 + INSTR 222 + INSTR 3 + INSTR 9 + INSTR 5 + INSTR 55 + INSTR 173 + INSTR 317 + INSTR 254 + INSTR 255 + INSTR 22 + INSTR 8 + INSTR 78 + INSTR 159 + INSTR 222 + INSTR 3 + INSTR 9 + INSTR 5 + INSTR 55 + INSTR 173 + INSTR 317 + INSTR 254 + INSTR 255 + INSTR 22 + INSTR 8 + INSTR 78 + INSTR 159 + INSTR 222 + INSTR 3 + INSTR 9 + INSTR 5 + INSTR 55 + INSTR 173 + INSTR 317 + INSTR 254 + INSTR 255 + INSTR 22 + INSTR 8 + INSTR 78 + INSTR 159 + INSTR 222 + INSTR 3 + INSTR 9 + INSTR 5 + INSTR 55 + INSTR 173 + INSTR 317 + INSTR 254 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/pushq-imd.S b/testcases/pushq-imd.S new file mode 100644 index 0000000..433b6d2 --- /dev/null +++ b/testcases/pushq-imd.S @@ -0,0 +1,101 @@ +#define INSTR pushq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: 
+.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + INSTR 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/shr-r32_imd-TP.S b/testcases/shr-r32_imd-TP.S new file mode 100644 index 0000000..547f8af --- /dev/null +++ b/testcases/shr-r32_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR shr +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle 
done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + INSTR edx, 2 + INSTR r9d, 13 + INSTR r10d, 1 + INSTR r11d, 2 + INSTR r12d, 13 + INSTR r13d, 1 + INSTR r14d, 2 + INSTR r15d, 13 + INSTR edx, 1 + INSTR r9d, 2 + INSTR r10d, 13 + INSTR r11d, 1 + INSTR r12d, 2 + INSTR r13d, 13 + INSTR r14d, 1 + INSTR r15d, 2 + INSTR edx, 13 + INSTR r9d, 1 + INSTR r10d, 2 + INSTR r11d, 13 + INSTR r12d, 1 + INSTR r13d, 2 + INSTR r14d, 13 + INSTR r15d, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/shr-r32_imd.S b/testcases/shr-r32_imd.S new file mode 100644 index 0000000..0d62a94 --- /dev/null +++ b/testcases/shr-r32_imd.S @@ -0,0 +1,134 @@ +#define INSTR shr +#define NINST 64 +#define N edi 
+#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + INSTR eax, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + 
pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-r32_r32-TP.S b/testcases/sub-r32_r32-TP.S new file mode 100644 index 0000000..2f45769 --- /dev/null +++ b/testcases/sub-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR 
r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-r32_r32.S b/testcases/sub-r32_r32.S new file mode 100644 index 0000000..91a7610 --- /dev/null +++ b/testcases/sub-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq 
rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-r64_r64-TP.S b/testcases/sub-r64_r64-TP.S new file mode 100644 index 0000000..6cdf17b --- /dev/null +++ b/testcases/sub-r64_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency 
+.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rdx, rbx + INSTR r9, rcx + INSTR r10, rax + INSTR r11, rbx + INSTR r12, rcx + INSTR r13, rax + INSTR r14, rbx + INSTR r15, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov 
rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-r64_r64.S b/testcases/sub-r64_r64.S new file mode 100644 index 0000000..cee91a9 --- /dev/null +++ b/testcases/sub-r64_r64.S @@ -0,0 +1,143 @@ +#define INSTR sub +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR 
rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-r32_r32-TP.S b/testcases/test-r32_r32-TP.S new file mode 100644 index 0000000..5403390 --- /dev/null +++ b/testcases/test-r32_r32-TP.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, 
rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-r32_r32.S b/testcases/test-r32_r32.S new file mode 100644 index 0000000..8c7e48d --- /dev/null +++ b/testcases/test-r32_r32.S @@ -0,0 +1,143 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp 
+ mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ 
No newline at end of file diff --git a/testcases/test-r8_imd-TP.S b/testcases/test-r8_imd-TP.S new file mode 100644 index 0000000..b2650fa --- /dev/null +++ b/testcases/test-r8_imd-TP.S @@ -0,0 +1,134 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR dl, 1 + INSTR r9l, 2 + INSTR r10l, 13 + INSTR r11l, 1 + INSTR r12l, 2 + INSTR r13l, 13 + INSTR r14l, 1 + INSTR r15l, 2 + INSTR dl, 13 + INSTR r9l, 1 + INSTR r10l, 2 + INSTR r11l, 13 + INSTR r12l, 1 + INSTR r13l, 2 + INSTR r14l, 13 + INSTR r15l, 1 + INSTR dl, 2 + INSTR r9l, 13 + INSTR r10l, 1 + INSTR r11l, 2 + INSTR r12l, 13 + INSTR r13l, 1 + INSTR r14l, 2 + INSTR r15l, 13 + INSTR dl, 1 + INSTR r9l, 2 + INSTR r10l, 13 + INSTR r11l, 1 + INSTR r12l, 2 + INSTR r13l, 13 + INSTR r14l, 1 + INSTR r15l, 2 + INSTR dl, 13 + INSTR r9l, 1 + INSTR r10l, 2 + INSTR r11l, 13 + INSTR r12l, 1 + INSTR r13l, 2 + INSTR r14l, 13 + INSTR r15l, 1 + INSTR dl, 2 + INSTR r9l, 13 + INSTR r10l, 1 + INSTR r11l, 2 + INSTR r12l, 13 + INSTR r13l, 1 + INSTR r14l, 2 + INSTR r15l, 13 + INSTR dl, 1 + INSTR r9l, 2 
+ INSTR r10l, 13 + INSTR r11l, 1 + INSTR r12l, 2 + INSTR r13l, 13 + INSTR r14l, 1 + INSTR r15l, 2 + INSTR dl, 13 + INSTR r9l, 1 + INSTR r10l, 2 + INSTR r11l, 13 + INSTR r12l, 1 + INSTR r13l, 2 + INSTR r14l, 13 + INSTR r15l, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-r8_imd.S b/testcases/test-r8_imd.S new file mode 100644 index 0000000..1abce37 --- /dev/null +++ b/testcases/test-r8_imd.S @@ -0,0 +1,134 @@ +#define INSTR test +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 +loop: + inc i + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + 
INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + INSTR al, 1 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmm_xmm_xmm-TP.S b/testcases/vaddpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..7bf13a5 --- /dev/null +++ b/testcases/vaddpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, 
xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmm_xmm_xmm.S b/testcases/vaddpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..a4bf29b --- /dev/null +++ b/testcases/vaddpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 
0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, 
xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-ymm_ymm_ymm-TP.S b/testcases/vaddpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..268aafe --- /dev/null +++ b/testcases/vaddpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, 
ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-ymm_ymm_ymm.S b/testcases/vaddpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..0edbbbe --- /dev/null +++ b/testcases/vaddpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vaddpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) 
+ vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency 
\ No newline at end of file diff --git a/testcases/vaddsd-xmm_xmm_mem-TP.S b/testcases/vaddsd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..902cf3a --- /dev/null +++ b/testcases/vaddsd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, 
[rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmm_xmm_mem.S b/testcases/vaddsd-xmm_xmm_mem.S new file mode 100644 index 0000000..8a4bc84 --- /dev/null +++ b/testcases/vaddsd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy 
DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, 
[rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmm_xmm_xmm-TP.S b/testcases/vaddsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..274e201 --- /dev/null +++ b/testcases/vaddsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 
+ INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmm_xmm_xmm.S b/testcases/vaddsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..d071892 --- /dev/null +++ b/testcases/vaddsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vaddsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, 
xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddss-xmm_xmm_xmm-TP.S b/testcases/vaddss-xmm_xmm_xmm-TP.S index 00ffeb6..af113a5 100644 --- a/testcases/vaddss-xmm_xmm_xmm-TP.S +++ b/testcases/vaddss-xmm_xmm_xmm-TP.S @@ -1,5 +1,5 @@ #define INSTR vaddss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ 
-67,6 +67,38 @@ loop: INSTR xmm6, xmm2, xmm2 INSTR xmm7, xmm0, xmm0 INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 cmp i, N jl loop done: diff --git a/testcases/vaddss-xmm_xmm_xmm.S b/testcases/vaddss-xmm_xmm_xmm.S index 550fc3e..2ac1630 100644 --- a/testcases/vaddss-xmm_xmm_xmm.S +++ b/testcases/vaddss-xmm_xmm_xmm.S @@ -1,5 +1,5 @@ #define INSTR vaddss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm1, xmm0, xmm0 INSTR xmm0, xmm1, xmm0 INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, 
xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 cmp i, N jl loop done: diff --git a/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S b/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S index dd8bece..981f0de 100644 --- a/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S +++ b/testcases/vcvtsi2ss-xmm_xmm_r32-TP.S @@ -1,5 +1,5 @@ #define INSTR vcvtsi2ss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -89,6 +89,38 @@ loop: INSTR xmm6, xmm2, ecx INSTR xmm7, xmm0, eax INSTR xmm8, xmm1, ebx + INSTR xmm9, xmm2, ecx + INSTR xmm10, xmm0, eax + INSTR xmm11, xmm1, ebx + INSTR xmm12, xmm2, ecx + INSTR xmm13, xmm0, eax + INSTR xmm14, xmm1, ebx + INSTR xmm15, xmm2, ecx + INSTR xmm3, xmm0, eax + INSTR xmm4, xmm1, ebx + INSTR xmm5, xmm2, ecx + INSTR xmm6, xmm0, eax + INSTR xmm7, xmm1, ebx + INSTR xmm8, xmm2, ecx + INSTR xmm9, xmm0, eax + INSTR xmm10, xmm1, ebx + INSTR xmm11, xmm2, ecx + INSTR xmm12, xmm0, eax + INSTR xmm13, xmm1, ebx + INSTR xmm14, xmm2, ecx + INSTR xmm15, xmm0, eax + INSTR xmm3, xmm1, ebx + INSTR xmm4, xmm2, ecx + INSTR xmm5, xmm0, eax + INSTR xmm6, xmm1, ebx + INSTR xmm7, xmm2, ecx + INSTR xmm8, xmm0, eax + INSTR xmm9, xmm1, ebx + INSTR xmm10, xmm2, ecx + INSTR xmm11, xmm0, eax + INSTR xmm12, xmm1, ebx + INSTR xmm13, xmm2, ecx + INSTR xmm14, xmm0, eax cmp i, N jl loop pop r15 diff --git a/testcases/vcvtsi2ss-xmm_xmm_r32.S b/testcases/vcvtsi2ss-xmm_xmm_r32.S index 862f951..e2bdd56 100644 --- a/testcases/vcvtsi2ss-xmm_xmm_r32.S +++ b/testcases/vcvtsi2ss-xmm_xmm_r32.S @@ -1,5 +1,5 @@ #define INSTR vcvtsi2ss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -89,6 +89,38 @@ loop: INSTR xmm1, xmm0, eax INSTR xmm0, xmm1, eax INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + 
INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax + INSTR xmm0, xmm1, eax + INSTR xmm1, xmm0, eax cmp i, N jl loop pop r15 diff --git a/testcases/vextractf128-xmm_ymm_imd-TP.S b/testcases/vextractf128-xmm_ymm_imd-TP.S new file mode 100644 index 0000000..6586196 --- /dev/null +++ b/testcases/vextractf128-xmm_ymm_imd-TP.S @@ -0,0 +1,110 @@ +#define INSTR vextractf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR xmm3, ymm0, 1 + INSTR xmm4, ymm1, 2 + INSTR xmm5, ymm2, 13 + INSTR xmm6, ymm0, 1 + INSTR xmm7, ymm1, 2 + INSTR xmm8, ymm2, 13 + INSTR xmm9, ymm0, 1 + INSTR xmm10, ymm1, 2 + INSTR xmm11, ymm2, 13 + INSTR xmm12, ymm0, 1 + INSTR xmm13, ymm1, 2 + INSTR xmm14, ymm2, 13 + INSTR xmm15, ymm0, 1 + INSTR xmm3, ymm1, 2 + 
INSTR xmm4, ymm2, 13 + INSTR xmm5, ymm0, 1 + INSTR xmm6, ymm1, 2 + INSTR xmm7, ymm2, 13 + INSTR xmm8, ymm0, 1 + INSTR xmm9, ymm1, 2 + INSTR xmm10, ymm2, 13 + INSTR xmm11, ymm0, 1 + INSTR xmm12, ymm1, 2 + INSTR xmm13, ymm2, 13 + INSTR xmm14, ymm0, 1 + INSTR xmm15, ymm1, 2 + INSTR xmm3, ymm2, 13 + INSTR xmm4, ymm0, 1 + INSTR xmm5, ymm1, 2 + INSTR xmm6, ymm2, 13 + INSTR xmm7, ymm0, 1 + INSTR xmm8, ymm1, 2 + INSTR xmm9, ymm2, 13 + INSTR xmm10, ymm0, 1 + INSTR xmm11, ymm1, 2 + INSTR xmm12, ymm2, 13 + INSTR xmm13, ymm0, 1 + INSTR xmm14, ymm1, 2 + INSTR xmm15, ymm2, 13 + INSTR xmm3, ymm0, 1 + INSTR xmm4, ymm1, 2 + INSTR xmm5, ymm2, 13 + INSTR xmm6, ymm0, 1 + INSTR xmm7, ymm1, 2 + INSTR xmm8, ymm2, 13 + INSTR xmm9, ymm0, 1 + INSTR xmm10, ymm1, 2 + INSTR xmm11, ymm2, 13 + INSTR xmm12, ymm0, 1 + INSTR xmm13, ymm1, 2 + INSTR xmm14, ymm2, 13 + INSTR xmm15, ymm0, 1 + INSTR xmm3, ymm1, 2 + INSTR xmm4, ymm2, 13 + INSTR xmm5, ymm0, 1 + INSTR xmm6, ymm1, 2 + INSTR xmm7, ymm2, 13 + INSTR xmm8, ymm0, 1 + INSTR xmm9, ymm1, 2 + INSTR xmm10, ymm2, 13 + INSTR xmm11, ymm0, 1 + INSTR xmm12, ymm1, 2 + INSTR xmm13, ymm2, 13 + INSTR xmm14, ymm0, 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vextractf128-xmm_ymm_imd.S b/testcases/vextractf128-xmm_ymm_imd.S new file mode 100644 index 0000000..1eb9b58 --- /dev/null +++ b/testcases/vextractf128-xmm_ymm_imd.S @@ -0,0 +1,46 @@ +#define INSTR vextractf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all 
ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vinsertf128-ymm_ymm_-TP.S b/testcases/vinsertf128-ymm_ymm_-TP.S new file mode 100644 index 0000000..f0b4652 --- /dev/null +++ b/testcases/vinsertf128-ymm_ymm_-TP.S @@ -0,0 +1,110 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + 
INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + INSTR ymm15 + INSTR ymm3 + INSTR ymm4 + INSTR ymm5 + INSTR ymm6 + INSTR ymm7 + INSTR ymm8 + INSTR ymm9 + INSTR ymm10 + INSTR ymm11 + INSTR ymm12 + INSTR ymm13 + INSTR ymm14 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vinsertf128-ymm_ymm_.S b/testcases/vinsertf128-ymm_ymm_.S new file mode 100644 index 0000000..7527b42 --- /dev/null +++ b/testcases/vinsertf128-ymm_ymm_.S @@ -0,0 +1,46 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vinsertf128-ymm_ymm_imd-TP.S b/testcases/vinsertf128-ymm_ymm_imd-TP.S new file mode 100644 index 
0000000..ba164fa --- /dev/null +++ b/testcases/vinsertf128-ymm_ymm_imd-TP.S @@ -0,0 +1,110 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, 1 + INSTR ymm4, ymm1, 2 + INSTR ymm5, ymm2, 13 + INSTR ymm6, ymm0, 1 + INSTR ymm7, ymm1, 2 + INSTR ymm8, ymm2, 13 + INSTR ymm9, ymm0, 1 + INSTR ymm10, ymm1, 2 + INSTR ymm11, ymm2, 13 + INSTR ymm12, ymm0, 1 + INSTR ymm13, ymm1, 2 + INSTR ymm14, ymm2, 13 + INSTR ymm15, ymm0, 1 + INSTR ymm3, ymm1, 2 + INSTR ymm4, ymm2, 13 + INSTR ymm5, ymm0, 1 + INSTR ymm6, ymm1, 2 + INSTR ymm7, ymm2, 13 + INSTR ymm8, ymm0, 1 + INSTR ymm9, ymm1, 2 + INSTR ymm10, ymm2, 13 + INSTR ymm11, ymm0, 1 + INSTR ymm12, ymm1, 2 + INSTR ymm13, ymm2, 13 + INSTR ymm14, ymm0, 1 + INSTR ymm15, ymm1, 2 + INSTR ymm3, ymm2, 13 + INSTR ymm4, ymm0, 1 + INSTR ymm5, ymm1, 2 + INSTR ymm6, ymm2, 13 + INSTR ymm7, ymm0, 1 + INSTR ymm8, ymm1, 2 + INSTR ymm9, ymm2, 13 + INSTR ymm10, ymm0, 1 + INSTR ymm11, ymm1, 2 + INSTR ymm12, ymm2, 13 + INSTR ymm13, ymm0, 1 + INSTR ymm14, ymm1, 2 + INSTR ymm15, ymm2, 13 + INSTR ymm3, ymm0, 1 + INSTR ymm4, ymm1, 2 + INSTR ymm5, ymm2, 13 + INSTR ymm6, 
ymm0, 1 + INSTR ymm7, ymm1, 2 + INSTR ymm8, ymm2, 13 + INSTR ymm9, ymm0, 1 + INSTR ymm10, ymm1, 2 + INSTR ymm11, ymm2, 13 + INSTR ymm12, ymm0, 1 + INSTR ymm13, ymm1, 2 + INSTR ymm14, ymm2, 13 + INSTR ymm15, ymm0, 1 + INSTR ymm3, ymm1, 2 + INSTR ymm4, ymm2, 13 + INSTR ymm5, ymm0, 1 + INSTR ymm6, ymm1, 2 + INSTR ymm7, ymm2, 13 + INSTR ymm8, ymm0, 1 + INSTR ymm9, ymm1, 2 + INSTR ymm10, ymm2, 13 + INSTR ymm11, ymm0, 1 + INSTR ymm12, ymm1, 2 + INSTR ymm13, ymm2, 13 + INSTR ymm14, ymm0, 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vinsertf128-ymm_ymm_imd.S b/testcases/vinsertf128-ymm_ymm_imd.S new file mode 100644 index 0000000..3372e40 --- /dev/null +++ b/testcases/vinsertf128-ymm_ymm_imd.S @@ -0,0 +1,110 @@ +#define INSTR vinsertf128 +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, 
ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + INSTR ymm0, ymm1, 1 + INSTR ymm1, ymm0, 1 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmm_xmm-TP.S b/testcases/vmovapd-xmm_xmm-TP.S new file mode 100644 index 0000000..f39e016 --- /dev/null +++ b/testcases/vmovapd-xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + 
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmm_xmm.S b/testcases/vmovapd-xmm_xmm.S new file mode 100644 index 0000000..6b4e7af --- /dev/null +++ b/testcases/vmovapd-xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long 
NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop 
rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-ymm_ymm-TP.S b/testcases/vmovapd-ymm_ymm-TP.S new file mode 100644 index 0000000..754eedb --- /dev/null +++ b/testcases/vmovapd-ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + INSTR ymm15, ymm1 + INSTR ymm3, ymm2 + INSTR ymm4, ymm0 + INSTR ymm5, ymm1 + INSTR ymm6, ymm2 + INSTR ymm7, ymm0 + INSTR ymm8, ymm1 + INSTR ymm9, ymm2 + INSTR ymm10, ymm0 + INSTR ymm11, ymm1 + INSTR ymm12, ymm2 + INSTR ymm13, ymm0 + INSTR ymm14, ymm1 + INSTR ymm15, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 
+ INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm3, ymm1 + INSTR ymm4, ymm2 + INSTR ymm5, ymm0 + INSTR ymm6, ymm1 + INSTR ymm7, ymm2 + INSTR ymm8, ymm0 + INSTR ymm9, ymm1 + INSTR ymm10, ymm2 + INSTR ymm11, ymm0 + INSTR ymm12, ymm1 + INSTR ymm13, ymm2 + INSTR ymm14, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-ymm_ymm.S b/testcases/vmovapd-ymm_ymm.S new file mode 100644 index 0000000..ac3137c --- /dev/null +++ b/testcases/vmovapd-ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vmovapd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + 
INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmm_xmm-TP.S b/testcases/vmovaps-xmm_xmm-TP.S new file mode 100644 index 0000000..53df367 --- /dev/null +++ b/testcases/vmovaps-xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovaps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 
0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + INSTR xmm15, xmm1 + INSTR xmm3, xmm2 + INSTR xmm4, xmm0 + INSTR xmm5, xmm1 + INSTR xmm6, xmm2 + INSTR xmm7, xmm0 + INSTR xmm8, xmm1 + INSTR xmm9, xmm2 + INSTR xmm10, xmm0 + INSTR xmm11, xmm1 + INSTR xmm12, xmm2 + INSTR xmm13, xmm0 + INSTR xmm14, xmm1 + INSTR xmm15, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm3, xmm1 + INSTR xmm4, xmm2 + INSTR xmm5, xmm0 + INSTR xmm6, xmm1 + INSTR xmm7, xmm2 + INSTR xmm8, xmm0 + INSTR xmm9, xmm1 + INSTR xmm10, xmm2 + INSTR xmm11, xmm0 + INSTR xmm12, xmm1 + INSTR xmm13, xmm2 + INSTR xmm14, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmm_xmm.S b/testcases/vmovaps-xmm_xmm.S new file mode 100644 index 0000000..3e1baac --- /dev/null +++ b/testcases/vmovaps-xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovaps +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovhpd-xmm_xmm_mem-TP.S b/testcases/vmovhpd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..11cbaf0 --- /dev/null +++ b/testcases/vmovhpd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR 
vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, [rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, 
[rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovhpd-xmm_xmm_mem.S b/testcases/vmovhpd-xmm_xmm_mem.S new file mode 100644 index 0000000..b423e4a --- /dev/null +++ b/testcases/vmovhpd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmovhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, 
[rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-r64_xmm-TP.S b/testcases/vmovq-r64_xmm-TP.S new file mode 100644 index 
0000000..b80c773 --- /dev/null +++ b/testcases/vmovq-r64_xmm-TP.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 + INSTR r11, xmm1 + INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rdx, xmm1 + INSTR r9, xmm2 + INSTR r10, xmm0 
+ INSTR r11, xmm1 + INSTR r12, xmm2 + INSTR r13, xmm0 + INSTR r14, xmm1 + INSTR r15, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-r64_xmm.S b/testcases/vmovq-r64_xmm.S new file mode 100644 index 0000000..029ebc3 --- /dev/null +++ b/testcases/vmovq-r64_xmm.S @@ -0,0 +1,141 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + 
INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmm_r64-TP.S b/testcases/vmovq-xmm_r64-TP.S new file mode 100644 index 0000000..fc7da5a --- /dev/null +++ b/testcases/vmovq-xmm_r64-TP.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw 
xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm3, rbx + INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + INSTR xmm15, rbx + INSTR xmm3, rcx + INSTR xmm4, rax + INSTR xmm5, rbx + INSTR xmm6, rcx + INSTR xmm7, rax + INSTR xmm8, rbx + INSTR xmm9, rcx + INSTR xmm10, rax + INSTR xmm11, rbx + INSTR xmm12, rcx + INSTR xmm13, rax + INSTR xmm14, rbx + INSTR xmm15, rcx + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm3, rbx + INSTR xmm4, rcx + INSTR xmm5, rax + INSTR xmm6, rbx + INSTR xmm7, rcx + INSTR xmm8, rax + INSTR xmm9, rbx + INSTR xmm10, rcx + INSTR xmm11, rax + INSTR xmm12, rbx + INSTR xmm13, rcx + INSTR xmm14, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size 
latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmm_r64.S b/testcases/vmovq-xmm_r64.S new file mode 100644 index 0000000..6a89af7 --- /dev/null +++ b/testcases/vmovq-xmm_r64.S @@ -0,0 +1,143 @@ +#define INSTR vmovq +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, 
rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-mem_xmm-TP.S b/testcases/vmovsd-mem_xmm-TP.S new file mode 100644 index 0000000..14a1cb6 --- /dev/null +++ b/testcases/vmovsd-mem_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR 
[rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-mem_xmm.S b/testcases/vmovsd-mem_xmm.S new file mode 100644 index 0000000..4f1bfbb --- /dev/null +++ b/testcases/vmovsd-mem_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + 
push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ 
No newline at end of file diff --git a/testcases/vmovsd-xmm_mem-TP.S b/testcases/vmovsd-xmm_mem-TP.S new file mode 100644 index 0000000..74f7da2 --- /dev/null +++ b/testcases/vmovsd-xmm_mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, 
[rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmm_mem.S b/testcases/vmovsd-xmm_mem.S new file mode 100644 index 0000000..6447ff8 --- /dev/null +++ b/testcases/vmovsd-xmm_mem.S @@ -0,0 +1,101 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + 
INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmm_xmm_xmm-TP.S b/testcases/vmovsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..1c847dd --- /dev/null +++ b/testcases/vmovsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # 
Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmm_xmm_xmm.S b/testcases/vmovsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..d31c45a --- /dev/null +++ 
b/testcases/vmovsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmovsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR 
xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovss-mem_xmm-TP.S b/testcases/vmovss-mem_xmm-TP.S index 57a23e2..226cbb0 100644 --- a/testcases/vmovss-mem_xmm-TP.S +++ b/testcases/vmovss-mem_xmm-TP.S @@ -1,5 +1,5 @@ #define INSTR vmovss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR [rip+PI], xmm2 INSTR [rip+PI], xmm0 INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm1 + INSTR [rip+PI], xmm2 + INSTR [rip+PI], xmm0 cmp i, N jl loop done: diff --git a/testcases/vmovss-mem_xmm.S b/testcases/vmovss-mem_xmm.S index c99d6cc..d4c3ee5 100644 --- a/testcases/vmovss-mem_xmm.S +++ b/testcases/vmovss-mem_xmm.S @@ -1,5 +1,5 @@ #define INSTR vmovss -#define NINST 32 +#define NINST 64 
#define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR [rip+PI], xmm0 INSTR [rip+PI], xmm0 INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 + INSTR [rip+PI], xmm0 cmp i, N jl loop done: diff --git a/testcases/vmovupd-xmm_mem-TP.S b/testcases/vmovupd-xmm_mem-TP.S new file mode 100644 index 0000000..9c5d7a0 --- /dev/null +++ b/testcases/vmovupd-xmm_mem-TP.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + 
INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + INSTR xmm15, [rip+PI] + INSTR xmm3, [rip+PI] + INSTR xmm4, [rip+PI] + INSTR xmm5, [rip+PI] + INSTR xmm6, [rip+PI] + INSTR xmm7, [rip+PI] + INSTR xmm8, [rip+PI] + INSTR xmm9, [rip+PI] + INSTR xmm10, [rip+PI] + INSTR xmm11, [rip+PI] + INSTR xmm12, [rip+PI] + INSTR xmm13, [rip+PI] + INSTR xmm14, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovupd-xmm_mem.S b/testcases/vmovupd-xmm_mem.S new file mode 100644 index 0000000..b5cc153 --- /dev/null +++ b/testcases/vmovupd-xmm_mem.S @@ -0,0 +1,101 @@ +#define INSTR vmovupd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero +loop: + inc i + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + INSTR xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-ymm_ymm_mem-TP.S b/testcases/vmulpd-ymm_ymm_mem-TP.S new file mode 100644 
index 0000000..bdbd111 --- /dev/null +++ b/testcases/vmulpd-ymm_ymm_mem-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR ymm15, ymm0, [rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + INSTR ymm15, ymm1, [rip+PI] + INSTR ymm3, ymm2, [rip+PI] + INSTR ymm4, ymm0, [rip+PI] + INSTR ymm5, ymm1, [rip+PI] + INSTR ymm6, ymm2, [rip+PI] + INSTR ymm7, ymm0, [rip+PI] + INSTR ymm8, ymm1, [rip+PI] + INSTR ymm9, ymm2, [rip+PI] + 
INSTR ymm10, ymm0, [rip+PI] + INSTR ymm11, ymm1, [rip+PI] + INSTR ymm12, ymm2, [rip+PI] + INSTR ymm13, ymm0, [rip+PI] + INSTR ymm14, ymm1, [rip+PI] + INSTR ymm15, ymm2, [rip+PI] + INSTR ymm3, ymm0, [rip+PI] + INSTR ymm4, ymm1, [rip+PI] + INSTR ymm5, ymm2, [rip+PI] + INSTR ymm6, ymm0, [rip+PI] + INSTR ymm7, ymm1, [rip+PI] + INSTR ymm8, ymm2, [rip+PI] + INSTR ymm9, ymm0, [rip+PI] + INSTR ymm10, ymm1, [rip+PI] + INSTR ymm11, ymm2, [rip+PI] + INSTR ymm12, ymm0, [rip+PI] + INSTR ymm13, ymm1, [rip+PI] + INSTR ymm14, ymm2, [rip+PI] + INSTR ymm15, ymm0, [rip+PI] + INSTR ymm3, ymm1, [rip+PI] + INSTR ymm4, ymm2, [rip+PI] + INSTR ymm5, ymm0, [rip+PI] + INSTR ymm6, ymm1, [rip+PI] + INSTR ymm7, ymm2, [rip+PI] + INSTR ymm8, ymm0, [rip+PI] + INSTR ymm9, ymm1, [rip+PI] + INSTR ymm10, ymm2, [rip+PI] + INSTR ymm11, ymm0, [rip+PI] + INSTR ymm12, ymm1, [rip+PI] + INSTR ymm13, ymm2, [rip+PI] + INSTR ymm14, ymm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-ymm_ymm_mem.S b/testcases/vmulpd-ymm_ymm_mem.S new file mode 100644 index 0000000..3193575 --- /dev/null +++ b/testcases/vmulpd-ymm_ymm_mem.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 
1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, [rip+PI] + INSTR ymm0, ymm1, [rip+PI] + INSTR ymm1, ymm0, 
[rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-ymm_ymm_ymm-TP.S b/testcases/vmulpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..029acd9 --- /dev/null +++ b/testcases/vmulpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR 
ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-ymm_ymm_ymm.S b/testcases/vmulpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..830c26d --- /dev/null +++ b/testcases/vmulpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vmulpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + 
vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmm_xmm_mem-TP.S b/testcases/vmulsd-xmm_xmm_mem-TP.S new file mode 100644 index 0000000..5a0359f --- /dev/null +++ 
b/testcases/vmulsd-xmm_xmm_mem-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + INSTR xmm15, xmm1, [rip+PI] + INSTR xmm3, xmm2, [rip+PI] + INSTR xmm4, xmm0, [rip+PI] + INSTR xmm5, xmm1, [rip+PI] + INSTR xmm6, xmm2, [rip+PI] + INSTR xmm7, xmm0, [rip+PI] + INSTR xmm8, xmm1, [rip+PI] + INSTR xmm9, xmm2, [rip+PI] + INSTR xmm10, xmm0, [rip+PI] + INSTR xmm11, xmm1, [rip+PI] + INSTR xmm12, xmm2, [rip+PI] + INSTR xmm13, xmm0, 
[rip+PI] + INSTR xmm14, xmm1, [rip+PI] + INSTR xmm15, xmm2, [rip+PI] + INSTR xmm3, xmm0, [rip+PI] + INSTR xmm4, xmm1, [rip+PI] + INSTR xmm5, xmm2, [rip+PI] + INSTR xmm6, xmm0, [rip+PI] + INSTR xmm7, xmm1, [rip+PI] + INSTR xmm8, xmm2, [rip+PI] + INSTR xmm9, xmm0, [rip+PI] + INSTR xmm10, xmm1, [rip+PI] + INSTR xmm11, xmm2, [rip+PI] + INSTR xmm12, xmm0, [rip+PI] + INSTR xmm13, xmm1, [rip+PI] + INSTR xmm14, xmm2, [rip+PI] + INSTR xmm15, xmm0, [rip+PI] + INSTR xmm3, xmm1, [rip+PI] + INSTR xmm4, xmm2, [rip+PI] + INSTR xmm5, xmm0, [rip+PI] + INSTR xmm6, xmm1, [rip+PI] + INSTR xmm7, xmm2, [rip+PI] + INSTR xmm8, xmm0, [rip+PI] + INSTR xmm9, xmm1, [rip+PI] + INSTR xmm10, xmm2, [rip+PI] + INSTR xmm11, xmm0, [rip+PI] + INSTR xmm12, xmm1, [rip+PI] + INSTR xmm13, xmm2, [rip+PI] + INSTR xmm14, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmm_xmm_mem.S b/testcases/vmulsd-xmm_xmm_mem.S new file mode 100644 index 0000000..4b70252 --- /dev/null +++ b/testcases/vmulsd-xmm_xmm_mem.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, 
[rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + INSTR xmm0, xmm1, [rip+PI] + INSTR xmm1, xmm0, [rip+PI] + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmm_xmm_xmm-TP.S 
b/testcases/vmulsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..c2dc870 --- /dev/null +++ b/testcases/vmulsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, 
xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmm_xmm_xmm.S b/testcases/vmulsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..97d4bac --- /dev/null +++ b/testcases/vmulsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vmulsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR 
xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulss-xmm_xmm_xmm-TP.S b/testcases/vmulss-xmm_xmm_xmm-TP.S index e865ed8..a8b7b5b 100644 --- a/testcases/vmulss-xmm_xmm_xmm-TP.S +++ b/testcases/vmulss-xmm_xmm_xmm-TP.S @@ -1,5 +1,5 @@ #define INSTR vmulss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm6, xmm2, xmm2 INSTR xmm7, xmm0, xmm0 INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + 
INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 cmp i, N jl loop done: diff --git a/testcases/vmulss-xmm_xmm_xmm.S b/testcases/vmulss-xmm_xmm_xmm.S index f91adc3..4a8d582 100644 --- a/testcases/vmulss-xmm_xmm_xmm.S +++ b/testcases/vmulss-xmm_xmm_xmm.S @@ -1,5 +1,5 @@ #define INSTR vmulss -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm1, xmm0, xmm0 INSTR xmm0, xmm1, xmm0 INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 cmp i, N jl loop done: diff --git 
a/testcases/vsubpd-ymm_ymm_ymm-TP.S b/testcases/vsubpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..2eca166 --- /dev/null +++ b/testcases/vsubpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, 
ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubpd-ymm_ymm_ymm.S b/testcases/vsubpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..96d3fe9 --- /dev/null +++ b/testcases/vsubpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vsubpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, 
ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubsd-xmm_xmm_xmm-TP.S b/testcases/vsubsd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..ceb9507 --- /dev/null +++ b/testcases/vsubsd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix 
+.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR 
xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubsd-xmm_xmm_xmm.S b/testcases/vsubsd-xmm_xmm_xmm.S new file mode 100644 index 0000000..b7429a4 --- /dev/null +++ b/testcases/vsubsd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vsubsd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + 
INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmm_xmm_xmm-TP.S b/testcases/vunpckhpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..1d99838 --- /dev/null +++ b/testcases/vunpckhpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 
+ vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp 
i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmm_xmm_xmm.S b/testcases/vunpckhpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..8807655 --- /dev/null +++ b/testcases/vunpckhpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vunpckhpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 
+ INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmm_xmm_xmm-TP.S b/testcases/vxorpd-xmm_xmm_xmm-TP.S new file mode 100644 index 0000000..2188e73 --- /dev/null +++ b/testcases/vxorpd-xmm_xmm_xmm-TP.S @@ -0,0 +1,108 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, 
xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + INSTR xmm15, xmm1, xmm1 + INSTR xmm3, xmm2, xmm2 + INSTR xmm4, xmm0, xmm0 + INSTR xmm5, xmm1, xmm1 + INSTR xmm6, xmm2, xmm2 + INSTR xmm7, xmm0, xmm0 + INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmm_xmm_xmm.S b/testcases/vxorpd-xmm_xmm_xmm.S new file mode 100644 index 0000000..eb1d6c9 --- /dev/null +++ b/testcases/vxorpd-xmm_xmm_xmm.S @@ -0,0 +1,108 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst 
+.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, 
xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-ymm_ymm_ymm-TP.S b/testcases/vxorpd-ymm_ymm_ymm-TP.S new file mode 100644 index 0000000..3a7e7fe --- /dev/null +++ b/testcases/vxorpd-ymm_ymm_ymm-TP.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, 
ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + INSTR ymm15, ymm1, ymm1 + INSTR ymm3, ymm2, ymm2 + INSTR ymm4, ymm0, ymm0 + INSTR ymm5, ymm1, ymm1 + INSTR ymm6, ymm2, ymm2 + INSTR ymm7, ymm0, ymm0 + INSTR ymm8, ymm1, ymm1 + INSTR ymm9, ymm2, ymm2 + INSTR ymm10, ymm0, ymm0 + INSTR ymm11, ymm1, ymm1 + INSTR ymm12, ymm2, ymm2 + INSTR ymm13, ymm0, ymm0 + INSTR ymm14, ymm1, ymm1 + INSTR ymm15, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm3, ymm1, ymm1 + INSTR ymm4, ymm2, ymm2 + INSTR ymm5, ymm0, ymm0 + INSTR ymm6, ymm1, ymm1 + INSTR ymm7, ymm2, ymm2 + INSTR ymm8, ymm0, ymm0 + INSTR ymm9, ymm1, ymm1 + INSTR ymm10, ymm2, ymm2 + INSTR ymm11, ymm0, ymm0 + INSTR ymm12, ymm1, ymm1 + INSTR ymm13, ymm2, ymm2 + INSTR ymm14, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-ymm_ymm_ymm.S b/testcases/vxorpd-ymm_ymm_ymm.S new file mode 100644 index 0000000..8ab0f92 --- /dev/null +++ b/testcases/vxorpd-ymm_ymm_ymm.S @@ -0,0 +1,110 @@ +#define INSTR vxorpd +#define NINST 64 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.align 32 +PI: +.long 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9 +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test 
N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, 
ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorps-xmm_xmm_xmm-TP.S b/testcases/vxorps-xmm_xmm_xmm-TP.S index d71e189..77475af 100644 --- a/testcases/vxorps-xmm_xmm_xmm-TP.S +++ b/testcases/vxorps-xmm_xmm_xmm-TP.S @@ -1,5 +1,5 @@ #define INSTR vxorps -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm6, xmm2, xmm2 INSTR xmm7, xmm0, xmm0 INSTR xmm8, xmm1, xmm1 + INSTR xmm9, xmm2, xmm2 + INSTR xmm10, xmm0, xmm0 + INSTR xmm11, xmm1, xmm1 + INSTR xmm12, xmm2, xmm2 + INSTR xmm13, xmm0, xmm0 + INSTR xmm14, xmm1, xmm1 + INSTR xmm15, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm3, xmm1, xmm1 + INSTR xmm4, xmm2, xmm2 + INSTR xmm5, xmm0, xmm0 + INSTR xmm6, xmm1, xmm1 + INSTR xmm7, xmm2, xmm2 + INSTR xmm8, xmm0, xmm0 + INSTR xmm9, xmm1, xmm1 + INSTR xmm10, xmm2, xmm2 + INSTR xmm11, xmm0, xmm0 + INSTR xmm12, xmm1, xmm1 + INSTR xmm13, xmm2, xmm2 + INSTR xmm14, xmm0, xmm0 cmp i, N jl loop done: diff --git a/testcases/vxorps-xmm_xmm_xmm.S b/testcases/vxorps-xmm_xmm_xmm.S index a8314c5..f1a1a8c 100644 --- a/testcases/vxorps-xmm_xmm_xmm.S +++ b/testcases/vxorps-xmm_xmm_xmm.S @@ -1,5 +1,5 @@ #define INSTR vxorps -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -67,6 +67,38 @@ loop: INSTR xmm1, xmm0, xmm0 INSTR xmm0, xmm1, xmm0 INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, 
xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 cmp i, N jl loop done: diff --git a/testcases/xor-r32_r32-TP.S b/testcases/xor-r32_r32-TP.S index 5e122f7..bf5757b 100644 --- a/testcases/xor-r32_r32-TP.S +++ b/testcases/xor-r32_r32-TP.S @@ -1,5 +1,5 @@ #define INSTR xor -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -91,6 +91,38 @@ loop: INSTR r13d, ecx INSTR r14d, eax INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR edx, ebx + INSTR r9d, ecx + INSTR r10d, eax + INSTR r11d, ebx + INSTR r12d, ecx + INSTR r13d, eax + INSTR r14d, ebx + INSTR r15d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax cmp i, N jl loop pop r15 diff --git a/testcases/xor-r32_r32.S b/testcases/xor-r32_r32.S index b1e71b7..652a935 100644 --- a/testcases/xor-r32_r32.S +++ b/testcases/xor-r32_r32.S @@ -1,5 +1,5 @@ #define INSTR xor -#define NINST 32 +#define NINST 64 #define N edi #define i r8d @@ -91,6 +91,38 @@ loop: INSTR ebx, eax INSTR eax, ebx INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR 
eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax cmp i, N jl loop pop r15