restructured repo and renamed files in lowercase

This commit is contained in:
Jan Laukemann
2017-09-23 17:47:14 +02:00
parent 02cdb1dd5a
commit 10959f2bac
242 changed files with 29824 additions and 24 deletions

41
osaca/create_testcase.py Executable file
View File

@@ -0,0 +1,41 @@
#!/apps/python/3.5-anaconda/bin/python
from param import *
from testcase import *
# Helper script: fill in the USER INPUT section below and run the file to
# build one Testcase for a single instruction form and write it out.
#
# Choose out of various operands
# The operand objects below are a ready-made catalogue for the USER INPUT
# section. Register, MemAddr and Parameter come from the star imports above
# (presumably from param -- TODO confirm).
reg8 = Register('al')
reg16 = Register('ax')
reg32 = Register('eax')
reg64 = Register('rax')
xmm = Register('xmm0')
ymm = Register('ymm0')
zmm = Register('zmm0')
mem0 = MemAddr('(%rax, %esi, 4)')
imd1 = Parameter('IMD')
#-----------------------------------------------
#-USER INPUT------------------------------------
#-----------------------------------------------
# Enter your mnemonic
mnemonic = 'vxorpd'
# Define your operands. If you don't need it, just type in None
dst = xmm
op1 = xmm
op2 = xmm
# Define the number of instructions per loop (default: 12)
# NOTE(review): the value is handed to Testcase as a string, and the value set
# here ('128') differs from the default named above -- confirm which is meant.
per_loop = '128'
#-----------------------------------------------
#-----------------------------------------------
# Start
# Drop the operand slots the user left as None
operands = [x for x in [dst, op1, op2] if x is not None]
opListStr = ', '.join([str(x) for x in operands])
# end='' keeps the cursor on this line so the success marker printed at the
# end of the script shows up right behind the announcement
print('Create Testcase for {} {}'.format(mnemonic, opListStr ), end='')
tc = Testcase(mnemonic, operands, per_loop)
# write_testcase() is defined in the testcase module (not visible here)
tc.write_testcase()
print(' --------> SUCCEEDED')

53
osaca/data/ivb_data.csv Normal file
View File

@@ -0,0 +1,53 @@
instr,TP,LT,ports
jmp-lbl,0.0,0.0,"((5,),)"
jo-lbl,0.0,0.0,"((5,),)"
jno-lbl,0.0,0.0,"((5,),)"
js-lbl,0.0,0.0,"((5,),)"
jns-lbl,0.0,0.0,"((5,),)"
je-lbl,0.0,0.0,"((5,),)"
jz-lbl,0.0,0.0,"((5,),)"
jne-lbl,0.0,0.0,"((5,),)"
jnz-lbl,0.0,0.0,"((5,),)"
jb-lbl,0.0,0.0,"((5,),)"
jnae-lbl,0.0,0.0,"((5,),)"
jc-lbl,0.0,0.0,"((5,),)"
jnb-lbl,0.0,0.0,"((5,),)"
jae-lbl,0.0,0.0,"((5,),)"
jnc-lbl,0.0,0.0,"((5,),)"
jbe-lbl,0.0,0.0,"((5,),)"
jna-lbl,0.0,0.0,"((5,),)"
ja-lbl,0.0,0.0,"((5,),)"
jnbe-lbl,0.0,0.0,"((5,),)"
jl-lbl,0.0,0.0,"((5,),)"
jnge-lbl,0.0,0.0,"((5,),)"
jge-lbl,0.0,0.0,"((5,),)"
jnl-lbl,0.0,0.0,"((5,),)"
jle-lbl,0.0,0.0,"((5,),)"
jng-lbl,0.0,0.0,"((5,),)"
jg-lbl,0.0,0.0,"((5,),)"
jnle-lbl,0.0,0.0,"((5,),)"
jp-lbl,0.0,0.0,"((5,),)"
jpe-lbl,0.0,0.0,"((5,),)"
jnp-lbl,0.0,0.0,"((5,),)"
jpo-lbl,0.0,0.0,"((5,),)"
jcxz-lbl,0.0,0.0,"((5,),)"
jecxz-lbl,0.0,0.0,"((5,),)"
jo-lbl,0.0,0.0,"((5,),)"
jno-lbl,0.0,0.0,"((5,),)"
js-lbl,0.0,0.0,"((5,),)"
jns-lbl,0.0,0.0,"((5,),)"
lea-r64_mem,1.0,1.0,"((2,),(3,))"
lea-r32_mem,1.0,1.0,"((2,),(3,))"
vcvtsi2ss-xmm_xmm_r64,1.0,3.0,"((0,1),(1,5))"
vcvtsi2ss-xmm_xmm_r32,1.0,3.0,"((1,5),(0,1))"
vmulss-xmm_xmm_xmm,1.0,5.0,"((0,),)"
vaddss-xmm_xmm_mem,1.0,3.0,"((1,),)"
vaddss-xmm_xmm_xmm,1.0,3.0,"((1,),)"
vxorps-xmm_xmm_xmm,0.3333333333333333,1.0,"((0,),(1,),(5,))"
vmovss-xmm_mem,0.5,1.0,"((2,),(3,))"
vmovss-mem_xmm,1.0,1.0,"((2,4),(3,4))"
inc-r32,0.3333333333333333,1.0,"((0,),(1,),(5,))"
inc-r64,0.3333333333333333,1.0,"((0,),(1,),(5,))"
cmp-r64_imd,0.3333333333333333,1.0,"((0,),(1,),(5,))"
cmp-r32_mem,0.5,1.0,"((0,2),(0,3),(1,2),(1,3),(2,5),(3,5))"
cmp-r32_r32,0.3333333333333333,1.0,"((0,),(1,),(5,))"
1 instr TP LT ports
2 jmp-lbl 0.0 0.0 ((5,),)
3 jo-lbl 0.0 0.0 ((5,),)
4 jno-lbl 0.0 0.0 ((5,),)
5 js-lbl 0.0 0.0 ((5,),)
6 jns-lbl 0.0 0.0 ((5,),)
7 je-lbl 0.0 0.0 ((5,),)
8 jz-lbl 0.0 0.0 ((5,),)
9 jne-lbl 0.0 0.0 ((5,),)
10 jnz-lbl 0.0 0.0 ((5,),)
11 jb-lbl 0.0 0.0 ((5,),)
12 jnae-lbl 0.0 0.0 ((5,),)
13 jc-lbl 0.0 0.0 ((5,),)
14 jnb-lbl 0.0 0.0 ((5,),)
15 jae-lbl 0.0 0.0 ((5,),)
16 jnc-lbl 0.0 0.0 ((5,),)
17 jbe-lbl 0.0 0.0 ((5,),)
18 jna-lbl 0.0 0.0 ((5,),)
19 ja-lbl 0.0 0.0 ((5,),)
20 jnbe-lbl 0.0 0.0 ((5,),)
21 jl-lbl 0.0 0.0 ((5,),)
22 jnge-lbl 0.0 0.0 ((5,),)
23 jge-lbl 0.0 0.0 ((5,),)
24 jnl-lbl 0.0 0.0 ((5,),)
25 jle-lbl 0.0 0.0 ((5,),)
26 jng-lbl 0.0 0.0 ((5,),)
27 jg-lbl 0.0 0.0 ((5,),)
28 jnle-lbl 0.0 0.0 ((5,),)
29 jp-lbl 0.0 0.0 ((5,),)
30 jpe-lbl 0.0 0.0 ((5,),)
31 jnp-lbl 0.0 0.0 ((5,),)
32 jpo-lbl 0.0 0.0 ((5,),)
33 jcxz-lbl 0.0 0.0 ((5,),)
34 jecxz-lbl 0.0 0.0 ((5,),)
35 jo-lbl 0.0 0.0 ((5,),)
36 jno-lbl 0.0 0.0 ((5,),)
37 js-lbl 0.0 0.0 ((5,),)
38 jns-lbl 0.0 0.0 ((5,),)
39 lea-r64_mem 1.0 1.0 ((2,),(3,))
40 lea-r32_mem 1.0 1.0 ((2,),(3,))
41 vcvtsi2ss-xmm_xmm_r64 1.0 3.0 ((0,1),(1,5))
42 vcvtsi2ss-xmm_xmm_r32 1.0 3.0 ((1,5),(0,1))
43 vmulss-xmm_xmm_xmm 1.0 5.0 ((0,),)
44 vaddss-xmm_xmm_mem 1.0 3.0 ((1,),)
45 vaddss-xmm_xmm_xmm 1.0 3.0 ((1,),)
46 vxorps-xmm_xmm_xmm 0.3333333333333333 1.0 ((0,),(1,),(5,))
47 vmovss-xmm_mem 0.5 1.0 ((2,),(3,))
48 vmovss-mem_xmm 1.0 1.0 ((2,4),(3,4))
49 inc-r32 0.3333333333333333 1.0 ((0,),(1,),(5,))
50 inc-r64 0.3333333333333333 1.0 ((0,),(1,),(5,))
51 cmp-r64_imd 0.3333333333333333 1.0 ((0,),(1,),(5,))
52 cmp-r32_mem 0.5 1.0 ((0,2),(0,3),(1,2),(1,3),(2,5),(3,5))
53 cmp-r32_r32 0.3333333333333333 1.0 ((0,),(1,),(5,))

View File

@@ -0,0 +1,92 @@
instr,TP,LT
jmp-lbl,0.0,-1.0
jo-lbl,0.0,-1.0
jno-lbl,0.0,-1.0
js-lbl,0.0,-1.0
jns-lbl,0.0,-1.0
je-lbl,0.0,-1.0
jz-lbl,0.0,-1.0
jne-lbl,0.0,-1.0
jnz-lbl,0.0,-1.0
jb-lbl,0.0,-1.0
jnae-lbl,0.0,-1.0
jc-lbl,0.0,-1.0
jnb-lbl,0.0,-1.0
jae-lbl,0.0,-1.0
jnc-lbl,0.0,-1.0
jbe-lbl,0.0,-1.0
jna-lbl,0.0,-1.0
ja-lbl,0.0,-1.0
jnbe-lbl,0.0,-1.0
jl-lbl,0.0,-1.0
jnge-lbl,0.0,-1.0
jge-lbl,0.0,-1.0
jnl-lbl,0.0,-1.0
jle-lbl,0.0,-1.0
jng-lbl,0.0,-1.0
jg-lbl,0.0,-1.0
jnle-lbl,0.0,-1.0
jp-lbl,0.0,-1.0
jpe-lbl,0.0,-1.0
jnp-lbl,0.0,-1.0
jpo-lbl,0.0,-1.0
jcxz-lbl,0.0,-1.0
jecxz-lbl,0.0,-1.0
jo-lbl,0.0,-1.0
jno-lbl,0.0,-1.0
js-lbl,0.0,-1.0
jns-lbl,0.0,-1.0
vmulss-xmm_xmm_xmm,1.0,-1.0
vaddss-xmm_xmm_xmm,1.0,-1.0
vxorps-xmm_xmm_xmm,0.25,-1.0
inc-r64,0.3333333333333333,-1.0
xor-r32_r32,0.3333333333333333,-1.0
vcvtsi2ss-xmm_xmm_r32,1.0,-1.0
vaddss-xmm_xmm_mem,1.0,-1.0
vmovupd-load-avx,1.0,-1.0
lea-r32_mem,1.0,-1.0
vmovss-xmm_mem,0.5,-1.0
vmovss-mem_xmm,1.0,-1.0
vmovupd-store-avx,2.0,-1.0
lea-r64_mem,1.0,-1.0
movslq-r64_mem,0.5,-1.0
mov-r64_mem,0.5,-1.0
vaddpd-ymm_ymm_ymm,1.0,-1.0
cmp-r32_r32,0.3333333333333333,-1.0
vmovsd-xmm_xmm_xmm,1.0,-1.0
vmulsd-xmm_xmm_mem,1.0,-1.0
vmovsd-mem_xmm,1.0,-1.0
vmovhpd-xmm_xmm_mem,1.0,-1.0
vsubpd-ymm_ymm_ymm,1.0,-1.0
vmovq-xmm_r64,1.0,-1.0
vunpckhpd-xmm_xmm_xmm,1.0,-1.0
vmulpd-ymm_ymm_mem,1.0,-1.0
mov-mem_r64,1.0,-1.0
movzbl-r32_r8,0.29600000000000004,-1.0
vmulsd-xmm_xmm_xmm,1.0,-1.0
vaddsd-xmm_xmm_mem,1.0,-1.0
vmovq-r64_xmm,1.0,-1.0
vmulpd-ymm_ymm_ymm,1.0,-1.0
mov-r32_mem,0.5,-1.0
cmp-r32_mem,0.5,-1.0
vaddpd-xmm_xmm_xmm,1.0,-1.0
mov-mem_r32,1.0,-1.0
vmovsd-xmm_mem,0.5,-1.0
vsubsd-xmm_xmm_xmm,1.0,-1.0
vmovaps-xmm_xmm,0.845,-1.0
vaddsd-xmm_xmm_xmm,1.0,-1.0
add-r32_mem,0.5,-1.0
vmovupd-xmm_mem,0.5,-1.0
test-r32_r32,0.3333333333333333,-1.0
add-r64_r64,0.3333333333333333,-1.0
dec-r32,0.3333333333333333,-1.0
movslq-r64_r32,0.3333333333333333,-1.0
vxorpd-ymm_ymm_ymm,0.25,-1.0
sub-r32_r32,0.3333333333333333,-1.0
inc-r32,0.3333333333333333,-1.0
neg-r32,0.3333333333333333,-1.0
cmp-r64_imd,0.3333333333333333,-1.0
vxorpd-xmm_xmm_xmm,0.25,-1.0
vmovapd-ymm_ymm,0.856,-1.0
vmovapd-xmm_xmm,0.855,-1.0
mov-r32_r32,0.3333333333333333,-1.0
1 instr TP LT
2 jmp-lbl 0.0 -1.0
3 jo-lbl 0.0 -1.0
4 jno-lbl 0.0 -1.0
5 js-lbl 0.0 -1.0
6 jns-lbl 0.0 -1.0
7 je-lbl 0.0 -1.0
8 jz-lbl 0.0 -1.0
9 jne-lbl 0.0 -1.0
10 jnz-lbl 0.0 -1.0
11 jb-lbl 0.0 -1.0
12 jnae-lbl 0.0 -1.0
13 jc-lbl 0.0 -1.0
14 jnb-lbl 0.0 -1.0
15 jae-lbl 0.0 -1.0
16 jnc-lbl 0.0 -1.0
17 jbe-lbl 0.0 -1.0
18 jna-lbl 0.0 -1.0
19 ja-lbl 0.0 -1.0
20 jnbe-lbl 0.0 -1.0
21 jl-lbl 0.0 -1.0
22 jnge-lbl 0.0 -1.0
23 jge-lbl 0.0 -1.0
24 jnl-lbl 0.0 -1.0
25 jle-lbl 0.0 -1.0
26 jng-lbl 0.0 -1.0
27 jg-lbl 0.0 -1.0
28 jnle-lbl 0.0 -1.0
29 jp-lbl 0.0 -1.0
30 jpe-lbl 0.0 -1.0
31 jnp-lbl 0.0 -1.0
32 jpo-lbl 0.0 -1.0
33 jcxz-lbl 0.0 -1.0
34 jecxz-lbl 0.0 -1.0
35 jo-lbl 0.0 -1.0
36 jno-lbl 0.0 -1.0
37 js-lbl 0.0 -1.0
38 jns-lbl 0.0 -1.0
39 vmulss-xmm_xmm_xmm 1.0 -1.0
40 vaddss-xmm_xmm_xmm 1.0 -1.0
41 vxorps-xmm_xmm_xmm 0.25 -1.0
42 inc-r64 0.3333333333333333 -1.0
43 xor-r32_r32 0.3333333333333333 -1.0
44 vcvtsi2ss-xmm_xmm_r32 1.0 -1.0
45 vaddss-xmm_xmm_mem 1.0 -1.0
46 vmovupd-load-avx 1.0 -1.0
47 lea-r32_mem 1.0 -1.0
48 vmovss-xmm_mem 0.5 -1.0
49 vmovss-mem_xmm 1.0 -1.0
50 vmovupd-store-avx 2.0 -1.0
51 lea-r64_mem 1.0 -1.0
52 movslq-r64_mem 0.5 -1.0
53 mov-r64_mem 0.5 -1.0
54 vaddpd-ymm_ymm_ymm 1.0 -1.0
55 cmp-r32_r32 0.3333333333333333 -1.0
56 vmovsd-xmm_xmm_xmm 1.0 -1.0
57 vmulsd-xmm_xmm_mem 1.0 -1.0
58 vmovsd-mem_xmm 1.0 -1.0
59 vmovhpd-xmm_xmm_mem 1.0 -1.0
60 vsubpd-ymm_ymm_ymm 1.0 -1.0
61 vmovq-xmm_r64 1.0 -1.0
62 vunpckhpd-xmm_xmm_xmm 1.0 -1.0
63 vmulpd-ymm_ymm_mem 1.0 -1.0
64 mov-mem_r64 1.0 -1.0
65 movzbl-r32_r8 0.29600000000000004 -1.0
66 vmulsd-xmm_xmm_xmm 1.0 -1.0
67 vaddsd-xmm_xmm_mem 1.0 -1.0
68 vmovq-r64_xmm 1.0 -1.0
69 vmulpd-ymm_ymm_ymm 1.0 -1.0
70 mov-r32_mem 0.5 -1.0
71 cmp-r32_mem 0.5 -1.0
72 vaddpd-xmm_xmm_xmm 1.0 -1.0
73 mov-mem_r32 1.0 -1.0
74 vmovsd-xmm_mem 0.5 -1.0
75 vsubsd-xmm_xmm_xmm 1.0 -1.0
76 vmovaps-xmm_xmm 0.845 -1.0
77 vaddsd-xmm_xmm_xmm 1.0 -1.0
78 add-r32_mem 0.5 -1.0
79 vmovupd-xmm_mem 0.5 -1.0
80 test-r32_r32 0.3333333333333333 -1.0
81 add-r64_r64 0.3333333333333333 -1.0
82 dec-r32 0.3333333333333333 -1.0
83 movslq-r64_r32 0.3333333333333333 -1.0
84 vxorpd-ymm_ymm_ymm 0.25 -1.0
85 sub-r32_r32 0.3333333333333333 -1.0
86 inc-r32 0.3333333333333333 -1.0
87 neg-r32 0.3333333333333333 -1.0
88 cmp-r64_imd 0.3333333333333333 -1.0
89 vxorpd-xmm_xmm_xmm 0.25 -1.0
90 vmovapd-ymm_ymm 0.856 -1.0
91 vmovapd-xmm_xmm 0.855 -1.0
92 mov-r32_r32 0.3333333333333333 -1.0

107
osaca/data/res_ivb.dat Normal file
View File

@@ -0,0 +1,107 @@
Using frequency 2.20GHz.
vmovsd-xmm_mem: 0.503 (clock cycles) [DEBUG - result: 3.141590]
lea-r64_mem-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000]
vmovupd-load-avx-TP: 1.004 (clock cycles) [DEBUG - result: 3.141590]
movslq-r64_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
lea-r32_mem-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000]
cmp-r32_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000]
sub-r32_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]
test-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
vaddss-xmm_xmm_xmm-TP: 1.015 (clock cycles) [DEBUG - result: 1.000000]
vsubsd-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: -1.000000]
vunpckhpd-xmm_xmm_xmm: 1.017 (clock cycles) [DEBUG - result: 1.000000]
movzbl-r32_r8: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vaddss-xmm_xmm_mem: 3.005 (clock cycles) [DEBUG - result: 2.000002]
dec-r32: 1.003 (clock cycles) [DEBUG - result: 1.000000]
vxorpd-ymm_ymm_ymm: 0.517 (clock cycles) [DEBUG - result: inf]
vaddpd-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: inf]
cmp-r64_imd-TP: 0.341 (clock cycles) [DEBUG - result: 1.000000]
cmp-r64_imd: 0.341 (clock cycles) [DEBUG - result: 1.000000]
vaddsd-xmm_xmm_xmm: 3.004 (clock cycles) [DEBUG - result: inf]
vmovapd-ymm_ymm-TP: 0.864 (clock cycles) [DEBUG - result: 1.000000]
vmovaps-xmm_xmm: 0.681 (clock cycles) [DEBUG - result: 2.000000]
vmovq-xmm_r64: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vxorpd-xmm_xmm_xmm: 0.517 (clock cycles) [DEBUG - result: inf]
vmovq-r64_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vcvtsi2ss-xmm_xmm_r32-TP: 1.033 (clock cycles) [DEBUG - result: 1.000000]
inc-r64: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmovsd-mem_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vaddpd-ymm_ymm_ymm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000]
add-r32_mem: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmulsd-xmm_xmm_mem: 5.007 (clock cycles) [DEBUG - result: inf]
lea-r64_mem: 1.015 (clock cycles) [DEBUG - result: 1.000000]
vcvtsi2ss-xmm_xmm_r32: 3.005 (clock cycles) [DEBUG - result: 2.000000]
movslq-r64_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000]
lea-r32_mem: 1.015 (clock cycles) [DEBUG - result: 1.000000]
cmp-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
vxorpd-xmm_xmm_xmm-TP: 0.261 (clock cycles) [DEBUG - result: 1.000000]
vmovsd-xmm_xmm_xmm-TP: 1.018 (clock cycles) [DEBUG - result: 1.000000]
vmovapd-ymm_ymm: 0.681 (clock cycles) [DEBUG - result: 2.000000]
vaddss-xmm_xmm_xmm: 3.005 (clock cycles) [DEBUG - result: 2.000000]
vmulsd-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vmovsd-mem_xmm-TP: 1.003 (clock cycles) [DEBUG - result: 1.000000]
mov-r32_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmulss-xmm_xmm_xmm: 5.012 (clock cycles) [DEBUG - result: 2.000000]
vmovhpd-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vsubpd-ymm_ymm_ymm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000]
vmovss-xmm_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmovq-xmm_r64-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vunpckhpd-xmm_xmm_xmm-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000]
add-r64_r64-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
inc-r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]
mov-r64_mem: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmulpd-ymm_ymm_mem-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000]
mov-mem_r64-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmovupd-xmm_mem: 0.503 (clock cycles) [DEBUG - result: 3.141590]
movzbl-r32_r8-TP: 0.286 (clock cycles) [DEBUG - result: 1.000000]
dec-r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
mov-r32_r32-TP: 0.287 (clock cycles) [DEBUG - result: 1.000000]
vmulpd-ymm_ymm_mem: 5.007 (clock cycles) [DEBUG - result: inf]
vaddpd-ymm_ymm_ymm: 3.005 (clock cycles) [DEBUG - result: inf]
movslq-r64_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
vxorpd-ymm_ymm_ymm-TP: 0.258 (clock cycles) [DEBUG - result: 1.000000]
cmp-r32_r32: 0.344 (clock cycles) [DEBUG - result: 1.000000]
vmulsd-xmm_xmm_xmm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000]
mov-r32_r32: 0.668 (clock cycles) [DEBUG - result: 1.000000]
vxorps-xmm_xmm_xmm-TP: 0.258 (clock cycles) [DEBUG - result: 1.000000]
neg-r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vaddsd-xmm_xmm_mem-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000]
vmovq-r64_xmm-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmulpd-ymm_ymm_ymm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000]
vmovss-mem_xmm-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000]
mov-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmulpd-ymm_ymm_ymm: 5.007 (clock cycles) [DEBUG - result: inf]
test-r32_r32: 0.346 (clock cycles) [DEBUG - result: 1.000000]
xor-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
vmovupd-store-avx-TP: 2.005 (clock cycles) [DEBUG - result: 0.000000]
cmp-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
mov-r64_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmovapd-xmm_xmm: 0.681 (clock cycles) [DEBUG - result: 2.000000]
vaddpd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000]
sub-r32_r32-TP: 0.345 (clock cycles) [DEBUG - result: 1.000000]
vmovss-xmm_mem: 0.516 (clock cycles) [DEBUG - result: 0.000000]
add-r64_r64: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmulsd-xmm_xmm_xmm: 5.007 (clock cycles) [DEBUG - result: inf]
vmulss-xmm_xmm_xmm-TP: 1.016 (clock cycles) [DEBUG - result: 1.000000]
mov-mem_r32-TP: 1.002 (clock cycles) [DEBUG - result: 1.000000]
mov-mem_r64: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmovsd-xmm_mem-TP: 0.507 (clock cycles) [DEBUG - result: 1.000000]
vaddss-xmm_xmm_mem-TP: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vsubsd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000]
vmovaps-xmm_xmm-TP: 0.860 (clock cycles) [DEBUG - result: 1.000000]
movslq-r64_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]
vmovss-mem_xmm: 1.002 (clock cycles) [DEBUG - result: 1.000000]
inc-r32-TP: 0.344 (clock cycles) [DEBUG - result: 1.000000]
vmovapd-xmm_xmm-TP: 0.856 (clock cycles) [DEBUG - result: 1.000000]
vaddsd-xmm_xmm_xmm-TP: 1.014 (clock cycles) [DEBUG - result: 1.000000]
vmovhpd-xmm_xmm_mem: 1.017 (clock cycles) [DEBUG - result: 2.000000]
vxorps-xmm_xmm_xmm: 0.517 (clock cycles) [DEBUG - result: inf]
vmovsd-xmm_xmm_xmm: 1.017 (clock cycles) [DEBUG - result: 1.000000]
vaddsd-xmm_xmm_mem: 3.005 (clock cycles) [DEBUG - result: 201061760.000000]
add-r32_mem-TP: 0.501 (clock cycles) [DEBUG - result: 1.000000]
vmovupd-xmm_mem-TP: 0.509 (clock cycles) [DEBUG - result: 1.000000]
mov-mem_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]
inc-r64-TP: 0.355 (clock cycles) [DEBUG - result: 1.000000]
neg-r32-TP: 0.344 (clock cycles) [DEBUG - result: 1.000000]
vsubpd-ymm_ymm_ymm: 3.004 (clock cycles) [DEBUG - result: -1.000000]
xor-r32_r32: 1.002 (clock cycles) [DEBUG - result: 1.000000]

4
osaca/data/res_test.dat Normal file
View File

@@ -0,0 +1,4 @@
Using frequency 2.20GHz.
lea-r64_mem-TP: 1.003 (clock cycles) [DEBUG - result: 3.141590]
jan-xmm_xmm-TP: 0.995 (clock cycles) [DEBUG - result: 3.141590]
jan-xmm_xmm: 2.037 (clock cycles) [DEBUG - result: 3.141590]

53
osaca/data/skl_data.csv Normal file
View File

@@ -0,0 +1,53 @@
instr,TP,LT,ports
jmp-lbl,0.0,0.0,"((5,),)"
jo-lbl,0.0,0.0,"((5,),)"
jno-lbl,0.0,0.0,"((5,),)"
js-lbl,0.0,0.0,"((5,),)"
jns-lbl,0.0,0.0,"((5,),)"
je-lbl,0.0,0.0,"((5,),)"
jz-lbl,0.0,0.0,"((5,),)"
jne-lbl,0.0,0.0,"((5,),)"
jnz-lbl,0.0,0.0,"((5,),)"
jb-lbl,0.0,0.0,"((5,),)"
jnae-lbl,0.0,0.0,"((5,),)"
jc-lbl,0.0,0.0,"((5,),)"
jnb-lbl,0.0,0.0,"((5,),)"
jae-lbl,0.0,0.0,"((5,),)"
jnc-lbl,0.0,0.0,"((5,),)"
jbe-lbl,0.0,0.0,"((5,),)"
jna-lbl,0.0,0.0,"((5,),)"
ja-lbl,0.0,0.0,"((5,),)"
jnbe-lbl,0.0,0.0,"((5,),)"
jl-lbl,0.0,0.0,"((5,),)"
jnge-lbl,0.0,0.0,"((5,),)"
jge-lbl,0.0,0.0,"((5,),)"
jnl-lbl,0.0,0.0,"((5,),)"
jle-lbl,0.0,0.0,"((5,),)"
jng-lbl,0.0,0.0,"((5,),)"
jg-lbl,0.0,0.0,"((5,),)"
jnle-lbl,0.0,0.0,"((5,),)"
jp-lbl,0.0,0.0,"((5,),)"
jpe-lbl,0.0,0.0,"((5,),)"
jnp-lbl,0.0,0.0,"((5,),)"
jpo-lbl,0.0,0.0,"((5,),)"
jcxz-lbl,0.0,0.0,"((5,),)"
jecxz-lbl,0.0,0.0,"((5,),)"
jo-lbl,0.0,0.0,"((5,),)"
jno-lbl,0.0,0.0,"((5,),)"
js-lbl,0.0,0.0,"((5,),)"
jns-lbl,0.0,0.0,"((5,),)"
lea-r64_mem,1.0,1.0,"((2,),(3,))"
lea-r32_mem,1.0,1.0,"((2,),(3,))"
vcvtsi2ss-xmm_xmm_r64,1.0,3.0,"((0,1),(1,5))"
vcvtsi2ss-xmm_xmm_r32,1.0,3.0,"((-1,))"
vmulss-xmm_xmm_xmm,1.0,5.0,"((0,),)"
vaddss-xmm_xmm_mem,1.0,3.0,"((1,),)"
vaddss-xmm_xmm_xmm,1.0,3.0,"((1,),)"
vxorps-xmm_xmm_xmm,0.3333333333333333,1.0,"((0,),(1,),(5,))"
vmovss-xmm_mem,0.5,1.0,"((2,),(3,))"
vmovss-mem_xmm,1.0,1.0,"((2,4),(3,4))"
inc-r32,0.3333333333333333,1.0,"((0,),(1,),(5,))"
inc-r64,0.3333333333333333,1.0,"((0,),(1,),(5,))"
cmp-r64_imd,0.3333333333333333,1.0,"((0,),(1,),(5,))"
cmp-r32_mem,0.5,1.0,"((0,),(1,),(5,))"
cmp-r32_r32,0.3333333333333333,1.0,"((0,),(1,),(5,))"
1 instr TP LT ports
2 jmp-lbl 0.0 0.0 ((5,),)
3 jo-lbl 0.0 0.0 ((5,),)
4 jno-lbl 0.0 0.0 ((5,),)
5 js-lbl 0.0 0.0 ((5,),)
6 jns-lbl 0.0 0.0 ((5,),)
7 je-lbl 0.0 0.0 ((5,),)
8 jz-lbl 0.0 0.0 ((5,),)
9 jne-lbl 0.0 0.0 ((5,),)
10 jnz-lbl 0.0 0.0 ((5,),)
11 jb-lbl 0.0 0.0 ((5,),)
12 jnae-lbl 0.0 0.0 ((5,),)
13 jc-lbl 0.0 0.0 ((5,),)
14 jnb-lbl 0.0 0.0 ((5,),)
15 jae-lbl 0.0 0.0 ((5,),)
16 jnc-lbl 0.0 0.0 ((5,),)
17 jbe-lbl 0.0 0.0 ((5,),)
18 jna-lbl 0.0 0.0 ((5,),)
19 ja-lbl 0.0 0.0 ((5,),)
20 jnbe-lbl 0.0 0.0 ((5,),)
21 jl-lbl 0.0 0.0 ((5,),)
22 jnge-lbl 0.0 0.0 ((5,),)
23 jge-lbl 0.0 0.0 ((5,),)
24 jnl-lbl 0.0 0.0 ((5,),)
25 jle-lbl 0.0 0.0 ((5,),)
26 jng-lbl 0.0 0.0 ((5,),)
27 jg-lbl 0.0 0.0 ((5,),)
28 jnle-lbl 0.0 0.0 ((5,),)
29 jp-lbl 0.0 0.0 ((5,),)
30 jpe-lbl 0.0 0.0 ((5,),)
31 jnp-lbl 0.0 0.0 ((5,),)
32 jpo-lbl 0.0 0.0 ((5,),)
33 jcxz-lbl 0.0 0.0 ((5,),)
34 jecxz-lbl 0.0 0.0 ((5,),)
35 jo-lbl 0.0 0.0 ((5,),)
36 jno-lbl 0.0 0.0 ((5,),)
37 js-lbl 0.0 0.0 ((5,),)
38 jns-lbl 0.0 0.0 ((5,),)
39 lea-r64_mem 1.0 1.0 ((2,),(3,))
40 lea-r32_mem 1.0 1.0 ((2,),(3,))
41 vcvtsi2ss-xmm_xmm_r64 1.0 3.0 ((0,1),(1,5))
42 vcvtsi2ss-xmm_xmm_r32 1.0 3.0 ((-1,))
43 vmulss-xmm_xmm_xmm 1.0 5.0 ((0,),)
44 vaddss-xmm_xmm_mem 1.0 3.0 ((1,),)
45 vaddss-xmm_xmm_xmm 1.0 3.0 ((1,),)
46 vxorps-xmm_xmm_xmm 0.3333333333333333 1.0 ((0,),(1,),(5,))
47 vmovss-xmm_mem 0.5 1.0 ((2,),(3,))
48 vmovss-mem_xmm 1.0 1.0 ((2,4),(3,4))
49 inc-r32 0.3333333333333333 1.0 ((0,),(1,),(5,))
50 inc-r64 0.3333333333333333 1.0 ((0,),(1,),(5,))
51 cmp-r64_imd 0.3333333333333333 1.0 ((0,),(1,),(5,))
52 cmp-r32_mem 0.5 1.0 ((0,),(1,),(5,))
53 cmp-r32_r32 0.3333333333333333 1.0 ((0,),(1,),(5,))

View File

@@ -0,0 +1,92 @@
instr,clock_cycles
jmp-lbl-TP,0.0
jo-lbl-TP,0.0
jno-lbl-TP,0.0
js-lbl-TP,0.0
jns-lbl-TP,0.0
je-lbl-TP,0.0
jz-lbl-TP,0.0
jne-lbl-TP,0.0
jnz-lbl-TP,0.0
jb-lbl-TP,0.0
jnae-lbl-TP,0.0
jc-lbl-TP,0.0
jnb-lbl-TP,0.0
jae-lbl-TP,0.0
jnc-lbl-TP,0.0
jbe-lbl-TP,0.0
jna-lbl-TP,0.0
ja-lbl-TP,0.0
jnbe-lbl-TP,0.0
jl-lbl-TP,0.0
jnge-lbl-TP,0.0
jge-lbl-TP,0.0
jnl-lbl-TP,0.0
jle-lbl-TP,0.0
jng-lbl-TP,0.0
jg-lbl-TP,0.0
jnle-lbl-TP,0.0
jp-lbl-TP,0.0
jpe-lbl-TP,0.0
jnp-lbl-TP,0.0
jpo-lbl-TP,0.0
jcxz-lbl-TP,0.0
jecxz-lbl-TP,0.0
jo-lbl-TP,0.0
jno-lbl-TP,0.0
js-lbl-TP,0.0
jns-lbl-TP,0.0
vmulss-xmm_xmm_xmm-TP,1.0
vaddss-xmm_xmm_xmm-TP,1.0
vxorps-xmm_xmm_xmm-TP,0.25
inc-r64-TP,0.3333333333333333
xor-r32_r32-TP,0.3333333333333333
vcvtsi2ss-xmm_xmm_r32-TP,1.0
vaddss-xmm_xmm_mem-TP,1.0
vmovupd-load-avx-TP,1.0
lea-r32_mem-TP,1.0
vmovss-xmm_mem-TP,0.5
vmovss-mem_xmm-TP,1.0
vmovupd-store-avx-TP,2.0
lea-r64_mem-TP,1.0
movslq-r64_mem-TP,0.5
mov-r64_mem-TP,0.5
vaddpd-ymm_ymm_ymm-TP,1.0
cmp-r32_r32-TP,0.3333333333333333
vmovsd-xmm_xmm_xmm-TP,1.0
vmulsd-xmm_xmm_mem-TP,1.0
vmovsd-mem_xmm-TP,1.0
vmovhpd-xmm_xmm_mem-TP,1.0
vsubpd-ymm_ymm_ymm-TP,1.0
vmovq-xmm_r64-TP,1.0
vunpckhpd-xmm_xmm_xmm-TP,1.0
vmulpd-ymm_ymm_mem-TP,1.0
mov-mem_r64-TP,1.0
movzbl-r32_r8-TP,0.29600000000000004
vmulsd-xmm_xmm_xmm-TP,1.0
vaddsd-xmm_xmm_mem-TP,1.0
vmovq-r64_xmm-TP,1.0
vmulpd-ymm_ymm_ymm-TP,1.0
mov-r32_mem-TP,0.5
cmp-r32_mem-TP,0.5
vaddpd-xmm_xmm_xmm-TP,1.0
mov-mem_r32-TP,1.0
vmovsd-xmm_mem-TP,0.5
vsubsd-xmm_xmm_xmm-TP,1.0
vmovaps-xmm_xmm-TP,0.845
vaddsd-xmm_xmm_xmm-TP,1.0
add-r32_mem-TP,0.5
vmovupd-xmm_mem-TP,0.5
test-r32_r32-TP,0.3333333333333333
add-r64_r64-TP,0.3333333333333333
dec-r32-TP,0.3333333333333333
movslq-r64_r32-TP,0.3333333333333333
vxorpd-ymm_ymm_ymm-TP,0.25
sub-r32_r32-TP,0.3333333333333333
inc-r32-TP,0.3333333333333333
neg-r32-TP,0.3333333333333333
cmp-r64_imd-TP,0.3333333333333333
vxorpd-xmm_xmm_xmm-TP,0.25
vmovapd-ymm_ymm-TP,0.856
vmovapd-xmm_xmm-TP,0.855
mov-r32_r32-TP,0.3333333333333333
1 instr clock_cycles
2 jmp-lbl-TP 0.0
3 jo-lbl-TP 0.0
4 jno-lbl-TP 0.0
5 js-lbl-TP 0.0
6 jns-lbl-TP 0.0
7 je-lbl-TP 0.0
8 jz-lbl-TP 0.0
9 jne-lbl-TP 0.0
10 jnz-lbl-TP 0.0
11 jb-lbl-TP 0.0
12 jnae-lbl-TP 0.0
13 jc-lbl-TP 0.0
14 jnb-lbl-TP 0.0
15 jae-lbl-TP 0.0
16 jnc-lbl-TP 0.0
17 jbe-lbl-TP 0.0
18 jna-lbl-TP 0.0
19 ja-lbl-TP 0.0
20 jnbe-lbl-TP 0.0
21 jl-lbl-TP 0.0
22 jnge-lbl-TP 0.0
23 jge-lbl-TP 0.0
24 jnl-lbl-TP 0.0
25 jle-lbl-TP 0.0
26 jng-lbl-TP 0.0
27 jg-lbl-TP 0.0
28 jnle-lbl-TP 0.0
29 jp-lbl-TP 0.0
30 jpe-lbl-TP 0.0
31 jnp-lbl-TP 0.0
32 jpo-lbl-TP 0.0
33 jcxz-lbl-TP 0.0
34 jecxz-lbl-TP 0.0
35 jo-lbl-TP 0.0
36 jno-lbl-TP 0.0
37 js-lbl-TP 0.0
38 jns-lbl-TP 0.0
39 vmulss-xmm_xmm_xmm-TP 1.0
40 vaddss-xmm_xmm_xmm-TP 1.0
41 vxorps-xmm_xmm_xmm-TP 0.25
42 inc-r64-TP 0.3333333333333333
43 xor-r32_r32-TP 0.3333333333333333
44 vcvtsi2ss-xmm_xmm_r32-TP 1.0
45 vaddss-xmm_xmm_mem-TP 1.0
46 vmovupd-load-avx-TP 1.0
47 lea-r32_mem-TP 1.0
48 vmovss-xmm_mem-TP 0.5
49 vmovss-mem_xmm-TP 1.0
50 vmovupd-store-avx-TP 2.0
51 lea-r64_mem-TP 1.0
52 movslq-r64_mem-TP 0.5
53 mov-r64_mem-TP 0.5
54 vaddpd-ymm_ymm_ymm-TP 1.0
55 cmp-r32_r32-TP 0.3333333333333333
56 vmovsd-xmm_xmm_xmm-TP 1.0
57 vmulsd-xmm_xmm_mem-TP 1.0
58 vmovsd-mem_xmm-TP 1.0
59 vmovhpd-xmm_xmm_mem-TP 1.0
60 vsubpd-ymm_ymm_ymm-TP 1.0
61 vmovq-xmm_r64-TP 1.0
62 vunpckhpd-xmm_xmm_xmm-TP 1.0
63 vmulpd-ymm_ymm_mem-TP 1.0
64 mov-mem_r64-TP 1.0
65 movzbl-r32_r8-TP 0.29600000000000004
66 vmulsd-xmm_xmm_xmm-TP 1.0
67 vaddsd-xmm_xmm_mem-TP 1.0
68 vmovq-r64_xmm-TP 1.0
69 vmulpd-ymm_ymm_ymm-TP 1.0
70 mov-r32_mem-TP 0.5
71 cmp-r32_mem-TP 0.5
72 vaddpd-xmm_xmm_xmm-TP 1.0
73 mov-mem_r32-TP 1.0
74 vmovsd-xmm_mem-TP 0.5
75 vsubsd-xmm_xmm_xmm-TP 1.0
76 vmovaps-xmm_xmm-TP 0.845
77 vaddsd-xmm_xmm_xmm-TP 1.0
78 add-r32_mem-TP 0.5
79 vmovupd-xmm_mem-TP 0.5
80 test-r32_r32-TP 0.3333333333333333
81 add-r64_r64-TP 0.3333333333333333
82 dec-r32-TP 0.3333333333333333
83 movslq-r64_r32-TP 0.3333333333333333
84 vxorpd-ymm_ymm_ymm-TP 0.25
85 sub-r32_r32-TP 0.3333333333333333
86 inc-r32-TP 0.3333333333333333
87 neg-r32-TP 0.3333333333333333
88 cmp-r64_imd-TP 0.3333333333333333
89 vxorpd-xmm_xmm_xmm-TP 0.25
90 vmovapd-ymm_ymm-TP 0.856
91 vmovapd-xmm_xmm-TP 0.855
92 mov-r32_r32-TP 0.3333333333333333

331
osaca/eu_sched.py Executable file
View File

@@ -0,0 +1,331 @@
#!/apps/python/3.5-anaconda/bin/python
import sys
import os
import math
import ast
from param import *
from operator import add
import pandas as pd
class Scheduler(object):
arch_dict = {'SNB':6, 'IVB':6, 'HSW':8, 'BDW':8, 'SKL':8}
ports = None #type: int
instrList = None #type: list<list<str,Param[,Param][,Param],str>>
# instr, operand(s), instr form
df = None #type: DataFrame
def __init__(self, arch, instructionList):
arch = arch.upper()
try:
self.ports = self.arch_dict[arch]
except KeyError:
print('Architecture not supportet for EU scheduling.')
sys.exit()
self.instrList = instructionList
currDir = os.path.realpath(__file__)[:-11]
self.df = pd.read_csv(currDir+'data/'+arch.lower()+'_data.csv', quotechar='"', converters={'ports':ast.literal_eval})
def schedule(self):
'''
Schedules Instruction Form list and calculates port bindings.
Returns
-------
(str, [int, ...])
A tuple containing the graphic output of the schedule as string and
the port bindings as list of ints.
'''
sched = self.get_head()
# Initialize ports
# groups = [[] for x in range(len(set(portOccurances))-1)]
occ_ports = [[0]*self.ports for x in range(len(self.instrList))]
# occ_ports = [[0]*self.ports]*len(self.instrList)
port_bndgs = [0]*self.ports
# Check if there's a port occupation stored in the CSV, otherwise leave the
# occ_port list item empty
for i,instrForm in enumerate(self.instrList):
try:
searchString = instrForm[0]+'-'+self.get_operand_suffix(instrForm)
entry = self.df.loc[lambda df: df.instr == searchString,'TP':'ports']
tup = entry.ports.values[0]
if(len(tup) == 1 and tup[0][0] == -1):
raise IndexError()
except IndexError:
# Instruction form not in CSV
sched += self.get_line(occ_ports[i], '* '+instrForm[-1])
continue
# Get the occurance of each port from the occupation list
portOccurances = self.get_port_occurances(tup)
# Get 'occurance groups'
occuranceGroups = self.get_occurance_groups(portOccurances)
# Calculate port dependent throughput
TPGes = entry.TP.values[0]*len(occuranceGroups[0])
for occGroup in occuranceGroups:
for port in occGroup:
occ_ports[i][port] = TPGes/len(occGroup)
# Write schedule line
sched += self.get_line(occ_ports[i], instrForm[-1])
# Add throughput to total port binding
port_bndgs = list(map(add, port_bndgs, occ_ports[i]))
return (sched, port_bndgs)
def schedule_FCFS(self):
'''
Schedules Instruction Form list for a single run with latencies.
Returns
-------
(str, int)
A tuple containing the graphic output as string and the total throughput time as int.
'''
sched = self.get_head()
total = 0
# Initialize ports
occ_ports = [0]*self.ports
for i,instrForm in enumerate(self.instrList):
try:
searchString = instrForm[0]+'-'+self.get_operand_suffix(instrForm)
entry = self.df.loc[lambda df: df.instr == searchString,'LT':'ports']
tup = entry.ports.values[0]
if(len(tup) == 1 and tup[0][0] == -1):
raise IndexError()
except IndexError:
# Instruction form not in CSV
sched += self.get_line([0]*self.ports,'* '+instrForm[-1])
continue
found = False
while(not found):
for portOcc in tup:
# Test if chosen instruction form port occupation suits the current CPU port occupation
if(self.test_ports_FCFS(occ_ports, portOcc)):
# Current port occupation fits for chosen port occupation of the instruction!
found = True
good = [entry.LT.values[0] if (j in portOcc) else 0 for j in range(0,self.ports)]
sched += self.get_line(good, instrForm[-1])
# Add new occupation
occ_ports = [occ_ports[j]+good[j] for j in range(0, self.ports)]
break
# Step
occ_ports = [j-1 if (j > 0) else 0 for j in occ_ports]
if(entry.LT.values[0] != 0):
total += 1
total += max(occ_ports)
return (sched, total)
def get_occurance_groups(self, portOccurances):
'''
Groups ports in groups by the number of their occurance and sorts
groups by cardinality
Parameters
----------
portOccurances : [int, ...]
List with the length of ports containing the number of occurances
of each port
Returns
-------
[[int, ...], ...]
List of lists with all occurance groups sorted by cardinality
(smallest group first)
'''
groups = [[] for x in range(len(set(portOccurances))-1)]
for i,groupInd in enumerate(range(min(list(filter(lambda x: x > 0, portOccurances))),max(portOccurances)+1)):
for p, occurs in enumerate(portOccurances):
if groupInd == occurs:
groups[i].append(p)
# Sort groups by cardinality
groups.sort(key=len)
return groups
def get_port_occurances(self, tups):
'''
Returns the number of each port occurance for the possible port
occupations
Parameters
----------
tups : ((int, ...), ...)
Tuple of tuples of possible port occupations
Returns
-------
[int, ...]
List in the length of the number of ports for the current architecture,
containing the amount of occurances for each port
'''
ports = [0]*self.ports
for tup in tups:
for elem in tup:
ports[elem] += 1
return ports
def test_ports_FCFS(self, occ_ports, needed_ports):
'''
Test if current configuration of ports is possible and returns boolean
Parameters
----------
occ_ports : [int]
Tuple to inspect for current port occupation
needed_ports : (int)
Tuple with needed port(s) for particular instruction form
Returns
-------
bool
True if needed ports can get scheduled on current port occupation
False if not
'''
for port in needed_ports:
if(occ_ports[port] != 0):
return False
return True
def get_report_info(self):
'''
Creates Report information including all needed annotations.
Returns
-------
str
String containing the report information
'''
analysis = 'Throughput Analysis Report\n'+('-'*26)+'\n'
annotations = ( '* - No information for this instruction in database\n'
'\n')
return analysis+annotations
def get_head(self):
'''
Creates right heading for CPU architecture.
Returns
-------
str
String containing the header
'''
horizLine = '-'*7*self.ports+'-\n'
portAnno = ' '*(math.floor((len(horizLine)-24)/2))+'Ports Pressure in cycles'+' '*(math.ceil((len(horizLine)-24)/2))+'\n'
portLine = ''
for i in range(0,self.ports):
portLine += '| {} '.format(i)
portLine += '|\n'
head = portAnno+portLine+horizLine
return head
def get_line(self, occ_ports, instrName):
'''
Create line with port occupation for output.
Parameters
----------
occ_ports : (int, ...)
Integer tuple containing needed ports
instrName : str
Name of instruction form for output
Returns
-------
str
String for output containing port scheduling for instrName
'''
line = ''
for i in occ_ports:
cycles = ' ' if (i == 0) else '%.2f' % float(i)
line += '| '+cycles+' '
line += '| '+instrName+'\n'
return line
def get_port_binding(self, port_bndg):
'''
Creates port binding out of scheduling result.
Parameters
----------
port_bndg : [int, ...]
Integer list containing port bindings
Returns
-------
str
String containing the port binding graphical output
'''
header = 'Port Binding in Cycles Per Iteration:\n'
horizLine = '-'*10+'-'*6*self.ports+'\n'
portLine = '| Port |'
for i in range(0, self.ports):
portLine += ' {} |'.format(i)
portLine += '\n'
cycLine = '| Cycles |'
for i in range(len(port_bndg)):
cycLine += ' {} |'.format(round(port_bndg[i], 2))
cycLine += '\n'
binding = header+horizLine+portLine+horizLine+cycLine+horizLine
return binding
def get_operand_suffix(self, instrForm):
    '''
    Derive the operand suffix of an instruction form.

    Parameters
    ----------
    instrForm : [str, Parameter, ..., Parameter, str]
        Instruction Form data structure; the first and the last entry are
        strings and are not part of the suffix

    Returns
    -------
    str
        Operand suffix for searching in database
    '''
    def describe(operand):
        # General purpose registers are identified by their size only
        if isinstance(operand, Register) and operand.reg_type == 'GPR':
            return 'r'+str(operand.size)
        if isinstance(operand, MemAddr):
            return 'mem'
        return str(operand).lower()

    return '_'.join(describe(operand) for operand in instrForm[1:-1])
if __name__ == '__main__':
    # Demo run: schedule a hand-written list of instruction forms. Every entry
    # is [mnemonic, operand, ..., operand, assembly text].
    kernel = [
        ['lea', Register('RAX'), MemAddr('%edx,(%rax,%rax,1)'), 'lea 0x1(%rax,%rax,1),%edx'],
        ['vcvtsi2ss', Register('XMM0'), Register('XMM0'), Register('RAX'), 'vcvtsi2ss %edx,%xmm2,%xmm2'],
        ['vmulss', Register('XMM0'), Register('XMM0'), Register('XMM0'), 'vmulss %xmm2,%xmm0, %xmm3'],
        ['lea', Register('RAX'), MemAddr('%edx,(%rax,%rax,1)'), 'lea 0x2(%rax,%rax,1),%ecx'],
        ['vaddss', Register('XMM0'), Register('XMM0'), Register('XMM0'), 'vaddss %xmm3,%xmm1,%xmm4'],
        ['vxorps', Register('XMM0'), Register('XMM0'), Register('XMM0'), 'vxorps %xmm1, %xmm1,%xmm1'],
        ['vcvtsi2ss', Register('XMM0'), Register('XMM0'), Register('RAX'), 'vcvtsi2ss %ecx,%xmm1, %xmm1'],
        ['vmulss', Register('XMM0'), Register('XMM0'), Register('XMM0'), 'vmulss %xmm1,%xmm0,%xmm5'],
        ['vmovss', MemAddr('%edx,(%rax,%rax,1)'), Register('XMM0'), 'vmovss %xmm4,0x4(%rsp,%rax,8)'],
        ['vaddss', Register('XMM0'), Register('XMM0'), Register('XMM0'), 'vaddss %xmm5,%xmm4,%xmm1'],
        ['vmovss', MemAddr('%edx,(%rax,%rax,1)'), Register('XMM0'), 'vmovss %xmm1,0x8(%rsp,%rax,8)'],
        ['inc', Register('RAX'), 'inc %rax'],
        ['cmp', Register('RAX'), Parameter('IMD'), 'cmp $0x1f3,%rax'],
        ['jb', Parameter('LBL'), 'jb 400bc2 <main+0x62>'],
    ]
    scheduler = Scheduler('ivb', kernel)
    report, port_pressure = scheduler.schedule()
    print(scheduler.get_port_binding(port_pressure))
    print(scheduler.get_report_info(), end='')
    print(report)
    # The busiest port determines the throughput of the whole block
    block_tp = round(max(port_pressure), 2)
    print('Block Throughput: {}'.format(block_tp))

247
osaca/get_instr.py Executable file
View File

@@ -0,0 +1,247 @@
#!/apps/python/3.5-anaconda/bin/python
import sys
import re
from testcase import *
from param import *
# Module state shared by the functions below.
# Comment that marks the loop to analyse in the high level source.
marker = r'//STARTLOOP'
# Matches a whitespace character, hex digits and a colon; check_line() uses
# it to recognise lines that contain assembly.
asm_line = re.compile(r'\s[0-9a-f]+[:]')
# Number of indentation characters found in front of the marker.
numSeps = 0
# Set to 2 when the marker is found and decremented by check_line(); lines
# are only analysed while it is greater than 0.
sem = 0
# Maps an instruction form to the number of times it was seen.
db = {}
# (instruction form, count) pairs ordered by count; filled by sort_db().
sorted_db = []
# Number of the line currently processed by extract_instr().
lncnt = 1
#cnt=0
# Name of the file passed to extract_instr().
fname = ""
# Indentation character (blank or tab), determined by set_counter_char().
cntChar = ''
# True until the first marker has been seen.
first = True
def extract_instr(asmFile):
    """
    Read a '.log' file line by line and hand every line to check_line().

    The module globals ``fname`` and ``lncnt`` are updated while reading.
    The program exits if asmFile does not end in '.log'. If the file cannot
    be opened a message is printed and the function returns without
    processing anything.

    Parameters
    ----------
    asmFile : str
        Path of the file to analyse
    """
    global lncnt
    global fname
    fname = asmFile
    # Check if parameter is in the correct file format
    if not asmFile.endswith(".log"):
        print("Invalid argument")
        sys.exit()
    # Open file; without a file there is nothing to analyse
    try:
        f = open(asmFile, "r")
    except IOError:
        print("IOError: File not found")
        return
    # Analyse code line by line and check the instructions. The with
    # statement closes the file even if a line cannot be processed.
    with f:
        lncnt = 1
        for line in f:
            check_line(line)
            lncnt += 1
def check_line(line):
    """
    Inspect one line and forward it to check_instr() if it is assembly code
    inside the marked snippet.

    Uses the module globals ``numSeps``, ``sem`` and ``first`` to keep track
    of the marker position between calls.

    Parameters
    ----------
    line : str
        Line of the analysed file
    """
    global numSeps
    global sem
    global first
    if marker in line:
        # The first marker decides whether the code is indented with
        # whitespaces or tabs
        if first:
            set_counter_char(line)
            first = False
        # Remember how far the marker is indented
        prefix = re.split(marker, line)[0]
        numSeps = prefix.count(cntChar)
        sem = 2
        return
    if sem <= 0:
        # Outside of the marked code snippet
        return
    if re.search(asm_line, line):
        # Assembly line: analyse it unless it carries a comment
        if r'//' not in line:
            check_instr(re.split(r'\t', line)[-1])
        return
    # Not an assembly line: check whether we are still in the loop
    leading = re.split(r'\S', line)[0]
    if leading.count(cntChar) <= numSeps:
        # Not in the loop anymore - or yet - so we decrement the semaphore
        sem -= 1
def set_counter_char(line):
    """
    Determine whether the text in front of the marker is indented with
    whitespaces or with tabs and store the result in the global ``cntChar``.

    Parameters
    ----------
    line : str
        Line containing the marker

    Raises
    ------
    NotImplementedError
        If the text in front of the marker contains both or neither of them
    """
    global cntChar
    prefix = re.split(marker, line)[0]
    has_spaces = prefix.count(" ") != 0
    has_tabs = prefix.count("\t") != 0
    # Exactly one kind of indentation character has to be present
    if has_spaces == has_tabs:
        raise NotImplementedError("Indentation of code is only supported for whitespaces and tabs.")
    cntChar = ' ' if has_spaces else '\t'
def check_instr(instr):
# Classify the operands of one instruction, count the resulting instruction
# form in the global db and write a testcase for it.
# instr : str -- instruction text (mnemonic followed by its operands)
global db
global lncnt
global cnt
global fname
#Check for strange clang padding bytes
# (each "data32" prefix and the character following it are cut off)
while(instr.startswith("data32")):
instr = instr[7:]
#Separate mnemonic and operands
mnemonic = instr.split()[0]
params = "".join(instr.split()[1:])
#Check if line is not only a byte
# (a two character hex "mnemonic" is skipped)
empty_byte = re.compile(r'[0-9a-f]{2}')
if(re.match(empty_byte, mnemonic) and len(mnemonic) == 2):
return
#Check if there's one or more operand and store all in a list
# param_list holds the string form, opList the operand objects
param_list = flatten(separate_params(params))
opList = list(param_list)
#Check operands and separate them by IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or LABEL (LBL)
for i in range(len(param_list)):
op = param_list[i]
if(len(op) <= 0):
op = Parameter("NONE")
elif(op[0] == '$'):
op = Parameter("IMD")
elif(op[0] == '%' and '(' not in op):
# Register; everything from a '{' on is cut off and reported via opmask
j = len(op)
opmask = False
if('{' in op):
j = op.index('{')
opmask = True
op = Register(op[1:j], opmask)
elif('<' in op):
op = Parameter("LBL")
else:
op = MemAddr(op)
param_list[i] = str(op)
opList[i] = op
#Join mnemonic and operand(s) to an instruction form
# (one tab after mnemonics longer than 7 characters, two tabs otherwise)
if(len(mnemonic) > 7):
tabs = "\t"
else:
tabs = "\t\t"
instr_form = mnemonic+tabs+(" ".join(param_list))
#Check in database for instruction form and increment the counter
if(instr_form in db):
db[instr_form] = db[instr_form]+1
else:
db[instr_form] = 1
#Create testcase for instruction form, since it is the first appearance of it
#But (as far as now) only for instr forms with only registers as operands
# NOTE(review): the comment above suggests the code below belongs to the else
# branch; the indentation is not visible here -- confirm.
# is_Reg = True
# for par in opList:
# print(par.print()+" is Register: "+str(isinstance(par, Register)))
# if(not isinstance(par, Register)):
# is_Reg = False
# if(is_Reg):
#print(mnemonic)
# print("create testcase for "+mnemonic+" with params:")
# for p in opList:
# print(p.print(),end=", ")
# print()
#Only create benchmark if no label (LBL) is part of the operands
# (operands whose string form is empty suppress the benchmark as well)
do_bench = True
for par in opList:
if(str(par) == 'LBL' or str(par) == ''):
do_bench = False
if(do_bench):
#Create testcase with reversed param list, due to the fact its intel syntax!
# create_testcase(mnemonic, list(reversed(opList)))
# print('menmonic: '+mnemonic+' ops: '+str(list(reversed(opList))))
tc = Testcase(mnemonic, list(reversed(opList)), '64')
tc.write_testcase()
# print("-----------")
def separate_params(params):
    """
    Split an operand string into a nested list of single operands.

    Commas inside the parentheses of a memory operand do not separate
    operands. If the string contains no comma, everything from a '#' on is
    cut off.

    Parameters
    ----------
    params : str
        Operands of an instruction without any whitespace

    Returns
    -------
    [[...[str]]]
        First operand followed by the nested list of the remaining ones
    """
    if ',' not in params:
        if '#' in params:
            # Single operand followed by a comment
            return [params[:params.index('#')]]
        return [params]
    cut = params.index(',')
    if ')' in params:
        close = params.index(')')
        if close < len(params)-1 and params[close+1] == ',':
            # Memory operand comes first, split right behind it
            cut = close+1
        elif params.index('(') < cut:
            # The only commas left belong to the memory operand
            return [params]
    return [params[:cut], separate_params(params[cut+1:])]
def sort_db():
    """
    Store the entries of the global db in sorted_db, most frequent first.
    """
    global sorted_db
    entries = list(db.items())
    entries.sort(key=lambda entry: entry[1], reverse=True)
    sorted_db = entries
def print_sorted_db():
    """
    Print all instruction forms with their number of occurrences, most
    frequent first, followed by the sum of all occurrences.
    """
    sort_db()
    total = 0
    print("Number of\tmnemonic")
    print("calls\n")
    for instr_form, calls in sorted_db:
        print(str(calls)+"\t\t"+instr_form)
        total += calls
    print("\nCumulated number of instructions: "+str(total))
def save_db():
    """
    Write the global db to '.cnt_asm_ops.db' in the current directory.

    Every line consists of the instruction form, a tab and the counter.
    """
    # The with statement closes the file even if a write fails
    with open(".cnt_asm_ops.db", "w") as db_file:
        for instr_form, calls in db.items():
            db_file.write(instr_form+"\t"+str(calls)+"\n")
def load_db():
    """
    Read '.cnt_asm_ops.db' from the current directory into the global db.

    Prints a message and returns if the file does not exist.
    """
    try:
        db_file = open(".cnt_asm_ops.db", "r")
    except FileNotFoundError:
        print("no database found in current directory")
        return
    # The with statement closes the file even if a line cannot be parsed
    with db_file:
        for line in db_file:
            # Strip only the line break, so the counter stays intact when the
            # last line has none
            fields = line.rstrip("\n").split("\t")
            if fields == [""]:
                # Skip empty lines
                continue
            mnemonic = fields[0]
            # Join mnemonic and operand(s) to an instruction form. Short
            # mnemonics are followed by two tabs, which yields an additional
            # empty field.
            if len(mnemonic) > 7:
                tabs = "\t"
                params = fields[1]
                numCalls = fields[2]
            else:
                tabs = "\t\t"
                params = fields[2]
                numCalls = fields[3]
            instr_form = mnemonic+tabs+params
            db[instr_form] = int(numCalls)
def flatten(l):
    """
    Flatten an arbitrarily nested list.

    Parameters
    ----------
    l : list
        Nested list

    Returns
    -------
    list
        All non-list elements in their original order; an empty list is
        returned unchanged
    """
    if l == []:
        return l
    flat = []
    for item in l:
        if isinstance(item, list):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat
# Script entry point: analyse every file given on the command line and print
# the collected instruction forms.
if __name__ == "__main__":
# load_db()
# r0 = Register("ymm0")
# r1 = Register("xmm0")
# r64 = Register("rax")
# r32 = Register("eax")
# mem0 = MemAddr('(%rax, %esi, 4)')
# tc = Testcase("XOR", [r32, r32], '64')
# tc.write_testcase()
# create_testcase("VADDPD", [r0, r0, r0])
# sys.argv[0] is the script itself, the files start at index 1
if(len(sys.argv) > 1):
for i in range(1,len(sys.argv)):
extract_instr(sys.argv[i])
print_sorted_db()
# save_db()

826
osaca/osaca.py Executable file
View File

@@ -0,0 +1,826 @@
#!/apps/python/3.5-anaconda/bin/python
import argparse
import sys
import subprocess
import os
import re
from param import *
from eu_sched import *
from testcase import *
import pandas as pd
from datetime import datetime
import numpy as np
# Analyses a marked code snippet for a given microarchitecture.
class Osaca(object):
# Microarchitecture name as passed to the constructor
arch = None
# Path of the file to analyse
filepath = None
# Content of the analysed file (list of lines or one string)
srcCode = None
# DataFrame read from the architecture specific csv file
df = None
# Instruction forms collected by check_instr()
instrForms = None
# Variables for checking lines
# Number of indentation characters in front of the marker
numSeps = 0
# Indentation character of the high level code
indentChar = ''
# Set to 2 at the marker and decremented by check_line(); lines are only
# analysed while it is greater than 0
sem = 0
# Comment marking the loop to analyse
marker = r'//STARTLOOP'
# Variables for creating output
# Length of the longest instruction seen, used for aligning the output
longestInstr = 30
# Constants
# Matches a whitespace character, hex digits and a colon
ASM_LINE = re.compile(r'\s[0-9a-f]+[:]')
# Matches every variation of the IACA start marker
IACA_SM = re.compile(r'\s*movl[ \t]+\$111[ \t]*,[ \t]*%ebx[ \t]*\n\s*\.byte[ \t]+100[ \t]*((,[ \t]*103[ \t]*((,[ \t]*144)|(\n\s*\.byte[ \t]+144)))|(\n\s*\.byte[ \t]+103[ \t]*((,[ \t]*144)|(\n\s*\.byte[ \t]+144))))')
# Matches every variation of the IACA end marker
IACA_EM = re.compile(r'\s*movl[ \t]+\$222[ \t]*,[ \t]*%ebx[ \t]*\n\s*\.byte[ \t]+100[ \t]*((,[ \t]*103[ \t]*((,[ \t]*144)|(\n\s*\.byte[ \t]+144)))|(\n\s*\.byte[ \t]+103[ \t]*((,[ \t]*144)|(\n\s*\.byte[ \t]+144))))')
# Store architecture and file path and start with an empty, instance specific
# list of instruction forms.
def __init__(self, _arch, _filepath):
self.arch = _arch
self.filepath = _filepath
self.instrForms = []
##-------------------main functions depending on arguments----------------------
def include_ibench(self):
"""
Reads ibench output and includes it in the architecture specific csv
file.

Every measurement is validated with validate_val(). Values of
instructions missing in the database are collected and appended, values
stored as -1 are filled in, and a deviation of more than 5% from an
already stored value is only reported.
"""
# Check args and exit program if something's wrong
if(not self.check_arch()):
print('Invalid microarchitecture.')
sys.exit()
if(not self.check_file()):
print('Invalid file path or file format.')
sys.exit()
# Check for database for the chosen architecture
self.df = self.read_csv()
# Create sequence of numbers and their reciprocals for validating the measurements
cycList,reciList = self.create_sequences()
print('Everything seems fine! Let\'s start!')
# Rows ([instr, TP, LT, ports]) for instructions not yet in the database
newData = []
addedValues = 0
for line in self.srcCode:
if('Using frequency' in line or len(line) == 0):
continue
clmn = 'LT'
# First word of the line without its last character
instr = line.split()[0][:-1]
if('TP' in line):
# We found a command with a throughput value. Get instruction and the number of
# clock cycles and remove the '-TP' suffix.
clmn = 'TP'
instr = instr[:-3]
# Otherwise it is a latency value. Nothing to do.
clkC = line.split()[1]
clkC_tmp = clkC
clkC = self.validate_val(clkC, instr, True if (clmn == 'TP') else False, cycList, reciList)
# validate_val() returns its input unchanged if the validation failed
txtOutput = True if (clkC_tmp == clkC) else False
val = -2
new = False
try:
entry = self.df.loc[lambda df: df.instr == instr,clmn]
val = entry.values[0]
except IndexError:
# Instruction not in database yet --> add it
new = True
# First check if LT or TP value has already been added before
for i,item in enumerate(newData):
if(instr in item):
if(clmn == 'TP'):
newData[i][1] = clkC
elif(clmn == 'LT'):
newData[i][2] = clkC
new = False
break
# '-1' and ((-1,),) are placeholders for values not known yet
if(new and clmn == 'TP'):
newData.append([instr,clkC,'-1',((-1,),)])
elif(new and clmn == 'LT'):
newData.append([instr,'-1',clkC,((-1,),)])
new = True
addedValues += 1
pass
# If val is -1 (= not filled with a valid value) add it immediately
if(val == -1):
# NOTE(review): DataFrame.set_value was removed in pandas 1.0 -- confirm the
# pandas version this is meant to run with.
self.df.set_value(entry.index[0], clmn, clkC)
addedValues += 1
continue
if(not new and abs((val/np.float64(clkC))-1) > 0.05):
print('Different measurement for {} ({}): {}(old) vs. {}(new)\nPlease check for correctness (no changes were made).'.format(instr, clmn, val, clkC))
txtOutput = True
if(txtOutput):
print()
txtOutput = False
# Now merge the DataFrames and write new csv file
# NOTE(review): DataFrame.append was removed in pandas 2.0 -- confirm the
# pandas version this is meant to run with.
self.df = self.df.append(pd.DataFrame(newData, columns=['instr','TP','LT','ports']), ignore_index=True)
csv = self.df.to_csv(index=False)
self.write_csv(csv)
print('ibench output {} successfully in database included.'.format(self.filepath.split('/')[-1]))
print('{} values were added.'.format(addedValues))
def inspect_binary(self):
    """
    Main function of OSACA. Inspect binary file and create analysis.

    Exits the program if the architecture or the file is invalid. Otherwise
    the database is loaded, every line of the disassembly is checked and the
    analysis is printed.
    """
    # Check args and exit program if something's wrong
    if not self.check_arch():
        print('Invalid microarchitecture.')
        sys.exit()
    if not self.check_elffile():
        print('Invalid file path or file format.')
        sys.exit()
    # Finally load the database for the chosen architecture and keep it,
    # create_TP_list() reads it from self.df
    self.df = self.read_csv()
    print('Everything seems fine! Let\'s start checking!')
    for i, line in enumerate(self.srcCode):
        # The first line is flagged as first appearance
        self.check_line(line, i == 0)
    output = self.create_output()
    print(output)
def inspect_with_iaca(self):
    """
    Main function of OSACA with IACA markers instead of OSACA marker.
    Inspect binary or assembly file and create analysis.

    Exits the program if the architecture or the file is invalid.
    """
    # Check args and exit program if something's wrong
    if not self.check_arch():
        print('Invalid microarchitecture.')
        sys.exit()
    # Check if input file is a binary or assembly file. A TypeError or
    # IndexError raised by check_elffile() is taken as "not a binary".
    try:
        binaryFile = True
        if not self.check_elffile():
            print('Invalid file path or file format.')
            sys.exit()
    except (TypeError, IndexError):
        binaryFile = False
        # Assembly files are stored as one string
        if not self.check_file(True):
            print('Invalid file path or file format.')
            sys.exit()
    # Finally load the database for the chosen architecture and keep it,
    # create_TP_list() reads it from self.df
    self.df = self.read_csv()
    print('Everything seems fine! Let\'s start checking!')
    if binaryFile:
        self.iaca_bin()
    else:
        self.iaca_asm()
    output = self.create_output()
    print(output)
##------------------------------------------------------------------------------
def check_arch(self):
    """
    Tell whether the chosen architecture is one of the supported ones.

    Returns
    -------
    bool
        True if arch is supported
        False if arch is not supported
    """
    supported = ('SNB', 'IVB', 'HSW', 'BDW', 'SKL')
    return self.arch in supported
def check_elffile(self):
    """
    Load the disassembly of the given file into srcCode and tell whether the
    file is an elf64 file.

    Returns
    -------
    bool
        True if file is expected elf64 file
        False if file does not exist or is not an elf64 file
    """
    if not os.path.isfile(self.filepath):
        return False
    self.store_srcCode_elf()
    # The file format is looked up in the second line of the listing
    return 'file format elf64' in self.srcCode[1]
def check_file(self, iacaFlag=False):
    """
    Load the given file into srcCode if it exists.

    Parameters
    ----------
    iacaFlag : bool
        store file data as a string in attribute srcCode if True,
        store it as a list of strings (lines) if False (default False)

    Returns
    -------
    bool
        True if file exists
        False if file does not exist
    """
    exists = os.path.isfile(self.filepath)
    if exists:
        self.store_srcCode(iacaFlag)
    return exists
def store_srcCode_elf(self):
    """
    Run 'objdump --source' on the binary file (compiled with '-g') and store
    its output line by line in class attribute srcCode.
    """
    command = ['objdump', '--source', self.filepath]
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    listing = completed.stdout.decode('utf-8')
    self.srcCode = listing.split('\n')
def store_srcCode(self, iacaFlag=False):
    """
    Load arbitrary file in class attribute srcCode.

    Parameters
    ----------
    iacaFlag : bool
        store file data as a string in attribute srcCode if True,
        store it as a list of strings (lines) if False (default False)

    Raises
    ------
    IOError
        If the file cannot be opened; a message is printed beforehand
    """
    try:
        # The with statement closes the file even if reading fails
        with open(self.filepath, 'r') as f:
            content = f.read()
    except IOError:
        print('IOError: file \'{}\' not found'.format(self.filepath))
        # Without the file content there is nothing to continue with
        raise
    self.srcCode = content if iacaFlag else content.split('\n')
def read_csv(self):
    """
    Load the csv file of the chosen architecture from the data directory
    located next to this file.

    Returns
    -------
    DataFrame
        CSV as DataFrame object
    """
    # Directory of this file: everything in front of the last '/'
    script_dir = os.path.realpath(__file__).rpartition('/')[0]
    csv_name = self.arch.lower()+'_data.csv'
    return pd.read_csv(script_dir+'/data/'+csv_name)
def write_csv(self, csv):
    """
    Writes architecture dependent CSV into the data directory located next
    to this file.

    Parameters
    ----------
    csv : str
        CSV data as string

    Raises
    ------
    IOError
        If the file cannot be opened; a message is printed beforehand
    """
    file_name = self.arch.lower()+'_data.csv'
    # Use the directory read_csv() reads from instead of the current working
    # directory, so the database that was loaded is the one being updated
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
    try:
        f = open(os.path.join(data_dir, file_name), 'w')
    except IOError:
        print('IOError: file \'{}\' not found in ./data'.format(file_name))
        raise
    # The with statement closes the file even if writing fails
    with f:
        f.write(csv)
def create_sequences(self, end=101):
    """
    Build the integers from 1 up to (but excluding) end and their
    reciprocals.

    Parameters
    ----------
    end : int
        End value for list of integers, not included (default 101)

    Returns
    -------
    [int]
        cycList of integers
    [float]
        reciList of floats
    """
    cycList = list(range(1, end))
    reciList = [1/cycles for cycles in cycList]
    return cycList, reciList
def validate_val(self, clkC, instr, isTP, cycList, reciList):
    """
    Validate given clock cycle clkC and return rounded value in case of
    success.

    The validation succeeds if clkC deviates less than 5% from an integer
    of cycList or - for throughput values only - from a reciprocal of
    reciList. Otherwise a warning is printed.

    Parameters
    ----------
    clkC : float
        Clock cycle to validate
    instr : str
        Instruction for warning output
    isTP : bool
        True if a throughput value is to check, False for a latency value
    cycList : [int]
        Cycle list for validating
    reciList : [float]
        Reciprocal cycle list for validating

    Returns
    -------
    float
        The matching entry of cycList or reciList, or the given clkC
        parameter if none matches
    """
    for idx, cycles in enumerate(cycList):
        measured = float(clkC)
        if cycles*0.95 < measured < cycles*1.05:
            # Value is probably correct, so round it to the estimated value
            return cycles
        # Check reciprocal only if it is a throughput value
        if isTP:
            reciprocal = reciList[idx]
            if reciprocal*0.95 < measured < reciprocal*1.05:
                return reciprocal
    # No value close to an integer or its reciprocal found, we assume the
    # measurement is incorrect
    clmn = 'TP' if isTP else 'LT'
    print('Your measurement for {} ({}) is probably wrong. Please inspect your benchmark!'.format(instr, clmn))
    print('The program will continue with the given value')
    return clkC
def check_line(self, line, firstAppearance=False):
    """
    Inspect line of source code and process it if inside the marked snippet.

    Parameter
    ---------
    line : str
        Line of source code
    firstAppearance : bool
        If True and the line contains the marker, the indentation character
        is determined from this line (default False)
    """
    if self.marker in line:
        # First, check if high level code is indented with whitespaces or tabs
        if firstAppearance:
            self.indentChar = self.get_indent_chars(line)
        # Now count the indentation characters in front of the marker
        prefix = re.split(self.marker, line)[0]
        self.numSeps = prefix.count(self.indentChar)
        self.sem = 2
        return
    if self.sem <= 0:
        # Outside of the marked code snippet
        return
    if re.search(self.ASM_LINE, line):
        # Assembly line: analyse it unless it carries a comment
        if r'//' not in line:
            self.check_instr(re.split(r'\t', line)[-1])
        return
    # Not an assembly line: check whether we are still in the loop
    leading = re.split(r'\S', line)[0]
    if leading.count(self.indentChar) <= self.numSeps:
        # Not in the loop anymore - or yet. We decrement the semaphore
        self.sem -= 1
def get_indent_chars(self, line):
    """
    Determine whether the text in front of the marker is indented with
    whitespaces or with tabulators.

    Parameters
    ----------
    line : str
        Line with start marker in it

    Returns
    -------
    str
        Indentation character as string

    Raises
    ------
    NotImplementedError
        If the text in front of the marker contains both or neither of them
    """
    prefix = re.split(self.marker, line)[0]
    has_spaces = prefix.count(' ') != 0
    has_tabs = prefix.count('\t') != 0
    # Exactly one kind of indentation character has to be present
    if has_spaces == has_tabs:
        raise NotImplementedError('Indentation of code is only supported for whitespaces and tabs.')
    return ' ' if has_spaces else '\t'
def iaca_bin(self):
    """
    Extract instruction forms out of binary file using IACA markers.

    Every assembly line between the first and the second line containing
    'fs addr32 nop' is passed to check_instr().
    """
    self.marker = r'fs addr32 nop'
    for line in self.srcCode:
        if self.marker in line:
            # Count the markers passed so far
            self.sem += 1
            continue
        if self.sem == 2:
            # Not in the loop anymore. Due to the fact it's the IACA marker
            # we can stop here after removing the last line which belongs
            # to the IACA marker
            del self.instrForms[-1:]
            return
        if self.sem != 1:
            # Marked code snippet not reached yet
            continue
        # We're in the marked code snippet: skip everything that is not
        # assembly code or carries a comment
        if not re.search(self.ASM_LINE, line):
            continue
        if r'//' in line:
            continue
        # Do the same instruction check as for the OSACA marker line check
        self.check_instr(re.split(r'\t', line)[-1])
def iaca_asm(self):
    """
    Extract instruction forms out of assembly file using IACA markers.

    Expects srcCode to hold the whole file as one string. Every line between
    the start and the end marker is passed to check_instr(), except for
    empty lines, lines containing '//' and lines starting with '..'.
    """
    code = self.srcCode
    # Drop leading lines until the code starts with the start marker
    while not re.match(self.IACA_SM, code):
        code = code.split('\n', 1)[1]
    # Continue with the line following the first '144', i.e. the last byte
    # of the start marker
    code = code.split('144', 1)[1].split('\n', 1)[1]
    # Collect lines until the code starts with the end marker
    kernel = []
    while not re.match(self.IACA_EM, code):
        pieces = code.split('\n', 1)
        kernel.append(pieces[0])
        code = pieces[1]
    # Go on like with OSACA markers
    for line in kernel:
        # Remove trailing comment and indentation
        stripped = line.split('#')[0].lstrip()
        if len(stripped) == 0 or '//' in stripped or stripped.startswith('..'):
            continue
        self.check_instr(stripped)
def check_instr(self, instr):
    """
    Classify the operands of an instruction, add the resulting instruction
    form to instrForms and write the testcases for it if they do not exist.

    Parameters
    ----------
    instr : str
        Instruction as string
    """
    def classify(op):
        # Map an operand string to IMMEDIATE (IMD), REGISTER (REG),
        # MEMORY (MEM) or LABEL (LBL)
        if len(op) <= 0:
            return Parameter('NONE')
        if op[0] == '$':
            return Parameter('IMD')
        if op[0] == '%' and '(' not in op:
            # Everything from a '{' on is cut off and reported via opmask
            opmask = '{' in op
            end = op.index('{') if opmask else len(op)
            return Register(op[1:end], opmask)
        if '<' in op or op.startswith('.'):
            return Parameter('LBL')
        return MemAddr(op)

    # Check for strange clang padding bytes
    while instr.startswith('data32'):
        instr = instr[7:]
    # Separate mnemonic and operands
    tokens = instr.split()
    mnemonic = tokens[0]
    params = ''.join(tokens[1:])
    # Check if line is not only a byte
    if re.fullmatch(r'[0-9a-f]{2}', mnemonic):
        return
    # Check if there's one or more operands and classify every one of them
    param_list_types = [classify(op) for op in self.flatten(self.separate_params(params))]
    param_list = [str(op) for op in param_list_types]
    # Add to list
    if len(instr) > self.longestInstr:
        self.longestInstr = len(instr)
    # Operands are stored in reversed order, due to the fact it's intel syntax
    reversed_ops = list(reversed(param_list_types))
    self.instrForms.append([mnemonic]+reversed_ops+[instr])
    # Only create benchmark if no label (LBL) is part of the operands
    if 'LBL' in param_list or '' in param_list:
        return
    tc = Testcase(mnemonic, list(reversed(param_list_types)), '64')
    # Only write a testcase if it not already exists
    writeTP, writeLT = tc._Testcase__is_in_dir()
    tc.write_testcase(not writeTP, not writeLT)
def separate_params(self, params):
    """
    Delete comments, separates parameters and return them as a list.

    Commas inside the parentheses of a memory operand do not separate
    parameters. A comment is only removed if the string contains no comma.

    Parameters
    ----------
    params : str
        Splitted line after mnemonic

    Returns
    -------
    [[...[str]]]
        Nested list of strings. The number of nest levels depend on the
        number of parameters given.
    """
    if ',' not in params:
        if '#' in params:
            # Single parameter followed by a comment
            return [params[:params.index('#')]]
        return [params]
    cut = params.index(',')
    if ')' in params:
        close = params.index(')')
        if close < len(params)-1 and params[close+1] == ',':
            # Memory operand comes first, split right behind it
            cut = close+1
        elif params.index('(') < cut:
            # The only commas left belong to the memory operand
            return [params]
    return [params[:cut], self.separate_params(params[cut+1:])]
def flatten(self, l):
    """
    Flatten a nested list of strings.

    Parameters
    ----------
    l : [[...[str]]]
        Nested list of strings

    Returns
    -------
    [str]
        List of strings in their original order; an empty list is returned
        unchanged
    """
    if l == []:
        return l
    flat = []
    # One iterator per nesting level; the innermost one is on top
    levels = [iter(l)]
    while levels:
        for item in levels[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator resumes afterwards
                levels.append(iter(item))
                break
            flat.append(item)
        else:
            levels.pop()
    return flat
def create_output(self, tp_list=False, pr_sched=True):
    """
    Creates output of analysed file including a time stamp.

    Parameters
    ----------
    tp_list : bool
        Boolean for indicating the need for the throughput list as output
        (default False)
    pr_sched : bool
        Boolean for indicating the need for predicting a scheduling
        (default True)

    Returns
    -------
    str
        OSACA output
    """
    # Check the output alignment depending on the longest instruction
    if self.longestInstr > 70:
        self.longestInstr = 70
    horizLine = self.create_horiz_sep()
    # Write general information about the benchmark
    parts = [
        '--'+horizLine+'\n',
        '| Analyzing of file:\t'+os.path.abspath(self.filepath)+'\n',
        '| Architecture:\t\t'+self.arch+'\n',
        '| Timestamp:\t\t'+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+'\n',
    ]
    if tp_list:
        parts.append(self.create_TP_list(horizLine))
    if pr_sched:
        parts.append('\n\n')
        sched = Scheduler(self.arch, self.instrForms)
        schedOutput, portBinding = sched.schedule()
        binding = sched.get_port_binding(portBinding)
        parts.append(sched.get_report_info()+'\n'+binding+'\n\n'+schedOutput)
        # The highest port binding is reported as the total value
        blockTP = round(max(portBinding), 2)
        parts.append('Total number of estimated throughput: '+str(blockTP))
    return ''.join(parts)
def create_horiz_sep(self):
    """
    Build the horizontal separator line of the output.

    Returns
    -------
    str
        Line of dashes, 8 characters longer than the longest instruction
    """
    width = self.longestInstr+8
    return '-'*width
def create_TP_list(self, horizLine):
    """
    Create list of instruction forms with the proper throughput value.

    For every instruction form the throughput is looked up in the database
    (self.df). If the exact form is unknown, the lookup is repeated with the
    non-register operands replaced by register operands. Forms without any
    value are listed with a throughput of 0 and marked with '*'.

    Parameter
    ---------
    horizLine : str
        Calculated horizontal line for nice alignment

    Returns
    -------
    str
        Throughput list output for printing
    """
    import warnings
    # Silence the 'This pattern has match groups' warning issued during the
    # str.contains() look-ups below
    warnings.filterwarnings("ignore", 'This pattern has match groups')
    warning = False
    ws = ' '*(len(horizLine)-23)
    output = ('\n| INSTRUCTION'+ws+'CLOCK CYCLES\n'
              '| '+horizLine+'\n|\n')
    # Check for the throughput data in CSV
    for elem in self.instrForms:
        # Build the operand suffix; first and last entry are no operands
        opExt = []
        for operand in elem[1:-1]:
            if isinstance(operand, Register) and operand.reg_type == 'GPR':
                optmp = 'r'+str(operand.size)
            elif isinstance(operand, MemAddr):
                optmp = 'mem'
            else:
                optmp = str(operand).lower()
            opExt.append(optmp)
        operands = '_'.join(opExt)
        # Now look up the value in the dataframe
        # Check if there is a stored throughput value in database
        series = self.df['instr'].str.contains(elem[0]+'-'+operands)
        if True in series.values:
            # It's a match!
            notFound = False
            try:
                tp = self.df[self.df.instr == elem[0]+'-'+operands].TP.values[0]
            except IndexError:
                # Something went wrong
                print('Error while fetching data from database')
                continue
        # Did not find the exact instruction form.
        # Try to find the instruction form for register operands only
        else:
            # Find out which operands are registers; Register() raises a
            # KeyError for everything else
            opExtRegs = []
            for operand in opExt:
                try:
                    Register(operand)
                    opExtRegs.append(True)
                except KeyError:
                    opExtRegs.append(False)
            if True not in opExtRegs:
                # No register in whole instruction form. How can I find out
                # what regsize we need?
                print('Feature not included yet: ', end='')
                print(elem[0]+' for '+operands)
                tp = 0
                notFound = True
                warning = True
                numWhitespaces = self.longestInstr-len(elem[-1])
                ws = ' '*numWhitespaces+'| '
                n_f = ' '*(5-len(str(tp)))+'*'
                data = '| '+elem[-1]+ws+str(tp)+n_f+'\n'
                output += data
                continue
            if opExtRegs[0] == False:
                # Instruction stores result in memory. Check for storing in
                # register instead.
                if len(opExt) > 1:
                    if opExtRegs[1] == True:
                        opExt[0] = opExt[1]
                    elif len(opExt) > 2:
                        if opExtRegs[2] == True:
                            opExt[0] = opExt[2]
            if len(opExtRegs) == 2 and opExtRegs[1] == False:
                # Instruction loads value from memory and has only two
                # operands. Check for loading from register instead
                if opExtRegs[0] == True:
                    opExt[1] = opExt[0]
            if len(opExtRegs) == 3 and opExtRegs[2] == False:
                # Instruction loads value from memory and has three operands.
                # Check for loading from register instead
                opExt[2] = opExt[0]
            operands = '_'.join(opExt)
            # Check for register equivalent instruction
            series = self.df['instr'].str.contains(elem[0]+'-'+operands)
            if True in series.values:
                # It's a match!
                notFound = False
                try:
                    tp = self.df[self.df.instr == elem[0]+'-'+operands].TP.values[0]
                except IndexError:
                    # Something went wrong
                    print('Error while fetching data from database')
                    continue
            # Did not find the register instruction form. Set warning and go
            # on with throughput 0
            else:
                tp = 0
                notFound = True
                warning = True
        # Check the alignment again
        numWhitespaces = self.longestInstr-len(elem[-1])
        ws = ' '*numWhitespaces+'| '
        n_f = ''
        if notFound:
            n_f = ' '*(5-len(str(tp)))+'*'
        data = '| '+elem[-1]+ws+'{:3.2f}'.format(tp)+n_f+'\n'
        output += data
    # Finally end the list of throughput values
    output += '| '+horizLine+'\n'
    if warning:
        output += ('\n\n* There was no throughput value found '
                   'for the specific instruction form.'
                   '\n Please create a testcase via the create_testcase-method '
                   'or add a value manually.')
    return output
##------------------------------------------------------------------------------
##------------Main method--------------
def main():
    """
    Parse the command line and run the requested action.

    Depending on the arguments this includes ibench output into the
    database, analyses a file using IACA markers, inserts IACA markers via
    kerncraft or - by default - analyses a binary using the OSACA marker.

    Raises
    ------
    ValueError
        If no architecture is given although the action needs one
    """
    # Parse args
    parser = argparse.ArgumentParser(description='Analyzes a marked innermost loop snippet for a given architecture type and prints out the estimated average throughput')
    parser.add_argument('-V', '--version', action='version', version='%(prog)s 0.1')
    parser.add_argument('--arch', dest='arch', type=str, help='define architecture (SNB, IVB, HSW, BDW, SKL)')
    parser.add_argument('filepath', type=str, help='path to object (Binary, ASM, CSV)')
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('-i', '--include-ibench', dest='incl', action='store_true', help='includes the given values in form of the output of ibench in the database')
    group.add_argument('--iaca', dest='iaca', action='store_true', help='search for IACA markers instead the OSACA marker')
    group.add_argument('-m', '--insert-marker', dest='insert_marker', action='store_true', help='try to find blocks probably corresponding to loops in assembly and insert IACA marker')
    inp = parser.parse_args()
    # Only inserting markers works without an architecture. store_true flags
    # are False (never None) when absent, so test their truth value.
    if inp.arch is None and not inp.insert_marker:
        raise ValueError('Please specify an architecture')
    filepath = inp.filepath
    inclIbench = inp.incl
    iacaFlag = inp.iaca
    insert_m = inp.insert_marker
    # Create Osaca object
    if inp.arch is not None:
        arch = inp.arch.upper()
        osaca = Osaca(arch, filepath)
    if inclIbench:
        osaca.include_ibench()
    elif iacaFlag:
        osaca.inspect_with_iaca()
    elif insert_m:
        # kerncraft is only needed for inserting markers
        try:
            from kerncraft import iaca
        except ImportError:
            print('ImportError: Module kerncraft not installed. Use \'pip install --user kerncraft\' for installation.\nFor more information see https://github.com/RRZE-HPC/kerncraft')
            sys.exit()
        iaca.iaca_instrumentation(input_file=filepath, output_file=filepath,
                                  block_selection='manual', pointer_increment=1)
    else:
        osaca.inspect_binary()


##------------Main method--------------
if __name__ == '__main__':
    main()

109
osaca/param.py Executable file
View File

@@ -0,0 +1,109 @@
#!/apps/python/3.5-anaconda/bin/python
class Parameter(object):
    """Operand of an instruction, identified by one of the types in type_list."""

    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name="NONE"):
        """
        Store the upper case form of ptype; name is accepted but not used.

        Raises NameError if the type is not contained in type_list.
        """
        normalized = ptype.upper()
        self.ptype = normalized
        if normalized not in self.type_list:
            raise NameError("Type not supported: "+ptype)

    def __str__(self):
        '''returns the type, or an empty string for the type "NONE"'''
        return "" if self.ptype == "NONE" else self.ptype
class MemAddr(Parameter):
    """Memory operand; records which parts of the address are present."""

    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """
        Analyse the operand string name and set the flags sreg, offset, base,
        index and scale accordingly.

        Raises NameError if the segment register or the content of the
        parentheses is not supported.
        """
        self.sreg = False
        self.offset = False
        self.base = False
        self.index = False
        self.scale = False
        if ':' in name:
            # Text between the first character and the colon has to be a
            # segment register
            segment = name[1:name.index(':')].upper()
            if segment not in self.segment_regs:
                raise NameError("Type not supported: "+name)
            self.sreg = True
            self.offset = True
        if '(' not in name:
            self.offset = True
            return
        opening = name.index('(')
        if opening != 0:
            # Something precedes the parentheses
            self.offset = True
        # Text between the opening parenthesis and the last character
        self.parentheses = name[opening+1:-1]
        self.commacnt = self.parentheses.count(',')
        if self.commacnt == 0:
            self.base = True
            return
        if self.commacnt == 1:
            self.base = True
            self.index = True
            return
        if self.commacnt == 2:
            # The last character is the scale; a scale of 1 is not recorded
            factor = int(self.parentheses[-1:])
            if factor == 1:
                self.base = True
                self.index = True
                return
            if factor in self.scales:
                self.base = True
                self.index = True
                self.scale = True
                return
        raise NameError("Type not supported: "+name)

    def __str__(self):
        '''returns string representation'''
        pieces = ["MEM("]
        if self.sreg:
            pieces.append("sreg:")
        if self.offset:
            pieces.append("offset")
        if self.base:
            if not self.index:
                pieces.append("(base)")
            elif self.scale:
                pieces.append("(base, index, scale)")
        pieces.append(")")
        return "".join(pieces)
def _build_register_sizes():
    """Return the table mapping upper-case register names to ``(size in bits, type)``."""
    table = {}

    def add(names, bits, reg_type):
        for reg_name in names:
            table[reg_name] = (bits, reg_type)

    def numbered(prefix, count):
        return ["{}{}".format(prefix, num) for num in range(count)]

    extended = ["R{}".format(num) for num in range(8, 16)]  # R8 .. R15

    # General Purpose Registers
    add(["AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL", "BPL", "SIL", "DIL", "SPL"], 8, "GPR")
    add([reg + "L" for reg in extended] + [reg + "B" for reg in extended], 8, "GPR")
    # "BX" replaces the former typo "BC", which made Register('bx') fail.
    add(["AX", "BX", "CX", "DX", "BP", "SI", "DI", "SP"] + [reg + "W" for reg in extended],
        16, "GPR")
    add(["EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP"]
        + [reg + "D" for reg in extended], 32, "GPR")
    add(["RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP"] + extended, 64, "GPR")
    # Segment, flag and instruction pointer registers are filed under "GPR" as well
    add(["CS", "DS", "SS", "ES", "FS", "GS"], 16, "GPR")
    add(["EFLAGS", "EIP"], 32, "GPR")
    add(["RFLAGS", "RIP"], 64, "GPR")
    # FPU Registers
    add(numbered("ST", 8), 80, "FPU")
    # MMX Registers
    add(numbered("MM", 8), 64, "MMX")
    # XMM / YMM / ZMM Registers
    add(numbered("XMM", 32), 128, "XMM")
    add(numbered("YMM", 32), 256, "YMM")
    add(numbered("ZMM", 32), 512, "ZMM")
    # Opmask Registers
    add(numbered("K", 8), 64, "K")
    # Bounds Registers
    add(numbered("BND", 4), 128, "BND")
    # Registers in general (placeholders naming a register class, not one register).
    # NOTE(review): the generic 8-bit placeholder "R8" collides with the 64-bit
    # register R8 and, exactly as in the original table, wins because it is
    # added last -- Register('r8') therefore reports 8 bits.  Confirm which
    # meaning the callers need before changing this.
    add(["R8"], 8, "GPR")
    add(["R16"], 16, "GPR")
    add(["R32"], 32, "GPR")
    add(["R64"], 64, "GPR")
    add(["FPU"], 80, "FPU")
    add(["MMX"], 64, "MMX")
    add(["XMM"], 128, "XMM")
    add(["YMM"], 256, "YMM")
    add(["ZMM"], 512, "ZMM")
    add(["K"], 64, "K")
    add(["BND"], 128, "BND")
    return table


class Register(Parameter):
    """Register operand with its size in bits and its register type."""

    # Upper-case register name -> (size in bits, register type)
    sizes = _build_register_sizes()

    def __init__(self, name, mask=False):
        """
        Look up size and type of the register.

        Parameters
        ----------
        name : str
            Register name without prefix; matched case-insensitively
        mask : bool
            Controls if "{opmask}" is appended to the string representation
            (default False)

        Raises
        ------
        NameError
            If the register name is not contained in ``sizes``.
        """
        # Initialise the Parameter part so that ``ptype`` exists on registers too.
        super().__init__("REG", name)
        self.name = name.upper()
        self.mask = mask
        # The former check ``if[name in self.sizes]`` tested a non-empty list and
        # was always true, so unknown names surfaced as a bare KeyError.
        try:
            self.size, self.reg_type = self.sizes[self.name]
        except KeyError:
            raise NameError("Register name not in dictionary: "+self.name) from None

    def __str__(self):
        '''Return the register type, followed by "{opmask}" if a mask is set.'''
        opmask = "{opmask}" if self.mask else ""
        return self.reg_type+opmask

367
osaca/testcase.py Executable file
View File

@@ -0,0 +1,367 @@
#!/apps/python/3.5-anaconda/bin/python
import os
from subprocess import call
from math import ceil
from param import *
class Testcase(object):
    """
    Build assembly benchmark files (latency and throughput) for one instruction form.

    All text fragments of the generated file are assembled in the constructor;
    ``write_testcase`` concatenates them and writes the ``.S`` files.
    """
    ##------------------Constant variables--------------------------
    # Lookup tables for regs.
    # r8 and rdi do not appear in the GPR tables; the generated code uses r8d as
    # loop counter and edi as iteration count (see the '#define' lines of the
    # header), which is presumably why they are left out.
    gprs64 = ['rax', 'rbx', 'rcx', 'rdx', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15']
    gprs32 = ['eax', 'ebx', 'ecx', 'edx', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d']
    gprs16 = ['ax', 'bx', 'cx', 'dx', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w']
    gprs8 = ['al', 'bl', 'cl', 'dl', 'r9l', 'r10l', 'r11l', 'r12l', 'r13l', 'r14l', 'r15l']
    fpus = ['st0', 'st1', 'st2', 'st3', 'st4', 'st5', 'st6', 'st7']
    mmxs = ['mm0', 'mm1', 'mm2', 'mm3', 'mm4', 'mm5', 'mm6', 'mm7']
    ks = ['k0', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7']
    # NOTE(review): Register.sizes in param.py only knows BND0-BND3; bnd4-bnd7
    # look like invalid register names -- confirm before relying on them.
    bnds = ['bnd0', 'bnd1', 'bnd2', 'bnd3', 'bnd4', 'bnd5', 'bnd6', 'bnd7']
    xmms = ['xmm0', 'xmm1', 'xmm2', 'xmm3', 'xmm4', 'xmm5', 'xmm6', 'xmm7', 'xmm8', 'xmm9',
            'xmm10', 'xmm11', 'xmm12', 'xmm13', 'xmm14', 'xmm15']
    ymms = ['ymm0', 'ymm1', 'ymm2', 'ymm3', 'ymm4', 'ymm5', 'ymm6', 'ymm7', 'ymm8', 'ymm9',
            'ymm10', 'ymm11', 'ymm12', 'ymm13', 'ymm14', 'ymm15']
    zmms = ['zmm0', 'zmm1', 'zmm2', 'zmm3', 'zmm4', 'zmm5', 'zmm6', 'zmm7', 'zmm8', 'zmm9',
            'zmm10', 'zmm11', 'zmm12', 'zmm13', 'zmm14', 'zmm15']
    # Lookup table for memory (every entry addresses the PI constant of the header)
    mems = ['[rip+PI]'] * 8
    # Lookup table for immediates
    imds = ['1', '2', '13', '22', '8', '78', '159', '222', '3', '9', '5', '55', '173', '317',
            '254', '255']
    # TODO Differentiate between AVX512 (with additional xmm16-31) and the rest
    # ...
    # ...
    # end TODO
    # Operand type -> lookup table with concrete operands of that type
    ops = {'gpr64': gprs64, 'gpr32': gprs32, 'gpr16': gprs16, 'gpr8': gprs8, 'fpu': fpus,
           'mmx': mmxs, 'k': ks, 'bnd': bnds, 'xmm': xmms, 'ymm': ymms, 'zmm': zmms,
           'mem': mems, 'imd': imds}
    # Create Single Precision 1.0 (not used by the methods of this class)
    sp1 = ('\t\t# create SP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\n'
           '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n'
           '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n'
           '\t\t# copy SP 1.0\n')
    # Create Double Precision 1.0
    dp1 = ('\t\t# create DP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n'
           '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n'
           '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n')
    # Create epilogue
    done = ('done:\n'
            '\t\tmov\trsp, rbp\n'
            '\t\tpop\trbp\n'
            '\t\tret\n'
            '.size latency, .-latency')
    ##----------------------------------------------------------------

    # Constructor
    def __init__(self, _mnemonic, _param_list, _num_instr='32'):
        """
        Prepare all text fragments of the testcase.

        Parameters
        ----------
        _mnemonic : str
            Mnemonic of the instruction to benchmark
        _param_list : list
            Operands of the instruction (Register, MemAddr or Parameter objects)
        _num_instr : str or int
            Number of instructions per loop; rounded up to the next even number
            (default '32')

        Raises
        ------
        ValueError
            If ``_param_list`` is empty.
        """
        self.instr = _mnemonic.lower()
        self.param_list = _param_list
        # num_instr must be an even number (the latency loop emits instruction pairs)
        self.num_instr = str(ceil(int(_num_instr)/2)*2)
        # Check for the number of operands and initialise the GPRs if necessary
        (self.op_a, self.op_b, self.op_c, self.gprPush, self.gprPop,
         self.zeroGPR, self.copy) = self.__define_operands()
        self.num_operands = len(self.param_list)
        # Create asm header
        self.def_instr, self.ninstr, self.init, self.expand = self.__define_header()
        # Create latency and throughput loop
        self.loop_lat = self.__define_loop_lat()
        self.loop_thrpt = self.__define_loop_thrpt()

        # Create extension for testcase name, e.g. '-xmm_xmm_xmm' or '-r64_imd'
        def short_name(op):
            return 'r' + op[3:] if 'gpr' in op else op

        op_types = (self.op_a, self.op_b, self.op_c)[:self.num_operands]
        self.extension = '-' + '_'.join(short_name(op) for op in op_types)

    @staticmethod
    def __testcase_dir():
        """Return the directory next to the package directory that holds the testcases."""
        return os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'testcases')

    def write_testcase(self, TP=True, LT=True):
        """
        Write testcase for class attributes in a file.

        Parameters
        ----------
        TP : bool
            Controls if throughput testcase should be written
            (default True)
        LT : bool
            Controls if latency testcase should be written
            (default True)
        """
        if not (TP or LT):
            return
        out_dir = self.__testcase_dir()
        # Create the directory for both kinds of testcases; it used to be created
        # for the latency file only.
        os.makedirs(out_dir, exist_ok=True)
        prologue = (self.def_instr + self.ninstr + self.init + self.dp1 + self.expand
                    + self.gprPush + self.zeroGPR + self.copy)
        epilogue = self.gprPop + self.done
        base_name = os.path.join(out_dir, self.instr + self.extension)
        if LT:
            # Write latency file
            with open(base_name + '.S', 'w') as f:
                f.write(prologue + self.loop_lat + epilogue)
        if TP:
            # Write throughput file
            with open(base_name + '-TP.S', 'w') as f:
                f.write(prologue + self.loop_thrpt + epilogue)

    @staticmethod
    def __operand_kind(oprnd):
        """Return 'mem', 'imd', the lower-case register type or '' for an operand."""
        if isinstance(oprnd, Register):
            return oprnd.reg_type.lower()
        if isinstance(oprnd, MemAddr):
            return 'mem'
        if isinstance(oprnd, Parameter) and str(oprnd) == 'IMD':
            return 'imd'
        return ''

    # Check operands
    def __define_operands(self):
        """
        Check for the number of operands and initialise the GPRs if necessary.

        Returns
        -------
        (str, str, str, str, str, str, str)
            String tuple containing the types of the (up to three) operands, the
            push/pop operations and the initialisation of the general purpose
            regs (empty unless an operand is a GPR) and the copy of registers.

        Raises
        ------
        ValueError
            If the operand list is empty.
        """
        oprnds = self.param_list
        if not oprnds:
            raise ValueError('At least one operand is needed to create a testcase')
        kinds = ['', '', '']
        uses_gpr = False
        for pos, oprnd in enumerate(oprnds[:3]):
            kind = self.__operand_kind(oprnd)
            if kind == 'gpr':
                # GPR tables are selected by size: 'gpr8', 'gpr16', 'gpr32', 'gpr64'
                kind += str(oprnd.size)
                uses_gpr = True
            kinds[pos] = kind
        gprPush, gprPop, zeroGPR = self.__initialise_gprs() if uses_gpr else ('', '', '')
        # Values are copied for the single operand or, with several operands, for
        # the first source register (second operand first, then the third one).
        candidates = oprnds[:1] if len(oprnds) == 1 else oprnds[1:3]
        copy = ''
        for oprnd in candidates:
            if isinstance(oprnd, Register):
                copy = self.__copy_regs(oprnd)
                break
        return (kinds[0], kinds[1], kinds[2], gprPush, gprPop, zeroGPR, copy)

    def __initialise_gprs(self):
        """
        Initialise eleven general purpose registers and set them to zero.

        Returns
        -------
        (str, str, str)
            String tuple for push, pop and initalisation operations
        """
        gprPush = ''.join('\t\tpush {}\n'.format(reg) for reg in self.gprs64)
        # Pop in reverse order to restore what was pushed
        gprPop = ''.join('\t\tpop {}\n'.format(reg) for reg in reversed(self.gprs64))
        zeroGPR = ''.join('\t\txor {}, {}\n'.format(reg, reg) for reg in self.gprs64)
        return (gprPush, gprPop, zeroGPR)

    # Copy created values in specific register
    def __copy_regs(self, reg):
        """
        Copy created values in specific register.

        Parameters
        ----------
        reg : Register
            Register for copying the value

        Returns
        -------
        str
            String containing the copy instructions; empty for register types
            other than GPR, MMX, XMM, YMM and ZMM
        """
        head = '\t\t# copy DP 1.0\n'
        gpr = self.ops['gpr64']
        # Different handling for GPR, MMX and SSE/AVX registers
        # NOTE(review): the GPR and MMX variants use integer add/div on the copied
        # value; whether that yields the values named in the emitted comments (and
        # whether 'add' assembles with MMX operands) should be confirmed.
        if reg.reg_type == 'GPR':
            return (head
                    + '\t\tvmovq {}, xmm0\n'.format(gpr[0])
                    + '\t\tvmovq {}, xmm0\n'.format(gpr[1])
                    + '\t\t# Create DP 2.0\n'
                    + '\t\tadd {}, {}\n'.format(gpr[1], gpr[0])
                    + '\t\t# Create DP 0.5\n'
                    + '\t\tdiv {}\n'.format(gpr[0])
                    + '\t\tmovq {}, {}\n'.format(gpr[2], gpr[0])
                    + '\t\tvmovq {}, xmm0\n'.format(gpr[0]))
        if reg.reg_type == 'MMX':
            mmx = self.ops['mmx']
            return (head
                    + '\t\tvmovq {}, xmm0\n'.format(mmx[0])
                    + '\t\tvmovq {}, xmm0\n'.format(mmx[1])
                    + '\t\tvmovq {}, xmm0\n'.format(gpr[0])
                    + '\t\t# Create DP 2.0\n'
                    # This line used the undefined name 'ops' and raised a NameError
                    + '\t\tadd {}, {}\n'.format(mmx[1], mmx[0])
                    + '\t\t# Create DP 0.5\n'
                    + '\t\tdiv {}\n'.format(gpr[0])
                    + '\t\tmovq {}, {}\n'.format(mmx[2], gpr[0]))
        if reg.reg_type in ('XMM', 'YMM', 'ZMM'):
            vec = self.ops[reg.reg_type.lower()]
            return (head
                    + '\t\tvmovaps {}, {}\n'.format(vec[0], vec[0])
                    + '\t\tvmovaps {}, {}\n'.format(vec[1], vec[0])
                    + '\t\t# Create DP 2.0\n'
                    + '\t\tvaddpd {}, {}, {}\n'.format(vec[1], vec[1], vec[1])
                    + '\t\t# Create DP 0.5\n'
                    + '\t\tvdivpd {}, {}, {}\n'.format(vec[2], vec[0], vec[1]))
        return ''

    def __define_header(self):
        """
        Define header.

        Returns
        -------
        (str, str, str, str)
            String tuple containing the header, value initalisations and extensions
        """
        def_instr = '#define INSTR '+self.instr+'\n'
        ninstr = '#define NINST '+self.num_instr+'\n'
        # The same two 32 bit words repeated eight times fill 512 bit
        pi = 'PI:\n.long ' + ', '.join(['0xf01b866e, 0x400921f9'] * 8) + '\n'
        init = ('#define N edi\n'
                '#define i r8d\n\n\n'
                '.intel_syntax noprefix\n'
                '.globl ninst\n'
                '.data\n'
                'ninst:\n'
                '.long NINST\n'
                '.align 32\n'
                + pi +
                '.text\n'
                '.globl latency\n'
                '.type latency, @function\n'
                '.align 32\n'
                'latency:\n'
                '\t\tpush rbp\n'
                '\t\tmov rbp, rsp\n'
                '\t\txor i, i\n'
                '\t\ttest N, N\n'
                '\t\tjle done\n')
        # Expand to AVX(512) if necessary
        op_types = (self.op_a, self.op_b, self.op_c)
        expand = ''
        if 'ymm' in op_types:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n')
        # A zmm operand takes precedence over a ymm operand
        if 'zmm' in op_types:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n'
                      '\t\t# expand from AVX to AVX512\n'
                      '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n')
        return (def_instr, ninstr, init, expand)

    def __define_loop_lat(self):
        """
        Create latency loop.

        Returns
        -------
        str
            Latency loop as string
        """
        ops = self.ops
        count = int(self.num_instr)
        # Two and three operand forms are emitted as pairs of instructions
        pair_count = len(range(0, count, 2))
        body = ''
        if self.num_operands == 1:
            body = '\t\tINSTR {}\n'.format(ops[self.op_a][0]) * count
        elif self.num_operands == 2:
            if self.op_a == self.op_b:
                # Alternate destination and source so each instruction reads the
                # register written by the previous one
                pair = ('\t\tINSTR {}, {}\n'.format(ops[self.op_a][0], ops[self.op_b][1])
                        + '\t\tINSTR {}, {}\n'.format(ops[self.op_b][1], ops[self.op_b][0]))
            else:
                pair = '\t\tINSTR {}, {}\n'.format(ops[self.op_a][0], ops[self.op_b][0]) * 2
            body = pair * pair_count
        elif self.num_operands == 3:
            first_c = ops[self.op_c][0]
            pair = ''
            if self.op_a == self.op_b:
                pair = ('\t\tINSTR {}, {}, {}\n'.format(ops[self.op_a][0], ops[self.op_b][1],
                                                       first_c)
                        + '\t\tINSTR {}, {}, {}\n'.format(ops[self.op_a][1], ops[self.op_b][0],
                                                         first_c))
            elif self.op_a == self.op_c:
                pair = ('\t\tINSTR {}, {}, {}\n'.format(ops[self.op_a][0], ops[self.op_b][0],
                                                       first_c)
                        + '\t\tINSTR {}, {}, {}\n'.format(ops[self.op_a][1], ops[self.op_b][0],
                                                         first_c))
            # NOTE(review): with three operands whose destination type matches
            # neither source the loop body stays empty -- confirm the intent.
            body = pair * pair_count
        return ('loop:\n'
                '\t\tinc i\n'
                + body +
                '\t\tcmp i, N\n'
                '\t\tjl loop\n')

    def __define_loop_thrpt(self):
        """
        Create throughput loop.

        Returns
        -------
        str
            Throughput loop as string
        """
        ops = self.ops
        has_second = self.num_operands in (2, 3)
        has_third = self.num_operands == 3
        lines = []
        for i in range(0, int(self.num_instr)):
            sources = ''
            # Sources cycle through the first three table entries ...
            if has_second:
                sources = ', {}'.format(ops[self.op_b][i % 3])
            if has_third:
                sources += ', {}'.format(ops[self.op_c][i % 3])
            # ... and destinations through the remaining ones, so a destination
            # never has the same table index as a source
            dest_table = ops[self.op_a]
            regNum = (i % (len(dest_table)-3))+3
            lines.append('\t\tINSTR {}{}\n'.format(dest_table[regNum], sources))
        return ('loop:\n'
                '\t\tinc i\n'
                + ''.join(lines) +
                '\t\tcmp i, N\n'
                '\t\tjl loop\n')

    def __is_in_dir(self):
        """
        Check if testcases with the same name already exist in testcase
        directory.

        The directory written by ``write_testcase`` (including its
        subdirectories) is searched.

        Returns
        -------
        (bool, bool)
            True if file is in directory
            False if file is not in directory
            While the first value stands for the throughput testcase
            and the second value stands for the latency testcase
        """
        TP = False
        LT = False
        name = self.instr+self.extension
        for _root, _dirs, files in os.walk(self.__testcase_dir()):
            if (name+'-TP.S') in files:
                TP = True
            if (name+'.S') in files:
                LT = True
        return (TP, LT)