mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-15 16:40:05 +01:00
Compare commits
836 Commits
v0.2.1
...
097e5a6a81
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
097e5a6a81 | ||
|
|
91da9a311a | ||
|
|
0c201be10e | ||
|
|
2cf2bf5cec | ||
|
|
4e3994fec1 | ||
|
|
ff727223bb | ||
|
|
306abcf0a6 | ||
|
|
0b1ada14d0 | ||
|
|
81dfb0e6cb | ||
|
|
796256fa13 | ||
|
|
99d0e0ffb6 | ||
|
|
9c2f559983 | ||
|
|
02716e7b41 | ||
|
|
5cd6b2cf9d | ||
|
|
be3622ce86 | ||
|
|
63774e65bc | ||
|
|
796cfdc3b5 | ||
|
|
253b0ee9d5 | ||
|
|
e37f9f119d | ||
|
|
400be352e1 | ||
|
|
62908f3b8f | ||
|
|
34fef3823b | ||
|
|
1a7c1588f6 | ||
|
|
785a365c63 | ||
|
|
34321109df | ||
|
|
ea5e56083e | ||
|
|
94e90e292c | ||
|
|
d46ef5db22 | ||
|
|
8482463db3 | ||
|
|
9b9d7f8649 | ||
|
|
2d1d8bf38f | ||
|
|
fdb57fc8da | ||
|
|
856ae737be | ||
|
|
9bc4ca24d8 | ||
|
|
a8cab3170c | ||
|
|
b0c243305c | ||
|
|
fc621b2eff | ||
|
|
b3526e7eba | ||
|
|
3059952025 | ||
|
|
41d62c1100 | ||
|
|
a24710412a | ||
|
|
ce258bb4b8 | ||
|
|
892ff71701 | ||
|
|
152dd57aec | ||
|
|
e0f4ae1e55 | ||
|
|
3ce7e2f202 | ||
|
|
d4d98bd0de | ||
|
|
ef00b67f3f | ||
|
|
aea6212560 | ||
|
|
a731954166 | ||
|
|
2fcc62ae1a | ||
|
|
6ea61d8893 | ||
|
|
2d8bb99d9f | ||
|
|
df55c29e2f | ||
|
|
2bdc765df2 | ||
|
|
628bdf6518 | ||
|
|
0e69fa1e26 | ||
|
|
bc8d0c7088 | ||
|
|
c87524384a | ||
|
|
8d0900e46a | ||
|
|
8fa31a7fca | ||
|
|
5071d63b9a | ||
|
|
2286da45b7 | ||
|
|
2ba04e614a | ||
|
|
d9cf46690f | ||
|
|
aca5511d6a | ||
|
|
c9e38631d1 | ||
|
|
d623115b1b | ||
|
|
5da00d0ae6 | ||
|
|
78309574ac | ||
|
|
764b22cebe | ||
|
|
4fd59eb0d0 | ||
|
|
18da151bbf | ||
|
|
d884f74f5e | ||
|
|
5f9de2c41d | ||
|
|
3435641451 | ||
|
|
38781ecc94 | ||
|
|
46004add41 | ||
|
|
1c0708e750 | ||
|
|
d858827a47 | ||
|
|
dcfe36b850 | ||
|
|
7ad3438af5 | ||
|
|
61dacff53e | ||
|
|
fa95293cb0 | ||
|
|
c2bd484170 | ||
|
|
66e51630af | ||
|
|
999806ec59 | ||
|
|
f88fafdecc | ||
|
|
d4a6a9b44f | ||
|
|
04388af5dd | ||
|
|
07f1af966d | ||
|
|
d47b0192cf | ||
|
|
6df973d16a | ||
|
|
30d536fd26 | ||
|
|
a2b40b9d2c | ||
|
|
abfce92b4b | ||
|
|
ec798f61b2 | ||
|
|
1fb015b312 | ||
|
|
226bc8eee0 | ||
|
|
0b3508abf8 | ||
|
|
cb5e0bdc38 | ||
|
|
4647615c5c | ||
|
|
157f1165bb | ||
|
|
93c7d10bbe | ||
|
|
1d62359cc6 | ||
|
|
f3b50b93f5 | ||
|
|
c5ef5f7432 | ||
|
|
78387a374d | ||
|
|
339b06bd7f | ||
|
|
8a6ae8c701 | ||
|
|
cac4a0ebf2 | ||
|
|
cef7f8098d | ||
|
|
93ae586745 | ||
|
|
2c32ccf37a | ||
|
|
26d65750a6 | ||
|
|
ebb973493b | ||
|
|
14a2aa0b52 | ||
|
|
4186edbc03 | ||
|
|
e0a2ea9eb2 | ||
|
|
c171a11101 | ||
|
|
33d1eec106 | ||
|
|
db02359ea2 | ||
|
|
6384ea2e18 | ||
|
|
e95278d2a2 | ||
|
|
2331e4dd8f | ||
|
|
dc250bcedc | ||
|
|
0b2753a78d | ||
|
|
db899a2709 | ||
|
|
74478034f7 | ||
|
|
42f96753c1 | ||
|
|
a8e5a6ad46 | ||
|
|
7f4f87d192 | ||
|
|
615ef82f04 | ||
|
|
36549dd679 | ||
|
|
76f3baf74e | ||
|
|
b06e6424f7 | ||
|
|
0a32c77751 | ||
|
|
eb09cbde42 | ||
|
|
ecdfc15ac5 | ||
|
|
317816b9d3 | ||
|
|
4c74bb0d46 | ||
|
|
537076fa25 | ||
|
|
8cc408a307 | ||
|
|
6d275a1207 | ||
|
|
c2ee276609 | ||
|
|
03a9caf0eb | ||
|
|
f856c578bf | ||
|
|
ab10febe74 | ||
|
|
b50bc9ba1f | ||
|
|
01cc93f56c | ||
|
|
e1ce402133 | ||
|
|
7cd380e7b8 | ||
|
|
1eb692c86f | ||
|
|
9a0474bcc1 | ||
|
|
71e2931bb0 | ||
|
|
c599ce4967 | ||
|
|
e476893dec | ||
|
|
c090d24edf | ||
|
|
9f9471ee4e | ||
|
|
870074b7ab | ||
|
|
1125e4c5d9 | ||
|
|
88a1efe633 | ||
|
|
a0d8895d38 | ||
|
|
c6ed492db3 | ||
|
|
1ac20073ab | ||
|
|
0a2d1f866d | ||
|
|
d46574db1f | ||
|
|
8e6289af1a | ||
|
|
2a43676097 | ||
|
|
c354306b3b | ||
|
|
9e3ab49065 | ||
|
|
475823d4dc | ||
|
|
54644ffb09 | ||
|
|
af3b1fe3e8 | ||
|
|
0b93766bdd | ||
|
|
eab6907c82 | ||
|
|
27eb8f62b6 | ||
|
|
d1201ace11 | ||
|
|
d6569a0f23 | ||
|
|
7e6eb7ce58 | ||
|
|
10d4c4b87e | ||
|
|
f06f767c34 | ||
|
|
9f715c0ba3 | ||
|
|
2884d17971 | ||
|
|
dbfba9ce5b | ||
|
|
841a4a5724 | ||
|
|
19c47db3ed | ||
|
|
06bc51ba63 | ||
|
|
27f408e4a3 | ||
|
|
13c75a3312 | ||
|
|
98f28a7c01 | ||
|
|
c3001e6ecb | ||
|
|
b20f5539bf | ||
|
|
4ca7c26d20 | ||
|
|
4453d3ca78 | ||
|
|
e6d24ea01d | ||
|
|
bdbbfd446f | ||
|
|
4734ed95b2 | ||
|
|
4c50483a83 | ||
|
|
d61cb287ce | ||
|
|
5c21e18e36 | ||
|
|
8807e3eda6 | ||
|
|
7c907e2432 | ||
|
|
1ea1e68b4e | ||
|
|
907f64d452 | ||
|
|
24de7a762b | ||
|
|
87411ab822 | ||
|
|
2fa25e3099 | ||
|
|
0b440e4da9 | ||
|
|
08e6a4be36 | ||
|
|
7724ce27c7 | ||
|
|
4f8e37d9fd | ||
|
|
d5f1654aa8 | ||
|
|
81f40604cb | ||
|
|
df747b8c48 | ||
|
|
4e25a29a8a | ||
|
|
016061f72c | ||
|
|
ddff8c5012 | ||
|
|
2306cb58d0 | ||
|
|
660a9d0f41 | ||
|
|
3b453de617 | ||
|
|
b93d911bb7 | ||
|
|
21cfb8d011 | ||
|
|
32d60e7966 | ||
|
|
ba60703fb2 | ||
|
|
76542782c8 | ||
|
|
671f7f5591 | ||
|
|
f96f5d7ad1 | ||
|
|
d81c53ef91 | ||
|
|
a018f80597 | ||
|
|
2bc6ba999f | ||
|
|
53cbf39ff9 | ||
|
|
3c5c516a6d | ||
|
|
93c0753db3 | ||
|
|
23ffd06e34 | ||
|
|
ca0540563d | ||
|
|
467f212fa3 | ||
|
|
0de00e512b | ||
|
|
3d26d6b82a | ||
|
|
75bc03bc76 | ||
|
|
fa06b9ccac | ||
|
|
9c966c2359 | ||
|
|
13ec7dc20e | ||
|
|
b2a326070f | ||
|
|
2eb6023b7a | ||
|
|
c2787babee | ||
|
|
0119f97942 | ||
|
|
6514257767 | ||
|
|
e1a5272fdf | ||
|
|
5748b2987b | ||
|
|
a447e289ff | ||
|
|
d2a4749c39 | ||
|
|
c917a83974 | ||
|
|
5ebd8a019e | ||
|
|
fe42870cc2 | ||
|
|
e70229aa32 | ||
|
|
71b9a17ab8 | ||
|
|
b484179e02 | ||
|
|
203ea2dfb0 | ||
|
|
0e984f4ec7 | ||
|
|
c1fa5e3bce | ||
|
|
0ab6efa9cb | ||
|
|
feda03408f | ||
|
|
a738d82533 | ||
|
|
4e10491fcb | ||
|
|
a87c077654 | ||
|
|
ca3ca56a01 | ||
|
|
2c530654dd | ||
|
|
ce83727eaf | ||
|
|
62746dfc9c | ||
|
|
ebadaba3ca | ||
|
|
2be8606e9a | ||
|
|
d170ba72dd | ||
|
|
c35c16e007 | ||
|
|
d3f081f282 | ||
|
|
f7579e83a9 | ||
|
|
ea0576e8ce | ||
|
|
37cc10edde | ||
|
|
939abe2518 | ||
|
|
e120d9229b | ||
|
|
12095979db | ||
|
|
ca5e9c3cae | ||
|
|
7194e79beb | ||
|
|
c97f93c39b | ||
|
|
968c71b7b6 | ||
|
|
df26edd075 | ||
|
|
a767b7f290 | ||
|
|
ba45038ad7 | ||
|
|
72e85075c2 | ||
|
|
40839384ec | ||
|
|
ab615547e5 | ||
|
|
9c16f8bc56 | ||
|
|
be891d45d4 | ||
|
|
5735291d27 | ||
|
|
ab368cded1 | ||
|
|
6e99954f0b | ||
|
|
5205cb5cc6 | ||
|
|
e6ce870ca0 | ||
|
|
566fbc6bc4 | ||
|
|
b70cff21ad | ||
|
|
d181184788 | ||
|
|
fcc3475417 | ||
|
|
d418c16f4a | ||
|
|
34523e1b23 | ||
|
|
457ccdcf77 | ||
|
|
ff61c65d58 | ||
|
|
615c809fe3 | ||
|
|
bce837dec9 | ||
|
|
090c24ade1 | ||
|
|
03a2a1da33 | ||
|
|
d59b100fa8 | ||
|
|
5c741a8a2d | ||
|
|
2f4849f44e | ||
|
|
f13a97e5b5 | ||
|
|
66282b0eef | ||
|
|
9ec7c161ab | ||
|
|
8d8eaa8e4f | ||
|
|
88d5094bf1 | ||
|
|
1f32252f91 | ||
|
|
1de644cd62 | ||
|
|
3d1c6aae8d | ||
|
|
dafec70e6e | ||
|
|
6d85fbe9e4 | ||
|
|
3f31235f8a | ||
|
|
cfc061e5e3 | ||
|
|
5eb3e07ad6 | ||
|
|
a82a0e24a3 | ||
|
|
6db08c7e8e | ||
|
|
e6a54ee131 | ||
|
|
152360bad2 | ||
|
|
607d459569 | ||
|
|
b033b3b7aa | ||
|
|
0c295dc847 | ||
|
|
5588e41492 | ||
|
|
08440ed5e1 | ||
|
|
25a0e0607d | ||
|
|
b0e35316f0 | ||
|
|
94313ec772 | ||
|
|
63563ecabc | ||
|
|
b7625a4a25 | ||
|
|
7da2f5bd7b | ||
|
|
6158a83b4f | ||
|
|
6204c90934 | ||
|
|
1ebe5ecfbd | ||
|
|
0c5ac26f3f | ||
|
|
9a13e5cbc5 | ||
|
|
dcf3e38612 | ||
|
|
09a14465c1 | ||
|
|
d7a687909e | ||
|
|
f8d53a69d7 | ||
|
|
74a479fb95 | ||
|
|
4fdf312622 | ||
|
|
803da767f2 | ||
|
|
0e69db9de9 | ||
|
|
9f87606ce8 | ||
|
|
768a90de10 | ||
|
|
8c9557760e | ||
|
|
4dbcfbda5d | ||
|
|
ed13cde61b | ||
|
|
4f8ed13309 | ||
|
|
3c7971b347 | ||
|
|
d89a742718 | ||
|
|
05fdbcf060 | ||
|
|
0f86d2d1b2 | ||
|
|
dff86d456e | ||
|
|
77e7c3a520 | ||
|
|
ba957877e3 | ||
|
|
1d52362306 | ||
|
|
dd37a21c56 | ||
|
|
0bdc180a52 | ||
|
|
f6a02a8f3e | ||
|
|
60f792c4b2 | ||
|
|
12044e3ac4 | ||
|
|
8454edef73 | ||
|
|
9165306808 | ||
|
|
449932d75b | ||
|
|
c68ad48e6b | ||
|
|
8e3d613843 | ||
|
|
2093610bbf | ||
|
|
f9f1120da6 | ||
|
|
e87ab5d6ca | ||
|
|
82b35e7649 | ||
|
|
23623ca18a | ||
|
|
b9e434d124 | ||
|
|
0e47034c8b | ||
|
|
81ce395115 | ||
|
|
f41854a0a6 | ||
|
|
818b516289 | ||
|
|
d7e5e12961 | ||
|
|
6bc6349c25 | ||
|
|
f69b5f88f0 | ||
|
|
596a323dfb | ||
|
|
08b4586b71 | ||
|
|
ffb263e20f | ||
|
|
b4799d1d45 | ||
|
|
4ff8fdc4ab | ||
|
|
c204096d74 | ||
|
|
dea217c12c | ||
|
|
92c162daa2 | ||
|
|
87ea8f0f0a | ||
|
|
cb04efc384 | ||
|
|
14c0ea6180 | ||
|
|
314ff4cf9d | ||
|
|
f64253b2b9 | ||
|
|
979d08358e | ||
|
|
a2dd6f752d | ||
|
|
2fb36406a7 | ||
|
|
94086033a8 | ||
|
|
75edfc808a | ||
|
|
c8c077a834 | ||
|
|
26ee005adc | ||
|
|
207c53aaad | ||
|
|
fafd7bc526 | ||
|
|
b986d7eba0 | ||
|
|
6b0adb5d68 | ||
|
|
f9f382a948 | ||
|
|
c6b58c63ab | ||
|
|
78530bfdb0 | ||
|
|
5aa0899961 | ||
|
|
7f0abd7d10 | ||
|
|
9ba9bab107 | ||
|
|
983e66938c | ||
|
|
1c889fa785 | ||
|
|
022598d94f | ||
|
|
1f5c9d1c61 | ||
|
|
30e0ad038d | ||
|
|
decec86e56 | ||
|
|
9af689b28c | ||
|
|
3aea3f2b49 | ||
|
|
a6cb09cf1f | ||
|
|
9d2ea8603f | ||
|
|
a7918db145 | ||
|
|
b5b1a1f2b2 | ||
|
|
dd59af16b2 | ||
|
|
d9325724e2 | ||
|
|
7e7269c2bc | ||
|
|
c64a24ae1b | ||
|
|
e8b78e4cc6 | ||
|
|
cd5a706f56 | ||
|
|
13426358d0 | ||
|
|
c80088b628 | ||
|
|
748474cd81 | ||
|
|
2fec0bf810 | ||
|
|
711a41d18e | ||
|
|
cf4a9cddcb | ||
|
|
5a5a1e74f5 | ||
|
|
4865e7ea72 | ||
|
|
d03398ddf9 | ||
|
|
edb8df3205 | ||
|
|
489050723c | ||
|
|
0cc0d35ce9 | ||
|
|
7f65bdb022 | ||
|
|
04360cc897 | ||
|
|
5e7a12f9bb | ||
|
|
1def12ee79 | ||
|
|
7269156854 | ||
|
|
d6529ced73 | ||
|
|
eac728dc9f | ||
|
|
451ba62959 | ||
|
|
57cf1bfe6f | ||
|
|
44b921aa73 | ||
|
|
accb52ce53 | ||
|
|
9e78f85475 | ||
|
|
64da89ec3d | ||
|
|
adeae88665 | ||
|
|
1698ed1776 | ||
|
|
2ef6051e64 | ||
|
|
3308f5d68f | ||
|
|
bd61b94669 | ||
|
|
0db8b6bcbf | ||
|
|
40755b2080 | ||
|
|
269148c2a1 | ||
|
|
12a8506530 | ||
|
|
e715badcf9 | ||
|
|
d6b4355a77 | ||
|
|
5361b63b52 | ||
|
|
cc39342047 | ||
|
|
addcdeda85 | ||
|
|
23d36a651b | ||
|
|
b052ab4151 | ||
|
|
673da99fba | ||
|
|
6c72281d65 | ||
|
|
5520362e65 | ||
|
|
93060eee43 | ||
|
|
0e77b7bc9a | ||
|
|
ce8c3ff9ab | ||
|
|
acbde7a19c | ||
|
|
34e978d2ae | ||
|
|
6294e2e9da | ||
|
|
6801229275 | ||
|
|
d3d1a89600 | ||
|
|
93c1951097 | ||
|
|
7211dd0799 | ||
|
|
5258d65c8e | ||
|
|
379fe80169 | ||
|
|
94d7d35c0b | ||
|
|
1009c60d2d | ||
|
|
229b316b6d | ||
|
|
c0753be899 | ||
|
|
eaa56792ab | ||
|
|
3425fa3024 | ||
|
|
38924b6ec1 | ||
|
|
d6ae457de4 | ||
|
|
a5c2ab1a4a | ||
|
|
e4393189dc | ||
|
|
3016fc7c46 | ||
|
|
82f47d217c | ||
|
|
1754df42d2 | ||
|
|
ac1295aac2 | ||
|
|
9624e6c109 | ||
|
|
2d16037c44 | ||
|
|
c5801cfe2f | ||
|
|
3e960dd4ac | ||
|
|
680774267d | ||
|
|
1aa710f195 | ||
|
|
71206897fd | ||
|
|
af247e64b6 | ||
|
|
2973f543b7 | ||
|
|
0b7f1ed6e7 | ||
|
|
17e7f0e0d8 | ||
|
|
c30ad4fb33 | ||
|
|
666512d54d | ||
|
|
381e9e9f76 | ||
|
|
8f63621d6d | ||
|
|
e41d05868a | ||
|
|
8013a40e52 | ||
|
|
3db330de66 | ||
|
|
4e73e24b99 | ||
|
|
dcd5b8fd61 | ||
|
|
3fb053fa79 | ||
|
|
cfd16aa079 | ||
|
|
1bf9a858ad | ||
|
|
5fc660484c | ||
|
|
c194f57f09 | ||
|
|
40a35ce067 | ||
|
|
4e58552c03 | ||
|
|
280f9c5790 | ||
|
|
d861d66206 | ||
|
|
3348afe219 | ||
|
|
f8ae6599c5 | ||
|
|
ffb016af45 | ||
|
|
51586cdaa1 | ||
|
|
c9000f74bc | ||
|
|
b06570ed45 | ||
|
|
b4682d16fb | ||
|
|
6c08a98418 | ||
|
|
2d30d190f4 | ||
|
|
8cce680bd7 | ||
|
|
9a60aa2c28 | ||
|
|
03b4cd1686 | ||
|
|
5bdc61aa09 | ||
|
|
04db2bfa79 | ||
|
|
5a0365ab35 | ||
|
|
4cdee8b621 | ||
|
|
248829141f | ||
|
|
131646b01a | ||
|
|
3cf40d9cd0 | ||
|
|
0adde7b9fc | ||
|
|
9888ef2da4 | ||
|
|
cadedeba7b | ||
|
|
f5489621fa | ||
|
|
77aa7f8fe0 | ||
|
|
760e3a9846 | ||
|
|
d0436838de | ||
|
|
c204b19caa | ||
|
|
731f1f9636 | ||
|
|
edae1720dc | ||
|
|
79afcba61d | ||
|
|
559c95a34a | ||
|
|
9c7907ee21 | ||
|
|
3243455ec5 | ||
|
|
5574a93a5e | ||
|
|
24583de74e | ||
|
|
530ad8484e | ||
|
|
421cf55af7 | ||
|
|
2fc1f3a186 | ||
|
|
092403c529 | ||
|
|
2d82c32f02 | ||
|
|
53135a03da | ||
|
|
02233f627e | ||
|
|
662ad829ec | ||
|
|
cb34733abe | ||
|
|
aa92234e5d | ||
|
|
1fd2453a50 | ||
|
|
4eea686e8b | ||
|
|
daa566329c | ||
|
|
b202bdfdb0 | ||
|
|
534eda8015 | ||
|
|
b2bb2cd003 | ||
|
|
97dbefdb6f | ||
|
|
789406c863 | ||
|
|
5341a2e94d | ||
|
|
c2d8742ac0 | ||
|
|
5b1c984552 | ||
|
|
6d6d3b7ccb | ||
|
|
3656b222ca | ||
|
|
60b6b603b7 | ||
|
|
70c66dbd0f | ||
|
|
d85daa9ecc | ||
|
|
3f55ae2368 | ||
|
|
7e4fcf5399 | ||
|
|
571d090344 | ||
|
|
55652f84e0 | ||
|
|
6e25da6c08 | ||
|
|
0269fc7085 | ||
|
|
76469f7898 | ||
|
|
3ff8a695b6 | ||
|
|
cb100d118f | ||
|
|
0c22634601 | ||
|
|
54ae9f4d26 | ||
|
|
e8fab533db | ||
|
|
0e05bd66d8 | ||
|
|
e635b2b015 | ||
|
|
383b720cc5 | ||
|
|
b6572720af | ||
|
|
354ab8e148 | ||
|
|
4f4a53c3be | ||
|
|
2a50207045 | ||
|
|
03f544638e | ||
|
|
bfe45f09bc | ||
|
|
8e30cd583a | ||
|
|
e86803df02 | ||
|
|
184751cf9e | ||
|
|
d99522583e | ||
|
|
cafe4c5bf8 | ||
|
|
623c4ea113 | ||
|
|
3ca2586bac | ||
|
|
36d6a82da5 | ||
|
|
b1444cf352 | ||
|
|
4d6d8d9379 | ||
|
|
1687ba8be9 | ||
|
|
59402a0837 | ||
|
|
f5b6611474 | ||
|
|
262fa4b288 | ||
|
|
0fdbb7f52c | ||
|
|
bad230fa7b | ||
|
|
dc02192d04 | ||
|
|
c23e52cdf6 | ||
|
|
b2b4aba0f3 | ||
|
|
bbb004a2aa | ||
|
|
4628d52210 | ||
|
|
99781b4171 | ||
|
|
04a1433f02 | ||
|
|
d88617109f | ||
|
|
cbed2c46f4 | ||
|
|
9d069c39d9 | ||
|
|
c9d3a90cd0 | ||
|
|
9ea2c5f46d | ||
|
|
f18a48653f | ||
|
|
ff68f03aed | ||
|
|
a16fee9fb1 | ||
|
|
2fcfc01542 | ||
|
|
fe9cd6c0c9 | ||
|
|
e7838cac54 | ||
|
|
4dc4323e2e | ||
|
|
6f5b8adadd | ||
|
|
744e1d83cc | ||
|
|
47e39f1f77 | ||
|
|
6d814d416b | ||
|
|
f8ed85b7c9 | ||
|
|
db6e40ee88 | ||
|
|
8359aa4807 | ||
|
|
697c5b5f4b | ||
|
|
f3f91536b5 | ||
|
|
687693d2a5 | ||
|
|
eb55693871 | ||
|
|
abf4fc391f | ||
|
|
4da262a902 | ||
|
|
a91413c270 | ||
|
|
6c56a77967 | ||
|
|
224e24d5e9 | ||
|
|
0b78d290ec | ||
|
|
a839af76c5 | ||
|
|
15da6044dd | ||
|
|
0f5d3a0370 | ||
|
|
1c8067545d | ||
|
|
484d6da85e | ||
|
|
0ecc656055 | ||
|
|
af0c8fc953 | ||
|
|
5fe983f4ef | ||
|
|
8b4acf0508 | ||
|
|
0c63d4f1cd | ||
|
|
b1e4cb90a7 | ||
|
|
159a1fa343 | ||
|
|
db862441b0 | ||
|
|
de5479a06c | ||
|
|
d92523e133 | ||
|
|
cb7cec20a8 | ||
|
|
1c673382b4 | ||
|
|
85dd53dc4e | ||
|
|
792bbb1166 | ||
|
|
f08fbf79ba | ||
|
|
e7ea5451f1 | ||
|
|
0ed113116c | ||
|
|
65100bd9da | ||
|
|
2995287377 | ||
|
|
209ce2a5ef | ||
|
|
3a6fa0475f | ||
|
|
e2abac3e0c | ||
|
|
cd81271b54 | ||
|
|
361a4fd8c2 | ||
|
|
c8c8c13ed1 | ||
|
|
a3ba078e76 | ||
|
|
438a4b3d6b | ||
|
|
8c2dfb27e5 | ||
|
|
14abed8f85 | ||
|
|
39cfd4dcda | ||
|
|
97025b259a | ||
|
|
8f743f2b88 | ||
|
|
cd5f131f35 | ||
|
|
f896440bbe | ||
|
|
f6d12cae2a | ||
|
|
3e0c57f1c7 | ||
|
|
41e3ee57e6 | ||
|
|
586fee5306 | ||
|
|
efb9ba166d | ||
|
|
22bfcd8020 | ||
|
|
625d814dce | ||
|
|
19dbd90849 | ||
|
|
042c034838 | ||
|
|
7bc3f016cd | ||
|
|
7c9ef5f589 | ||
|
|
f278180402 | ||
|
|
fb834e5533 | ||
|
|
36a662267d | ||
|
|
71dc7f5431 | ||
|
|
ab93311ceb | ||
|
|
533f78f9cf | ||
|
|
39004fce1a | ||
|
|
e737abd58f | ||
|
|
5bb6b94803 | ||
|
|
1948c738d1 | ||
|
|
0173481bec | ||
|
|
eeb55e8cf7 | ||
|
|
edd772380e | ||
|
|
ae5845b944 | ||
|
|
9188e8e31e | ||
|
|
fc06b968d8 | ||
|
|
ff672fb5ec | ||
|
|
f2eff01529 | ||
|
|
7dc14fbf39 | ||
|
|
80e741d411 | ||
|
|
7855166624 | ||
|
|
df8a81bf4d | ||
|
|
de2ba87d6b | ||
|
|
e468db4a0d | ||
|
|
866397d6ae | ||
|
|
f99265070f | ||
|
|
01e87b7727 | ||
|
|
b09129eeb7 | ||
|
|
de9350a158 | ||
|
|
9da84cdfb2 | ||
|
|
7183b767f2 | ||
|
|
050a51fc28 | ||
|
|
2493913fd5 | ||
|
|
6ab336f0ab | ||
|
|
8c2f744acf | ||
|
|
184ed73190 | ||
|
|
148977d417 | ||
|
|
ec666b7c79 | ||
|
|
903738161e | ||
|
|
b031b887a6 | ||
|
|
832fa4e241 | ||
|
|
abf8b674be | ||
|
|
cb75bf52ab | ||
|
|
e69baaba41 | ||
|
|
a866500610 | ||
|
|
311535476a | ||
|
|
75393f106c | ||
|
|
03dff0013f | ||
|
|
593dd63897 | ||
|
|
e5fdb7a9ac | ||
|
|
2e23820a55 | ||
|
|
2fffb07feb | ||
|
|
e923c67bdb | ||
|
|
75a405e33e | ||
|
|
8b377f4db1 | ||
|
|
119dc5baa9 | ||
|
|
b81a4c68d3 | ||
|
|
ee15842c97 | ||
|
|
d553998b90 | ||
|
|
6c212d130c | ||
|
|
0944633958 | ||
|
|
b469f5a0c4 | ||
|
|
646490ac2a | ||
|
|
8e13432318 | ||
|
|
f7ce5ac63c | ||
|
|
01a6b15bb1 | ||
|
|
0925af21a0 | ||
|
|
b683bf7ce3 | ||
|
|
64a7cb8196 | ||
|
|
d14ccee0b4 | ||
|
|
171b57b381 | ||
|
|
d6042b4006 | ||
|
|
f9e6583959 | ||
|
|
c1cf539c45 | ||
|
|
8bd7be32e2 | ||
|
|
1f52157e9c | ||
|
|
daa874b396 | ||
|
|
a84150f71b | ||
|
|
c70e57ab1a | ||
|
|
f189d6aca5 | ||
|
|
38bbf2712b | ||
|
|
bc9b380429 | ||
|
|
2d32b3a92a | ||
|
|
9e3aaa7336 | ||
|
|
bfd966e824 | ||
|
|
2fa92bc0b0 | ||
|
|
07cbd63f47 | ||
|
|
bd4a5622b2 | ||
|
|
01b23e1b47 | ||
|
|
aea6f8f043 | ||
|
|
7d3857b023 | ||
|
|
179c78c9ec | ||
|
|
c0413de556 | ||
|
|
3b027e2453 | ||
|
|
5cd80f4e82 | ||
|
|
29e4974662 | ||
|
|
7185174885 | ||
|
|
d3d46bfff5 | ||
|
|
fbc3437cce | ||
|
|
db1cdf5474 | ||
|
|
a460ca8d55 | ||
|
|
595cfa02a0 | ||
|
|
216fbfa7bc | ||
|
|
bac3104c0a | ||
|
|
64de49d497 | ||
|
|
4be8bcafc4 | ||
|
|
4eaa1eeb7d | ||
|
|
4038f07002 | ||
|
|
895a262d5d | ||
|
|
88856eb573 | ||
|
|
880dc332c8 | ||
|
|
6650145543 | ||
|
|
961364cbda | ||
|
|
62ab93a7cd |
2
.git-blame-ignore-revs
Normal file
2
.git-blame-ignore-revs
Normal file
@@ -0,0 +1,2 @@
|
||||
# Migrate code style to Black
|
||||
6204c90934c0e62aed98862ae77368b20a64cbfb
|
||||
33
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
33
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: "[BUG]"
|
||||
labels: bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
**To Reproduce**
|
||||
|||
|
||||
|---|---|
|
||||
| OSACA version | v x.y.z |
|
||||
| Used where | \[CLI / Compiler Explorer\] |
|
||||
|
||||
Steps to reproduce the behavior:
|
||||
- OSACA command
|
||||
- input code snippet or Compiler Explorer short link
|
||||
|
||||
**OSACA output**
|
||||
Please supply the output of the command within a code block
|
||||
```
|
||||
```
|
||||
|
||||
**Expected behavior**
|
||||
A clear and concise description of what you expected to happen.
|
||||
|
||||
|
||||
**Additional context**
|
||||
Add any other context about the problem here.
|
||||
20
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
20
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: "[REQUEST]"
|
||||
labels: enhancement
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Is your feature request related to a problem? Please describe.**
|
||||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
||||
|
||||
**Describe the solution you'd like**
|
||||
A clear and concise description of what you want to happen.
|
||||
|
||||
**Describe alternatives you've considered**
|
||||
A clear and concise description of any alternative solutions or features you've considered.
|
||||
|
||||
**Additional context**
|
||||
Add any other context or screenshots about the feature request here.
|
||||
23
.github/ISSUE_TEMPLATE/hardware-support-request.md
vendored
Normal file
23
.github/ISSUE_TEMPLATE/hardware-support-request.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
name: Hardware support request
|
||||
about: Request support for a new hardware architecture
|
||||
title: "[HW REQUEST]"
|
||||
labels: new architecture
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Why do you need support for this specific architecture?**
|
||||
Please write a short note why you need this specific architecture support.
|
||||
|
||||
**Which architecture model, family and further information?**
|
||||
Write a short note about the specific micro-architecture.
|
||||
|
||||
**Is the documentation of the port model publicly available?**
|
||||
Please refer to already existing documentation about the port model.
|
||||
|
||||
**Is any documentation of the performance data of the instruction forms (throughput, latencies, port assignment, ...) publicly available?**
|
||||
Please refer to already existing documentation/data bases containing information about the performance of instruction forms on this micro-arch.
|
||||
|
||||
**Are there already any usable tools (commercial or open-source)?**
|
||||
Please refer to already existing tools with support for this hardware micro-architecture.
|
||||
30
.github/workflows/lint.yml
vendored
Normal file
30
.github/workflows/lint.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Lint
|
||||
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
run-linters:
|
||||
name: Run linters
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Check out Git repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.x"
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: python -m pip install black flake8
|
||||
|
||||
- name: Run linters
|
||||
uses: wearerequired/lint-action@v2
|
||||
with:
|
||||
github_token: ${{ secrets.github_token }}
|
||||
# Enable linters
|
||||
black: true
|
||||
black_args: "-l 99 --diff --color --extend-exclude .ipynb"
|
||||
flake8: true
|
||||
flake8_args: "--max-line-length=99 --extend-ignore=E203,E501"
|
||||
42
.github/workflows/test-n-publish.yml
vendored
Normal file
42
.github/workflows/test-n-publish.yml
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
name: test-n-publish
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
test-n-publish:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
name: Set up Python ${{ matrix.python-version }}
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install
|
||||
run: |
|
||||
python -m pip install wheel
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install setuptools
|
||||
python -m pip install codecov requests
|
||||
python -m pip install bs4
|
||||
sudo apt-get -y install graphviz libgraphviz-dev pkg-config
|
||||
python -m pip install pygraphviz
|
||||
#python -m pip install "kerncraft>=0.8.16"
|
||||
python -m pip install git+https://github.com/RRZE-HPC/kerncraft.git@7caff4e2ecdbef595013041ba0131e37ed33c72c
|
||||
python -m pip install -e .
|
||||
- name: Test
|
||||
run: |
|
||||
coverage run -p tests/all_tests.py
|
||||
- uses: codecov/codecov-action@v3
|
||||
- name: Build package
|
||||
run: |
|
||||
python setup.py build sdist bdist_wheel
|
||||
- name: Publish to PyPI
|
||||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
skip_existing: true
|
||||
user: __token__
|
||||
password: ${{ secrets.pypi_password }}
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,5 +1,5 @@
|
||||
# OSACA specific files and folders
|
||||
osaca/taxCalc/
|
||||
*.*.pickle
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@@ -109,3 +109,7 @@ venv.bak/
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
# Visual Studio
|
||||
.vs
|
||||
x64/
|
||||
|
||||
33
.travis.yml
33
.travis.yml
@@ -1,8 +1,35 @@
|
||||
sudo: false
|
||||
os: linux
|
||||
language: python
|
||||
python:
|
||||
- "3.5"
|
||||
- "3.6"
|
||||
- "3.7"
|
||||
install: pip install tox-travis
|
||||
script: tox
|
||||
- "3.8"
|
||||
- "3.9"
|
||||
before_install:
|
||||
# - pip install tox-travis
|
||||
- pip install codecov
|
||||
- pip install bs4
|
||||
- pip install pygraphviz
|
||||
- pip install kerncraft
|
||||
install:
|
||||
- pip install -e .
|
||||
cache: pip
|
||||
script:
|
||||
# - tox
|
||||
- coverage run -p tests/all_tests.py
|
||||
after_success:
|
||||
- coverage combine
|
||||
- codecov
|
||||
deploy:
|
||||
provider: pypi
|
||||
username: "__token__"
|
||||
password:
|
||||
secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
|
||||
distributions: "sdist bdist_wheel"
|
||||
skip_existing: true
|
||||
cleanup: false
|
||||
on:
|
||||
repo: RRZE-HPC/OSACA
|
||||
branch: master
|
||||
tags: true
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
include README.rst
|
||||
include LICENSE
|
||||
include tox.ini
|
||||
recursive-include osaca/data/ *.csv
|
||||
recursive-include osaca/data/ *.yml
|
||||
recursive-include osaca/data/ *.pickle
|
||||
include osaca/data/_build_cache.py
|
||||
include examples/*
|
||||
recursive-include tests *.py *.out
|
||||
recursive-include tests/testfiles/ *
|
||||
|
||||
554
README.rst
554
README.rst
@@ -1,4 +1,4 @@
|
||||
.. image:: doc/osaca-logo.png
|
||||
.. image:: docs/img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
@@ -6,25 +6,38 @@ OSACA
|
||||
=====
|
||||
|
||||
Open Source Architecture Code Analyzer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
--------------------------------------
|
||||
|
||||
This tool allows automatic instruction fetching of assembly code,
|
||||
auto-generating of testcases for assembly instructions creating latency
|
||||
and throughput benchmarks on a specific instruction form and throughput
|
||||
analysis and throughput prediction for a innermost loop kernel.
|
||||
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
|
||||
|
||||
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?token=393L6z2HEXNiGLtZ43s6&branch=master
|
||||
:target: https://travis-ci.com/RRZE-HPC/OSACA
|
||||
.. image:: https://github.com/RRZE-HPC/OSACA/workflows/test-n-publish/badge.svg?branch=master&event=push
|
||||
:target: https://github.com/RRZE-HPC/OSACA/actions
|
||||
:alt: Build Status
|
||||
|
||||
.. image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
|
||||
:target: https://landscape.io/github/RRZE-HPC/OSACA/master
|
||||
:alt: Code Health
|
||||
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
|
||||
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
|
||||
:alt: Code Coverage
|
||||
|
||||
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
|
||||
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
|
||||
:alt: Documentation Status
|
||||
|
||||
.. image:: https://img.shields.io/badge/read-the_docs-blue
|
||||
:target: https://osaca.readthedocs.io/
|
||||
:alt: Docs
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/ambv/black
|
||||
:alt: Code Style
|
||||
|
||||
Getting started
|
||||
===============
|
||||
OSACA is a Python module with a command line interface.
|
||||
|
||||
OSACA is also integrated into the `Compiler Explorer at godbolt.org <https://godbolt.org>`_, which allows using OSACA from a browser without any installation. To analyze an assembly snippet, go to https://godbolt.org, change the language to "Analysis", insert AArch64 or AT&T(!) x86 assembly code, and make sure OSACA is selected in the corresponding analysis panel, e.g., https://godbolt.org/z/shK4f8. When analyzing a high-level language code, use the "Add tool..." menu in the compiler output panel to add OSACA analysis, e.g. https://godbolt.org/z/hbMoPn. To change the micro architecture model, add ``--arch`` and µarch shortname (e.g., ``SKX`` for Skylake, ``ZEN2``, ``N1`` for ARM Neoverse) to the "Compiler options..." (when using "Analysis" mode) or "Arguments" (when analyzing compiler output of a high-level code).
|
||||
|
||||
Installation
|
||||
~~~~~~~~~~~~
|
||||
------------
|
||||
On most systems with python pip and setuptools installed, just run:
|
||||
|
||||
.. code:: bash
|
||||
@@ -42,20 +55,28 @@ To build OSACA from source, clone this repository using ``git clone https://gith
|
||||
After installation, OSACA can be started with the command ``osaca`` in the CLI.
|
||||
|
||||
Dependencies:
|
||||
~~~~~~~~~~~~~~~
|
||||
Additional requirements are:
|
||||
-------------
|
||||
Necessary requirements are:
|
||||
|
||||
- `Python3 <https://www.python.org/>`_
|
||||
- `pandas <http://pandas.pydata.org/>`_
|
||||
- `NumPy <http://www.numpy.org/>`_
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`_ for marker insertion
|
||||
- `ibench <https://github.com/hofm/ibench>`_ for throughput/latency measurements
|
||||
- `Graphviz <https://www.graphviz.org/>`_ for dependency graph creation (minimal dependency is ``libgraphviz-dev`` on Ubuntu)
|
||||
- Python packages:
|
||||
|
||||
- `networkx <https://networkx.org/>`_
|
||||
- `pyparsing <https://github.com/pyparsing/pyparsing>`_
|
||||
- `ruamel.yaml <https://pypi.org/project/ruamel.yaml/>`_
|
||||
|
||||
Optional requirements are:
|
||||
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
|
||||
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
|
||||
- `BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ for scraping instruction form information for the x86 ISA (experimental)
|
||||
|
||||
Design
|
||||
======
|
||||
A schematic design of OSACA's workflow is shown below:
|
||||
|
||||
.. image:: doc/osaca-workflow.png
|
||||
.. image:: docs/img/osaca-workflow.png
|
||||
:alt: OSACA workflow
|
||||
:width: 80%
|
||||
|
||||
@@ -66,216 +87,369 @@ The usage of OSACA can be listed as:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
osaca [-h] [-V] [--arch ARCH] [--tp-list] [-i | --iaca | -m] FILEPATH
|
||||
osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES]
|
||||
[--ignore-unknown] [--lcd-timeout SECONDS]
|
||||
[--db-check] [--import MICROBENCH] [--insert-marker]
|
||||
[--export-graph GRAPHNAME] [--consider-flag-deps]
|
||||
[--out OUT] [--yaml-out YAML_OUT] [--verbose]
|
||||
FILEPATH
|
||||
|
||||
- ``-h`` or ``--help`` prints out the help message.
|
||||
- ``-V`` or ``--version`` shows the program’s version number.
|
||||
- ``ARCH`` needs to be replaced with the wished architecture abbreviation. This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``). Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW`` and ``SKL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN`` for AMD Zen (17h family) architecture .
|
||||
- While in the throughput analysis mode, one can add ``--tp-list`` for printing the additional throughput list of the kernel or ``--iaca`` for letting OSACA to know it has to search for IACA binary markers.
|
||||
- ``-i`` or ``--include-ibench`` starts the integration of ibench output into the CSV data file determined by ``ARCH``.
|
||||
- With the flag ``-m`` or ``--insert-marker`` OSACA calls the Kerncraft module for the interactively insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ marker in suggested assembly blocks.
|
||||
- ``FILEPATH`` describes the filepath to the file to work with and is always necessary
|
||||
-h, --help
|
||||
prints out the help message.
|
||||
-V, --version
|
||||
shows the program’s version number.
|
||||
--arch ARCH
|
||||
needs to be replaced with the target architecture abbreviation.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ICL`` (Client), ``ICX`` (Server), ``SPR`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN[1-4]`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72, ``TSV110`` for the HiSilicon TaiShan v110, ``A64FX`` for Fujitsu's HPC ARM architecture, ``M1`` for the Apple M1-Firestorm performance core, and ``V2`` for the Neoverse V2 (used in NVIDIA's Grace CPU) are available.
|
||||
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
|
||||
--fixed
|
||||
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
|
||||
Otherwise, OSACA will print out the optimal port utilization for the kernel.
|
||||
--lines
|
||||
Define lines that should be included in the analysis. This option overwrites any range defined by markers in the assembly. Add either single lines or ranges defined
|
||||
by "-" or ":", each entry separated by commas, e.g.: ``--lines 1,2,8-18,20:24``
|
||||
--db-check
|
||||
Run a sanity check on the database specified by "--arch".
|
||||
The output depends on the verbosity level.
|
||||
Keep in mind that you have to provide an existing (dummy) filename anyway.
|
||||
--import MICROBENCH
|
||||
Import a given microbenchmark output file into the corresponding architecture instruction database.
|
||||
Define the type of microbenchmark either as "ibench" or "asmbench".
|
||||
--insert-marker
|
||||
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
|
||||
--export-graph EXPORT_PATH
|
||||
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
|
||||
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
|
||||
--ignore-unknown
|
||||
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
|
||||
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
|
||||
--lcd-timeout SECONDS
|
||||
Set timeout in seconds for LCD analysis. After timeout, OSACA will continue its analysis with the dependency paths found up to this point.
|
||||
Defaults to `10`.
|
||||
-f, --consider-flag-deps
|
||||
Consider flag dependencies for the critical path and loop-carried dependency analysis. By default, those dependencies are ignored.
|
||||
-v, --verbose
|
||||
Increases verbosity level
|
||||
-o OUT, --out OUT
|
||||
Write analysis to this file (default to stdout)
|
||||
--yaml-out YAML_OUT
|
||||
Write analysis as YAML representation to this file
|
||||
|
||||
The **FILEPATH** describes the filepath to the file to work with and is always necessary, use "-" to read from stdin.
|
||||
|
||||
Supported microarchitectures
|
||||
-----------------------------
|
||||
**x86 CPUs**
|
||||
|
||||
+----------+-----------------+------------+
|
||||
| Designer | Model/microarch | OSACA flag |
|
||||
+==========+=================+============+
|
||||
| Intel | Sandy Bridge | ``SNB`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Ivy Bridge | ``IVB`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Haswell | ``HSW`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Broadwell | ``BDW`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Skylake-X | ``SKX`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Cascadelake-X | ``CSX`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Icelake client | ``ICL`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Icelake server | ``ICX`` |
|
||||
+----------+-----------------+------------+
|
||||
| Intel | Sapphire Rapids | ``SPR`` |
|
||||
+----------+-----------------+------------+
|
||||
| AMD | Naples / Zen 1 | ``ZEN1`` |
|
||||
+----------+-----------------+------------+
|
||||
| AMD | Rome / Zen 2 | ``ZEN2`` |
|
||||
+----------+-----------------+------------+
|
||||
| AMD | Milan / Zen 3 | ``ZEN3`` |
|
||||
+----------+-----------------+------------+
|
||||
| AMD | Genoa / Zen 4 | ``ZEN4`` |
|
||||
+----------+-----------------+------------+
|
||||
|
||||
**ARM AArch64 CPUs**
|
||||
|
||||
+-----------+-------------------+-------------+
|
||||
| Designer | Model/microarch | OSACA flag |
|
||||
+===========+===================+=============+
|
||||
| ARM | Cortex-A72 | ``A72`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| ARM | Neoverse N1 | ``N1`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| ARM | Neoverse V2 | ``V2`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| Marvell | ThunderX2 | ``TX2`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| Fujitsu | FX700/A64FX | ``A64FX`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| HiSilicon | TaiShan v110 | ``TSV110`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| Apple | M1-Firestorm | ``M1`` |
|
||||
+-----------+-------------------+-------------+
|
||||
| NVIDIA | Neoverse V2/Grace | ``V2`` |
|
||||
+-----------+-------------------+-------------+
|
||||
|
||||
----
|
||||
|
||||
Hereinafter OSACA's scope of function will be described.
|
||||
|
||||
Throughput analysis
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKL`` or ``ZEN``.
|
||||
Throughput & Latency analysis
|
||||
-----------------------------
|
||||
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
|
||||
|
||||
For extracting the right kernel, one has to mark it beforehand. For this there are two different approaches:
|
||||
.. code-block:: bash
|
||||
|
||||
| **High level code**
|
||||
osaca --arch ARCH [--fixed] [--ignore-unknown]
|
||||
[--export-graph EXPORT_PATH]
|
||||
file
|
||||
|
||||
The OSACA marker is ``//STARTLOOP`` and must be put in one line in front of the loop head, and the loop code must be indented consistently. This means the marker and the head must have the same indentation level while the whole loop body needs to be more indented than the code before and after. For instance, this is a valid OSACA marker:
|
||||
The ``file`` parameter specifies the target assembly file and is always mandatory.
|
||||
|
||||
.. code-block:: c
|
||||
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
|
||||
|
||||
int i = 0;
|
||||
//STARTLOOP
|
||||
while(i < N){
|
||||
// do something...
|
||||
i++;
|
||||
}
|
||||
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
|
||||
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
|
||||
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
|
||||
This behavior can be enforced by using the ``--fixed`` flag.
|
||||
|
||||
| **Assembly code**
|
||||
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
|
||||
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
|
||||
This is done so the user does not miss out on this unrecognized instruction and might assume an incorrect runtime prediction.
|
||||
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
|
||||
|
||||
Another way for marking a kernel is to insert the IACA byte markers in the assembly file in before and after the loop.
|
||||
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
|
||||
The tool highlights all LCDs and the CP.
|
||||
The graph generation is done by running OSACA with the ``--export-graph EXPORT_GRAPH`` flag.
|
||||
OSACA stores the DOT file either at the filepath specified by ``EXPORT_GRAPH`` or uses the default filename "osaca_dg.dot" in the current working directory.
|
||||
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
|
||||
graph.pdf`` to generate a PDF document.
|
||||
|
||||
Marker insertion
|
||||
----------------
|
||||
For extracting the right kernel, one can mark it beforehand.
|
||||
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
|
||||
If OSACA cannot find any markers in the given input file, all lines will be evaluated.
|
||||
|
||||
Marking a kernel means to insert the byte markers in the assembly file before and after the loop.
|
||||
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
|
||||
Start and end marker can be seen in the example below:
|
||||
IACA requires byte markers since it operates on opcode-level.
|
||||
To provide a trade-off between reusability for such tool and convenient usability, OSACA supports both byte markers and comment line markers.
|
||||
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
|
||||
|
||||
.. code-block:: gas
|
||||
x86 markers
|
||||
^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
movl $111,%ebx ;IACA START MARKER
|
||||
.byte 100,103,144 ;IACA START MARKER
|
||||
; LABEL
|
||||
; do something
|
||||
; ...
|
||||
; conditional jump to LABEL
|
||||
movl $222,%ebx ;IACA END MARKER
|
||||
.byte 100,103,144 ;IACA END MARKER
|
||||
.. code-block:: asm
|
||||
|
||||
The optional flag ``--iaca`` defines if OSACA needs to search for the IACA byte markers or the OSACA marker in the chosen file.
|
||||
movl $111,%ebx #IACA/OSACA START MARKER
|
||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
movl $222,%ebx #IACA/OSACA END MARKER
|
||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||
|
||||
With an additional, optional ``--tp-list``, OSACA adds a simple list of all kernel instruction forms together with their reciprocal throughput to the output. This is helpful in case of no further information about the port binding of the single instruction forms.
|
||||
**Comment line markers**
|
||||
|
||||
Include new measurements into the data file
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it
|
||||
takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value.
|
||||
The handling of ibench is shortly described in the example section below.
|
||||
.. code-block:: asm
|
||||
|
||||
Insert IACA markers
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
Using the ``-m`` or ``--insert-marker`` flags for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
|
||||
# OSACA-BEGIN
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
# OSACA-END
|
||||
|
||||
Example
|
||||
=======
|
||||
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel IVB core hereafter:
|
||||
AArch64 markers
|
||||
^^^^^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
::
|
||||
|
||||
mov x1, #111 // OSACA START
|
||||
.byte 213,3,32,31 // OSACA START
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
mov x1, #222 // OSACA END
|
||||
.byte 213,3,32,31 // OSACA END
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
::
|
||||
|
||||
// OSACA-BEGIN
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
// OSACA-END
|
||||
|
||||
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
|
||||
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
|
||||
|
||||
Benchmark import
|
||||
----------------
|
||||
OSACA supports the automatic integration of new instruction forms by parsing the output of the micro-
|
||||
benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
|
||||
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --import MICROBENCH file
|
||||
|
||||
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
|
||||
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added and file must be the path to the generated output file of the benchmark.
|
||||
The format of this file has to match either the basic command line output of ibench, e.g.,
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
|
||||
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
or the command line output of asmbench including the name of the instruction form in a separate line at the
|
||||
beginning, e.g.:
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]
|
||||
Latency: 4.00 cycle
|
||||
Throughput: 0.50 cycle
|
||||
|
||||
|
||||
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
|
||||
|
||||
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
|
||||
|
||||
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
|
||||
|
||||
* The second part of the name are the operands. Each operand must be separated from another operand by the character "``_``".
|
||||
|
||||
* For each **x86** operand, one of the following symbols must be used:
|
||||
|
||||
* "``r``" for general purpose registers (rax, edi, r9, ...)
|
||||
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
|
||||
|
||||
* For each **AArch64** operand, one of the following symbols must be used:
|
||||
|
||||
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
|
||||
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
|
||||
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
|
||||
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
|
||||
|
||||
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
|
||||
|
||||
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
|
||||
|
||||
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
|
||||
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
|
||||
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
|
||||
|
||||
* If there is only one operand, it is considered as source operand
|
||||
|
||||
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
|
||||
destination operand, all others are considered as source operands.
|
||||
|
||||
Database check
|
||||
--------------
|
||||
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
|
||||
OSACA provides a database sanity check using the --db-check flag. It can be executed via:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --db-check [-v] file
|
||||
|
||||
``ARCH`` defines the abbreviation of the target architecture of the database to check.
|
||||
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
|
||||
When called, OSACA prints a summary of database information containing the amount of missing throughput values, latency values or μ-ops assignments for an instruction form.
|
||||
Furthermore, it shows the amount of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
|
||||
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
|
||||
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
|
||||
|
||||
Examples
|
||||
========
|
||||
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
double a[N], double b[N];
|
||||
double s;
|
||||
|
||||
//STARTLOOP
|
||||
// loop
|
||||
for(int i = 0; i < N; ++i)
|
||||
a[i] = s * b[i];
|
||||
|
||||
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``. The result is
|
||||
written in vector ``a``.
|
||||
After including the OSACA marker ``//STARTLOOP`` and compiling the source, one can
|
||||
start the analysis typing
|
||||
|
||||
.. code:: bash
|
||||
|
||||
osaca --arch IVB PATH/TO/FILE
|
||||
|
||||
in the command line. Optionally, one can create the assembly code out of the file, identify and mark the kernel of interest and run OSACA with the additional ``--iaca`` flag.
|
||||
|
||||
The output is:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
X - No information for this instruction in database
|
||||
* - Instruction micro-ops not bound to a port
|
||||
|
||||
Port Binding in Cycles Per Iteration:
|
||||
-------------------------------------------------
|
||||
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
-------------------------------------------------
|
||||
| Cycles | 2.33 | 1.33 | 5.0 | 5.0 | 2.0 | 1.33 |
|
||||
-------------------------------------------------
|
||||
|
||||
Ports Pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
-------------------------------------------
|
||||
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
|
||||
| | | | | | | jmp 10b <scale+0x10b>
|
||||
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
|
||||
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
|
||||
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
|
||||
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
|
||||
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
|
||||
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
|
||||
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
|
||||
| | | | | | | X addl $0x1,-0x24(%rbp)
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
|
||||
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
|
||||
| | | | | | | jl e4 <scale+0xe4>
|
||||
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
|
||||
Total number of estimated throughput: 5.0
|
||||
|
||||
It shows the whole kernel together with the average port pressure of each instruction form and the overall port binding.
|
||||
In the fifth to last line containing ``addl $0x1, -0x24(%rbp)`` one can see an ``X`` in front of the instruction form and no port occupation.
|
||||
This means either there are no measured values for this instruction form or no port binding is provided in the
|
||||
data file.
|
||||
In the first case, OSACA automatically creates two benchmark assembly files (``add-mem_imd.S`` for latency and ``add-mem_imd-TP.S`` for throughput) in the benchmark folder, if it not already exists there.
|
||||
|
||||
One can now run ibench to get the throughput value for addl with the given file. Mind that the assembly
|
||||
file, which is used for ibench, is implemented in Intel syntax. So for a valid run instruction ``addl`` must be
|
||||
changed to ``add`` manually.
|
||||
|
||||
For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node,
|
||||
so there is no other workload falsifying the results. For the correct function of ibench the benchmark files
|
||||
from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the
|
||||
subdirectory’s name and the shared objects. For running the tests the frequencies of all cores must set to a
|
||||
constant value and this has to be given as an argument together with the directory of the shared objects to
|
||||
ibench, e.g.:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
./ibench ./AVX 2.2
|
||||
|
||||
for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz.
|
||||
We get an output like:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
Using frequency 2.20GHz.
|
||||
add-mem_imd-TP: 1.023 (clock cycles) [DEBUG - result: 1.000000]
|
||||
add-mem_imd: 6.050 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
The debug output as resulting value of register ``xmm0`` is additional validation information depending on
|
||||
the executed instruction form meant for the user and is not considered by OSACA.
|
||||
The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just
|
||||
``-i`` and the specify micro architecture:
|
||||
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
|
||||
The result is written in vector ``a``.
|
||||
After including the OSACA byte marker into the assembly, one can start the analysis typing
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch IVB -i PATH/TO/IBENCH-OUTPUTFILE
|
||||
osaca --arch CSX PATH/TO/FILE
|
||||
|
||||
For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand.
|
||||
We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
|
||||
in the command line.
|
||||
|
||||
.. code:: bash
|
||||
The output is:
|
||||
|
||||
addl-mem_imd,1.0,6.0,"(0.33,0.33,1.00,1.00,1.00,0.33)"
|
||||
|
||||
Another throughput analysis with OSACA now returns all information for the kernel:
|
||||
::
|
||||
|
||||
.. code-block::
|
||||
Open Source Architecture Code Analyzer (OSACA) - v0.3
|
||||
Analyzed file: scale.s.csx.O3.s
|
||||
Architecture: csx
|
||||
Timestamp: 2019-10-03 23:36:21
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
X - No information for this instruction in database
|
||||
* - Instruction micro-ops not bound to a port
|
||||
|
||||
Port Binding in Cycles Per Iteration:
|
||||
-------------------------------------------------
|
||||
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
-------------------------------------------------
|
||||
| Cycles | 2.67 | 1.67 | 6.0 | 6.0 | 3.0 | 1.67 |
|
||||
-------------------------------------------------
|
||||
|
||||
Ports Pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
-------------------------------------------
|
||||
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
|
||||
| | | | | | | jmp 10b <scale+0x10b>
|
||||
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
|
||||
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
|
||||
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
|
||||
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
|
||||
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
|
||||
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
|
||||
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
|
||||
| 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 0.33 | addl $0x1,-0x24(%rbp)
|
||||
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
|
||||
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
|
||||
| | | | | | | jl e4 <scale+0xe4>
|
||||
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
|
||||
Total number of estimated throughput: 6.0
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
-----------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
170 | | | | | | | | || | | .L22:
|
||||
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
|
||||
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
|
||||
175 | | | | | | | | || | | * jne .L22
|
||||
|
||||
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
173 | 1.0 | addq $32, %rax | [173]
|
||||
|
||||
|
||||
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
|
||||
Furthermore, in the two columns on the right, the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel are shown.
|
||||
At the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right.
|
||||
|
||||
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
|
||||
|
||||
Citations
|
||||
=========
|
||||
If you use OSACA for scientific work you can cite us as (for the Bibtex, see the `Wiki <https://github.com/RRZE-HPC/OSACA/wiki#acknowledgement>`_):
|
||||
|
||||
* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print PMBS18 <https://arxiv.org/abs/1809.00912>`_)
|
||||
* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print PMBS19 <https://arxiv.org/abs/1910.00214>`_)
|
||||
|
||||
Credits
|
||||
=======
|
||||
Implementation: Jan Laukemann
|
||||
Implementation: Jan Laukemann, Julian Hammer
|
||||
|
||||
License
|
||||
=======
|
||||
`AGPL-3.0 </LICENSE>`_
|
||||
`AGPL-3.0 </LICENSE>`__
|
||||
|
||||
3
codecov.yml
Normal file
3
codecov.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
ignore:
|
||||
- "tests" # ignore test folder and all its contents
|
||||
- "**/__init__.py" # ignore init files
|
||||
@@ -1,134 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="459.03406mm"
|
||||
height="217.28152mm"
|
||||
viewBox="0 0 1626.4986 769.8951"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.92.1 r15371"
|
||||
sodipodi:docname="OSACA-Logo_05.svg"
|
||||
inkscape:export-filename="/home/cip/2014/ol68umur/Desktop/logo/OSACA-Logo_03b.png"
|
||||
inkscape:export-xdpi="1104"
|
||||
inkscape:export-ydpi="1104">
|
||||
<defs
|
||||
id="defs4" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="0.5"
|
||||
inkscape:cx="858.02629"
|
||||
inkscape:cy="511.52256"
|
||||
inkscape:document-units="px"
|
||||
inkscape:current-layer="layer1"
|
||||
showgrid="false"
|
||||
fit-margin-top="0.5"
|
||||
fit-margin-left="0.5"
|
||||
fit-margin-right="0.5"
|
||||
fit-margin-bottom="0.5"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1081"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="49"
|
||||
inkscape:window-maximized="1" />
|
||||
<metadata
|
||||
id="metadata7">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Ebene 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(263.39161,902.34721)">
|
||||
<g
|
||||
id="g4583">
|
||||
<text
|
||||
transform="scale(1.0341487,0.96697893)"
|
||||
id="text4147"
|
||||
y="-333.24573"
|
||||
x="542.02954"
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
xml:space="preserve"><tspan
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk'"
|
||||
y="-333.24573"
|
||||
x="542.02954"
|
||||
id="tspan4149"
|
||||
sodipodi:role="line">ACA</tspan></text>
|
||||
<text
|
||||
transform="scale(1.0341487,0.96697893)"
|
||||
id="text4147-3"
|
||||
y="-417.88809"
|
||||
x="-266.53079"
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
xml:space="preserve"><tspan
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk';stroke-width:1px"
|
||||
y="-417.88809"
|
||||
x="-266.53079"
|
||||
id="tspan4149-6"
|
||||
sodipodi:role="line">OS</tspan></text>
|
||||
<g
|
||||
id="g4571">
|
||||
<rect
|
||||
style="fill:#4dd9ff;fill-opacity:1;stroke:none;stroke-width:4.99469042;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-6-3"
|
||||
width="501.93356"
|
||||
height="46.874996"
|
||||
x="-900.57556"
|
||||
y="-486.72452"
|
||||
transform="rotate(90)" />
|
||||
<rect
|
||||
style="fill:#7fff00;fill-opacity:1;stroke:none;stroke-width:7.58152723;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-5"
|
||||
width="676.86932"
|
||||
height="46.874996"
|
||||
x="134.22374"
|
||||
y="-425.20099"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
<rect
|
||||
style="fill:#f2ff19;fill-opacity:1;stroke:none;stroke-width:5.0525918;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-7-6"
|
||||
width="456.4649"
|
||||
height="46.874996"
|
||||
x="-682.95709"
|
||||
y="-363.67743"
|
||||
transform="rotate(90)" />
|
||||
<rect
|
||||
style="fill:#8071ff;fill-opacity:1;stroke:none;stroke-width:5.53460026;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-3-2"
|
||||
width="360.71481"
|
||||
height="46.874996"
|
||||
x="322.24228"
|
||||
y="-548.24811"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
<rect
|
||||
style="fill:#ff2a2a;fill-opacity:1;stroke:none;stroke-width:5.43282366;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-67-9"
|
||||
width="347.57031"
|
||||
height="46.874996"
|
||||
x="398.60016"
|
||||
y="-302.15387"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 5.4 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 231 KiB |
Binary file not shown.
66
docs/conf.py
Normal file
66
docs/conf.py
Normal file
@@ -0,0 +1,66 @@
|
||||
# -- Path setup --------------------------------------------------------------
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath("."))
|
||||
from version_from_src import get_version # noqa: E402
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = "OSACA"
|
||||
copyright = "2020, Jan Laukemann"
|
||||
author = "Jan Laukemann"
|
||||
html_logo = "img/osaca-logo.png"
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
version = get_version()
|
||||
release = get_version()
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.doctest",
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx.ext.mathjax",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx.ext.todo",
|
||||
"sphinx.ext.viewcode",
|
||||
]
|
||||
|
||||
add_module_names = False
|
||||
source_suffix = ".rst"
|
||||
master_doc = "index"
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = []
|
||||
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
# e.g., 'alabaster', 'sphinx_rtd_theme'
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = []
|
||||
htmlhelp_basename = "osaca_doc"
|
||||
html_sidebars = {"**": ["globaltoc.html", "relations.html", "sourcelink.html", "searchbox.html"]}
|
||||
|
||||
autodoc_member_order = "bysource"
|
||||
BIN
docs/img/osaca-logo.pdf
Normal file
BIN
docs/img/osaca-logo.pdf
Normal file
Binary file not shown.
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 45 KiB |
BIN
docs/img/osaca-workflow.png
Normal file
BIN
docs/img/osaca-workflow.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 206 KiB |
14
docs/index.rst
Normal file
14
docs/index.rst
Normal file
@@ -0,0 +1,14 @@
|
||||
OSACA -- Open Source Architecture Code Analyzer
|
||||
=================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
sphinx/home.rst
|
||||
sphinx/api.rst
|
||||
|
||||
.. image:: /img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
7
docs/sphinx/api.rst
Normal file
7
docs/sphinx/api.rst
Normal file
@@ -0,0 +1,7 @@
|
||||
API Reference
|
||||
=============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
osaca
|
||||
364
docs/sphinx/home.rst
Normal file
364
docs/sphinx/home.rst
Normal file
@@ -0,0 +1,364 @@
|
||||
.. image:: /img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
OSACA
|
||||
=====
|
||||
|
||||
Open Source Architecture Code Analyzer
|
||||
--------------------------------------
|
||||
|
||||
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
|
||||
|
||||
.. image:: https://travis-ci.org/RRZE-HPC/OSACA.svg?branch=master
|
||||
:target: https://travis-ci.org/RRZE-HPC/OSACA
|
||||
:alt: Build Status
|
||||
|
||||
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
|
||||
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
|
||||
:alt: Code Coverage
|
||||
|
||||
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
|
||||
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
|
||||
:alt: Documentation Status
|
||||
|
||||
.. image:: https://img.shields.io/badge/read-the_docs-blue
|
||||
:target: https://osaca.readthedocs.io/
|
||||
:alt: Docs
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/ambv/black
|
||||
:alt: Code Style
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Installation
|
||||
------------
|
||||
On most systems with Python pip and setuptools installed, just run:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install --user osaca
|
||||
|
||||
for the latest release.
|
||||
|
||||
To build OSACA from source, clone this repository using ``git clone https://github.com/RRZE-HPC/OSACA`` and run in the root directory:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
python ./setup.py install
|
||||
|
||||
After installation, OSACA can be started with the command ``osaca`` in the CLI.
|
||||
|
||||
Dependencies:
|
||||
-------------
|
||||
Additional requirements are:
|
||||
|
||||
- `Python3 <https://www.python.org/>`__
|
||||
- `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
|
||||
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
|
||||
|
||||
Design
|
||||
======
|
||||
A schematic design of OSACA's workflow is shown below:
|
||||
|
||||
.. image:: /img/osaca-workflow.png
|
||||
:alt: OSACA workflow
|
||||
:width: 80%
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
The usage of OSACA can be listed as:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
osaca [-h] [-V] [--arch ARCH] [--fixed] [--db-check]
|
||||
[--import MICROBENCH] [--insert-marker]
|
||||
[--export-graph GRAPHNAME] [--ignore-unknown] [--verbose]
|
||||
FILEPATH
|
||||
|
||||
-h, --help
|
||||
prints out the help message.
|
||||
-V, --version
|
||||
shows the program’s version number.
|
||||
--arch ARCH
|
||||
needs to be replaced with the target architecture abbreviation.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2 architecture is available.
|
||||
--fixed
|
||||
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
|
||||
Otherwise, OSACA will print out the optimal port utilization for the kernel.
|
||||
--db-check
|
||||
Run a sanity check on the database specified by "--arch".
|
||||
The output depends on the verbosity level.
|
||||
Keep in mind that you have to provide an existing (dummy) filename anyway.
|
||||
--import MICROBENCH
|
||||
Import a given microbenchmark output file into the corresponding architecture instruction database.
|
||||
Define the type of microbenchmark either as "ibench" or "asmbench".
|
||||
--insert-marker
|
||||
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
|
||||
--export-graph EXPORT_PATH
|
||||
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
|
||||
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
|
||||
--ignore-unknown
|
||||
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
|
||||
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
|
||||
-v, --verbose
|
||||
Increases verbosity level
|
||||
|
||||
The **FILEPATH** describes the filepath to the file to work with and is always necessary
|
||||
|
||||
______________________
|
||||
|
||||
Hereinafter OSACA's scope of function will be described.
|
||||
|
||||
Throughput & Latency analysis
|
||||
-----------------------------
|
||||
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH [--fixed] [--ignore-unknown]
|
||||
[--export-graph EXPORT_PATH]
|
||||
file
|
||||
|
||||
The ``file`` parameter specifies the target assembly file and is always mandatory.
|
||||
|
||||
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
|
||||
|
||||
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
|
||||
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
|
||||
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
|
||||
This behavior can be enforced by using the ``--fixed`` flag.
|
||||
|
||||
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
|
||||
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
|
||||
This is done so the user does not miss out on this unrecognized instruction and might assume an incorrect runtime prediction.
|
||||
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
|
||||
|
||||
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
|
||||
The tool highlights all LCDs and the CP.
|
||||
The graph generation is done by running OSACA with the ``--export-graph EXPORT_GRAPH`` flag.
|
||||
OSACA stores the DOT file either at the filepath specified by ``EXPORT_GRAPH`` or under the default filename "osaca_dg.dot" in the current working directory.
|
||||
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
|
||||
graph.pdf`` to generate a PDF document.
|
||||
|
||||
Marker insertion
|
||||
----------------
|
||||
To extract the right kernel, one has to mark it beforehand.
|
||||
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
|
||||
|
||||
Marking a kernel means inserting the byte markers in the assembly file before and after the loop.
|
||||
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
|
||||
IACA requires byte markers since it operates on opcode-level.
|
||||
To provide a trade-off between reusability with such tools and convenient usability, OSACA supports both byte markers and comment line markers.
|
||||
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
|
||||
|
||||
x86 markers
|
||||
^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
.. code-block:: asm
|
||||
|
||||
movl $111,%ebx #IACA/OSACA START MARKER
|
||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
movl $222,%ebx #IACA/OSACA END MARKER
|
||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
.. code-block:: asm
|
||||
|
||||
# OSACA-BEGIN
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
# OSACA-END
|
||||
|
||||
AArch64 markers
|
||||
^^^^^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
::
|
||||
|
||||
mov x1, #111 // OSACA START
|
||||
.byte 213,3,32,31 // OSACA START
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
mov x1, #222 // OSACA END
|
||||
.byte 213,3,32,31 // OSACA END
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
::
|
||||
|
||||
// OSACA-BEGIN
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
// OSACA-END
|
||||
|
||||
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
|
||||
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
|
||||
|
||||
Benchmark import
|
||||
----------------
|
||||
OSACA supports the automatic integration of new instruction forms by parsing the output of the micro-
|
||||
benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
|
||||
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --import MICROBENCH file
|
||||
|
||||
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
|
||||
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added and file must be the path to the generated output file of the benchmark.
|
||||
The format of this file has to match either the basic command line output of ibench, e.g.,
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
|
||||
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
or the command line output of asmbench including the name of the instruction form in a separate line at the
|
||||
beginning, e.g.:
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]
|
||||
Latency: 4.00 cycle
|
||||
Throughput: 0.50 cycle
|
||||
|
||||
|
||||
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
|
||||
|
||||
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
|
||||
|
||||
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
|
||||
|
||||
* The second part of the name consists of the operands. Each operand must be separated from another operand by the character "``_``".
|
||||
|
||||
* For each **x86** operand, one of the following symbols must be used:
|
||||
|
||||
* "``r``" for general purpose registers (rax, edi, r9, ...)
|
||||
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
|
||||
|
||||
* For each **AArch64** operand, one of the following symbols must be used:
|
||||
|
||||
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
|
||||
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
|
||||
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
|
||||
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
|
||||
|
||||
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
|
||||
|
||||
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
|
||||
|
||||
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
|
||||
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
|
||||
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
|
||||
|
||||
* If there is only one operand, it is considered as source operand
|
||||
|
||||
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
|
||||
destination operand, all others are considered as source operands.
|
||||
|
||||
Database check
|
||||
--------------
|
||||
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
|
||||
OSACA provides a database sanity check using the --db-check flag. It can be executed via:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --db-check [-v] file
|
||||
|
||||
``ARCH`` defines the abbreviation of the target architecture of the database to check.
|
||||
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
|
||||
When called, OSACA prints a summary of database information containing the number of missing throughput values, latency values or μ-ops assignments for an instruction form.
|
||||
Furthermore, it shows the number of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
|
||||
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
|
||||
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
|
||||
|
||||
Examples
|
||||
========
|
||||
To clarify the functionality of OSACA, a sample kernel is analyzed for an Intel CSX core hereafter:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
double a[N], double b[N];
|
||||
double s;
|
||||
|
||||
// loop
|
||||
for(int i = 0; i < N; ++i)
|
||||
a[i] = s * b[i];
|
||||
|
||||
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
|
||||
The result is written in vector ``a``.
|
||||
After including the OSACA byte marker into the assembly, one can start the analysis typing
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch CSX PATH/TO/FILE
|
||||
|
||||
in the command line.
|
||||
|
||||
The output is:
|
||||
|
||||
::
|
||||
|
||||
Open Source Architecture Code Analyzer (OSACA) - v0.3
|
||||
Analyzed file: scale.s.csx.O3.s
|
||||
Architecture: csx
|
||||
Timestamp: 2019-10-03 23:36:21
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
-----------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
170 | | | | | | | | || | | .L22:
|
||||
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
|
||||
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
|
||||
175 | | | | | | | | || | | * jne .L22
|
||||
|
||||
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
173 | 1.0 | addq $32, %rax | [173]
|
||||
|
||||
|
||||
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
|
||||
Furthermore, the two columns on the right show the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel.
|
||||
At the bottom, all loop-carried dependencies are shown, each with a list on the right of the line numbers that are part of this dependency chain.
|
||||
|
||||
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
|
||||
|
||||
Credits
|
||||
=======
|
||||
Implementation: Jan Laukemann
|
||||
|
||||
License
|
||||
=======
|
||||
`AGPL-3.0 </LICENSE>`__
|
||||
20
docs/sphinx/osaca.api.rst
Normal file
20
docs/sphinx/osaca.api.rst
Normal file
@@ -0,0 +1,20 @@
|
||||
osaca.api package
|
||||
=================
|
||||
Provides interfaces to other tools.
|
||||
|
||||
osaca.api.kerncraft\_interface module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: osaca.api.kerncraft_interface
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.api
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
44
docs/sphinx/osaca.parser.rst
Normal file
44
docs/sphinx/osaca.parser.rst
Normal file
@@ -0,0 +1,44 @@
|
||||
osaca.parser package
|
||||
====================
|
||||
Parser module for parsing the assembly code.
|
||||
|
||||
osaca.parser.attr\_dict module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.attr_dict
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.base\_parser module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.base_parser
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.parser\_AArch64v81 module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.parser_AArch64v81
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.parser\_x86att module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.parser_x86att
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.parser
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
46
docs/sphinx/osaca.rst
Normal file
46
docs/sphinx/osaca.rst
Normal file
@@ -0,0 +1,46 @@
|
||||
osaca package
|
||||
=============
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
|
||||
osaca.api
|
||||
osaca.parser
|
||||
osaca.semantics
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
osaca.db\_interface module
|
||||
--------------------------
|
||||
|
||||
.. automodule:: osaca.db_interface
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.frontend module
|
||||
---------------------
|
||||
|
||||
.. automodule:: osaca.frontend
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.osaca module
|
||||
------------------
|
||||
|
||||
.. automodule:: osaca.osaca
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.utils module
|
||||
------------------
|
||||
|
||||
.. automodule:: osaca.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
52
docs/sphinx/osaca.semantics.rst
Normal file
52
docs/sphinx/osaca.semantics.rst
Normal file
@@ -0,0 +1,52 @@
|
||||
osaca.semantics package
|
||||
=======================
|
||||
Semantic part of OSACA.
|
||||
|
||||
osaca.semantics.arch\_semantics module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.arch_semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.hw\_model module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.hw_model
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.isa\_semantics module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.isa_semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.kernel\_dg module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.kernel_dg
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.marker\_utils module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.marker_utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
32
docs/version_from_src.py
Normal file
32
docs/version_from_src.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
def __read(*names, **kwargs):
    """
    Reads a text file addressed relative to the directory of this module.

    :param names: path components appended to this module's directory
    :param kwargs: optional ``encoding`` keyword (falls back to ``"utf8"``)
    :returns: str -- the complete content of the file.
    """
    # Resolve the target against the location of this source file.
    target = os.path.join(os.path.dirname(__file__), *names)
    codec = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=codec) as handle:
        return handle.read()
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
def __find_version(*file_paths):
    """
    Extracts the value of a ``__version__ = "..."`` assignment from a file.

    :param file_paths: path components handed on to ``__read``
    :returns: str -- the text between the quotes of the version assignment.
    :raises RuntimeError: if no line starts with such an assignment.
    """
    contents = __read(*file_paths)
    # re.M lets ``^`` anchor at the start of every line, not only the file.
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)
|
||||
|
||||
|
||||
def get_version():
    """
    Looks up the OSACA version declared in the package ``__init__`` file.

    The file is addressed as ``../osaca/__init__.py`` and handed to
    ``__find_version``.

    :returns: str -- the version string.
    """
    init_file = "../osaca/__init__.py"
    return __find_version(init_file)
|
||||
@@ -1,286 +0,0 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.5.239 Build 20170817";
|
||||
# mark_description "-fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o 2d.S";
|
||||
.file "2d-5pt.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
# -- Begin jacobi2D5pt
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl jacobi2D5pt
|
||||
# --- jacobi2D5pt(int, int)
|
||||
jacobi2D5pt:
|
||||
# parameter 1: %edi
|
||||
# parameter 2: %esi
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_jacobi2D5pt.1:
|
||||
..L2:
|
||||
#2.31
|
||||
pushq %rbx #2.31
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbx #2.31
|
||||
.cfi_def_cfa 3, 16
|
||||
.cfi_offset 3, -16
|
||||
andq $-32, %rsp #2.31
|
||||
pushq %rbp #2.31
|
||||
pushq %rbp #2.31
|
||||
movq 8(%rbx), %rbp #2.31
|
||||
movq %rbp, 8(%rsp) #2.31
|
||||
movq %rsp, %rbp #2.31
|
||||
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
|
||||
pushq %r13 #2.31
|
||||
pushq %r14 #2.31
|
||||
pushq %r15 #2.31
|
||||
subq $88, %rsp #2.31
|
||||
movslq %esi, %rsi #2.31
|
||||
movslq %edi, %rcx #2.31
|
||||
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
|
||||
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
|
||||
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
|
||||
movq %rsi, %r13 #4.17
|
||||
imulq %rcx, %r13 #4.17
|
||||
shlq $3, %r13 #4.12
|
||||
movq %r13, %rax #4.12
|
||||
addq $31, %rax #4.12
|
||||
andq $-32, %rax #4.12
|
||||
subq %rax, %rsp #4.12
|
||||
movq %rsp, %rax #4.12
|
||||
# LOE rax rcx rsi r12 r13 edi
|
||||
..B1.29: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
movq %rax, %r14 #4.12
|
||||
# LOE rcx rsi r12 r13 r14 edi
|
||||
..B1.2: # Preds ..B1.29
|
||||
# Execution count [1.00e+00]
|
||||
movq %r13, %rax #5.12
|
||||
addq $31, %rax #5.12
|
||||
andq $-32, %rax #5.12
|
||||
subq %rax, %rsp #5.12
|
||||
movq %rsp, %rax #5.12
|
||||
# LOE rax rcx rsi r12 r13 r14 edi
|
||||
..B1.30: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
movq %rax, %r15 #5.12
|
||||
# LOE rcx rsi r12 r13 r14 r15 edi
|
||||
..B1.3: # Preds ..B1.30
|
||||
# Execution count [1.00e+00]
|
||||
xorl %r10d, %r10d #9.5
|
||||
lea (%r15,%rcx,8), %r11 #13.13
|
||||
vxorpd %xmm1, %xmm1, %xmm1 #6.5
|
||||
lea (%r14,%rcx,8), %rdx #13.37
|
||||
cmpq $2, %rsi #9.18
|
||||
jle ..B1.21 # Prob 9% #9.18
|
||||
# LOE rdx rcx rsi r10 r11 r12 r13 r14 r15 edi xmm1
|
||||
..B1.4: # Preds ..B1.3
|
||||
# Execution count [9.00e-01]
|
||||
addl $-2, %edi #12.9
|
||||
movq %rcx, %r9 #13.61
|
||||
movl %edi, %eax #12.9
|
||||
addq $-2, %rsi #9.18
|
||||
andl $-16, %eax #12.9
|
||||
xorl %r8d, %r8d #9.5
|
||||
shlq $4, %r9 #13.61
|
||||
movslq %eax, %rax #12.9
|
||||
addq %r14, %r9 #13.61
|
||||
movslq %edi, %rdi #12.9
|
||||
vxorps %ymm0, %ymm0, %ymm0 #6.5
|
||||
movq %rax, -80(%rbp) #12.9[spill]
|
||||
movq %rdi, -88(%rbp) #12.9[spill]
|
||||
movl %eax, -72(%rbp) #9.5[spill]
|
||||
movq %rsi, -48(%rbp) #9.5[spill]
|
||||
movq %rdx, -64(%rbp) #9.5[spill]
|
||||
movq %r15, -96(%rbp) #9.5[spill]
|
||||
movq %r14, -56(%rbp) #9.5[spill]
|
||||
movq %r13, -104(%rbp) #9.5[spill]
|
||||
movq %r12, -112(%rbp) #9.5[spill]
|
||||
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
|
||||
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
|
||||
..B1.5: # Preds ..B1.19 ..B1.4
|
||||
# Execution count [5.00e+00]
|
||||
cmpq $2, %rcx #12.22
|
||||
jle ..B1.19 # Prob 50% #12.22
|
||||
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $16, %edi #12.9
|
||||
jl ..B1.26 # Prob 10% #12.9
|
||||
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
|
||||
..B1.7: # Preds ..B1.6
|
||||
# Execution count [4.50e+00]
|
||||
movl -72(%rbp), %r14d #12.9[spill]
|
||||
xorl %edx, %edx #12.9
|
||||
movq -80(%rbp), %r12 #13.13[spill]
|
||||
lea (%r11,%r8), %rax #13.13
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
|
||||
..B1.8: # Preds ..B1.8 ..B1.7
|
||||
# Execution count [2.50e+01]
|
||||
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
|
||||
vmovupd %ymm0, 40(%rax,%rdx,8) #13.13
|
||||
vmovupd %ymm0, 72(%rax,%rdx,8) #13.13
|
||||
vmovupd %ymm0, 104(%rax,%rdx,8) #13.13
|
||||
addq $16, %rdx #12.9
|
||||
cmpq %r12, %rdx #12.9
|
||||
jb ..B1.8 # Prob 82% #12.9
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
|
||||
..B1.10: # Preds ..B1.8 ..B1.26
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r14), %eax #12.9
|
||||
cmpl %edi, %eax #12.9
|
||||
ja ..B1.19 # Prob 50% #12.9
|
||||
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
|
||||
..B1.11: # Preds ..B1.10
|
||||
# Execution count [4.50e+00]
|
||||
movslq %r14d, %r14 #12.9
|
||||
movq -88(%rbp), %r13 #12.9[spill]
|
||||
subq %r14, %r13 #12.9
|
||||
cmpq $4, %r13 #12.9
|
||||
jl ..B1.25 # Prob 10% #12.9
|
||||
# LOE rcx r8 r9 r10 r11 r13 r14 edi xmm1 ymm0
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
movl %r13d, %r15d #12.9
|
||||
lea (%r11,%r8), %rax #13.13
|
||||
andl $-4, %r15d #12.9
|
||||
xorl %edx, %edx #12.9
|
||||
movslq %r15d, %r15 #12.9
|
||||
lea (%rax,%r14,8), %rax #13.13
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
|
||||
..B1.13: # Preds ..B1.13 ..B1.12
|
||||
# Execution count [2.50e+01]
|
||||
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
|
||||
addq $4, %rdx #12.9
|
||||
cmpq %r15, %rdx #12.9
|
||||
jb ..B1.13 # Prob 82% #12.9
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
|
||||
..B1.15: # Preds ..B1.13 ..B1.25
|
||||
# Execution count [5.00e+00]
|
||||
cmpq %r13, %r15 #12.9
|
||||
jae ..B1.19 # Prob 10% #12.9
|
||||
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
|
||||
..B1.16: # Preds ..B1.15
|
||||
# Execution count [4.50e+00]
|
||||
movq -56(%rbp), %rax #13.49[spill]
|
||||
lea (%r11,%r8), %r12 #13.13
|
||||
movq -64(%rbp), %rsi #13.25[spill]
|
||||
lea (%r9,%r8), %rdx #13.61
|
||||
lea (%r12,%r14,8), %r12 #13.13
|
||||
addq %r8, %rax #13.49
|
||||
addq %r8, %rsi #13.25
|
||||
lea (%rdx,%r14,8), %rdx #13.61
|
||||
lea (%rax,%r14,8), %rax #13.49
|
||||
lea (%rsi,%r14,8), %r14 #13.25
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.17: # Preds ..B1.17 ..B1.16
|
||||
# Execution count [2.50e+01]
|
||||
vmovsd (%r14,%r15,8), %xmm2 #13.25
|
||||
vaddsd 16(%r14,%r15,8), %xmm2, %xmm3 #13.37
|
||||
vaddsd 8(%rax,%r15,8), %xmm3, %xmm4 #13.49
|
||||
vaddsd 8(%rdx,%r15,8), %xmm4, %xmm5 #13.61
|
||||
vmulsd %xmm5, %xmm1, %xmm6 #13.74
|
||||
vmovsd %xmm6, 8(%r12,%r15,8) #13.13
|
||||
incq %r15 #12.9
|
||||
cmpq %r13, %r15 #12.9
|
||||
jb ..B1.17 # Prob 82% #12.9
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
|
||||
..B1.19: # Preds ..B1.17 ..B1.5 ..B1.10 ..B1.15
|
||||
# Execution count [5.00e+00]
|
||||
incq %r10 #9.5
|
||||
lea (%r8,%rcx,8), %r8 #9.5
|
||||
cmpq -48(%rbp), %r10 #9.5[spill]
|
||||
jb ..B1.5 # Prob 82% #9.5
|
||||
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
|
||||
..B1.20: # Preds ..B1.19
|
||||
# Execution count [9.00e-01]
|
||||
movq -64(%rbp), %rdx #[spill]
|
||||
movq -96(%rbp), %r15 #[spill]
|
||||
movq -56(%rbp), %r14 #[spill]
|
||||
movq -104(%rbp), %r13 #[spill]
|
||||
movq -112(%rbp), %r12 #[spill]
|
||||
.cfi_restore 12
|
||||
# LOE rdx r11 r12 r13 r14 r15
|
||||
..B1.21: # Preds ..B1.3 ..B1.20
|
||||
# Execution count [1.00e+00]
|
||||
addq $8, %rdx #16.5
|
||||
addq $8, %r11 #16.5
|
||||
movq %rdx, %rdi #16.5
|
||||
movq %r11, %rsi #16.5
|
||||
vzeroupper #16.5
|
||||
..___tag_value_jacobi2D5pt.12:
|
||||
# dummy(double *, double *)
|
||||
call dummy #16.5
|
||||
..___tag_value_jacobi2D5pt.13:
|
||||
# LOE r12 r13 r14 r15
|
||||
..B1.22: # Preds ..B1.21
|
||||
# Execution count [1.00e+00]
|
||||
movq %r15, %rdx #16.5
|
||||
movq %r13, %rax #16.5
|
||||
addq $31, %rax #16.5
|
||||
andq $-32, %rax #16.5
|
||||
addq %rax, %rsp #16.5
|
||||
# LOE r12 r13 r14
|
||||
..B1.23: # Preds ..B1.22
|
||||
# Execution count [1.00e+00]
|
||||
movq %r14, %rdx #16.5
|
||||
movq %r13, %rax #16.5
|
||||
addq $31, %rax #16.5
|
||||
andq $-32, %rax #16.5
|
||||
addq %rax, %rsp #16.5
|
||||
# LOE r12
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [1.00e+00]
|
||||
lea -24(%rbp), %rsp #17.1
|
||||
.cfi_restore 15
|
||||
popq %r15 #17.1
|
||||
.cfi_restore 14
|
||||
popq %r14 #17.1
|
||||
.cfi_restore 13
|
||||
popq %r13 #17.1
|
||||
popq %rbp #17.1
|
||||
.cfi_restore 6
|
||||
movq %rbx, %rsp #17.1
|
||||
popq %rbx #17.1
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 3
|
||||
ret #17.1
|
||||
.cfi_def_cfa 3, 16
|
||||
.cfi_offset 3, -16
|
||||
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
|
||||
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
|
||||
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
|
||||
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
|
||||
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
|
||||
# LOE
|
||||
..B1.25: # Preds ..B1.11
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r15d, %r15d #12.9
|
||||
jmp ..B1.15 # Prob 100% #12.9
|
||||
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
|
||||
..B1.26: # Preds ..B1.6
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r14d, %r14d #12.9
|
||||
jmp ..B1.10 # Prob 100% #12.9
|
||||
.align 16,0x90
|
||||
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type jacobi2D5pt,@function
|
||||
.size jacobi2D5pt,.-jacobi2D5pt
|
||||
.data
|
||||
# -- End jacobi2D5pt
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
// -- Begin DWARF2 SEGMENT .eh_frame
|
||||
.section .eh_frame,"a",@progbits
|
||||
.eh_frame_seg:
|
||||
.align 8
|
||||
# End
|
||||
@@ -1,16 +0,0 @@
|
||||
|
||||
void jacobi2D5pt(int N, int M){
|
||||
void dummy(double*, double*);
|
||||
double a[M][N];
|
||||
double b[M][N];
|
||||
double s;
|
||||
|
||||
for(int j=1; j<M-1; ++j){
|
||||
#pragma vector aligned
|
||||
//STARTLOOP
|
||||
for(int i=1; i<N-1; ++i){
|
||||
b[j][i] = ( a[j][i-1] + a[j][i+1] + a[j-1][i] + a[j+1][i]) * s;
|
||||
}
|
||||
}
|
||||
dummy(&a[1][1], &b[1][1]);
|
||||
}
|
||||
114
examples/README.md
Normal file
114
examples/README.md
Normal file
@@ -0,0 +1,114 @@
|
||||
# Examples
|
||||
We collected sample kernels for the user to run examples with OSACA.
|
||||
The assembly files contain only the extracted and already marked kernel for code compiled on Intel Cascade Lake (CSX), AMD Zen and Marvell ThunderX2 (TX2), but can be run on any system supporting the ISA and supported by OSACA.
|
||||
The compilers used were Intel Parallel Studio 19.0up05 and GNU 9.1.0 for the x86 systems, and ARM HPC Compiler for Linux version 19.2 and GNU 8.2.0 for the ARM-based TX2.
|
||||
|
||||
To analyze the kernels with OSACA, run
|
||||
```
|
||||
osaca --arch ARCH FILE
|
||||
```
|
||||
While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (*.csx.*.s) use the byte markers to be able to be analyzed by IACA as well.
|
||||
To analyze them with IACA, use
|
||||
```
|
||||
gcc -c FILE.s
|
||||
iaca -arch SKX FILE.o
|
||||
```
|
||||
|
||||
------------
|
||||
The kernels currently contained in the examples are shown briefly in the following.
|
||||
|
||||
### Copy (`copy/`)
|
||||
```c
|
||||
double * restrict a, * restrict b;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Vector add (`add/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + c[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Vector update (`update/`)
|
||||
```c
|
||||
double * restrict a;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = scale * a[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Sum reduction (`sum_reduction/`)
|
||||
```c
|
||||
double * restrict a;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
scale = scale + a[i];
|
||||
}
|
||||
```
|
||||
For this kernel we noticed an overlap of the loop bodies when using gcc with the `-Ofast` flag (see this [blog post](https://blogs.fau.de/hager/archives/7658) for more information).
|
||||
We therefore additionally compiled all gcc versions with the `-O3` flag.
|
||||
These versions are named accordingly.
|
||||
|
||||
### DAXPY (`daxpy/`)
|
||||
```c
|
||||
double * restrict a, * restrict b;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = a[i] + scale * b[i];
|
||||
}
|
||||
```
|
||||
|
||||
### STREAM triad (`triad/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + scale * c[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Schönauer triad (`striad/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c, * restrict d;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + c[i] * d[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Gauss-Seidel method (`gs/`)
|
||||
```c
|
||||
double ** restrict a;
|
||||
|
||||
for(long k=1; k < size_k-1; ++k){
|
||||
for(long i=1; i < size_i-1; ++i){
|
||||
a[k][i] = scale * (
|
||||
a[k][i-1] + a[k+1][i]
|
||||
+ a[k][i+1] + a[k-1][i]
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Jacobi 2D (`j2d/`)
|
||||
```c
|
||||
double ** restrict a, ** restrict b;
|
||||
|
||||
for(long k=1; k < size_k-1; ++k){
|
||||
for(long i=1; i < size_i-1; ++i){
|
||||
a[k][i] = 0.25 * (
|
||||
b[k][i-1] + b[k+1][i]
|
||||
+ b[k][i+1] + b[k-1][i]
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
For this kernel we noticed a discrepancy between measurements and predictions, especially when using AVX-512 instructions.
|
||||
We therefore additionally compiled the x86 kernels with AVX/SSE instructions and marked those kernels accordingly.
|
||||
36
examples/add/add.s.csx.gcc.s
Normal file
36
examples/add/add.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r14,%rax), %ymm3
|
||||
vmovupd 32(%r14,%rax), %ymm4
|
||||
vmovupd 64(%r14,%rax), %ymm6
|
||||
vmovupd 96(%r14,%rax), %ymm9
|
||||
vmovupd 128(%r14,%rax), %ymm11
|
||||
vmovupd 160(%r14,%rax), %ymm13
|
||||
vmovupd 192(%r14,%rax), %ymm15
|
||||
vmovupd 224(%r14,%rax), %ymm0
|
||||
vaddpd 0(%r13,%rax), %ymm3, %ymm7
|
||||
vaddpd 32(%r13,%rax), %ymm4, %ymm5
|
||||
vaddpd 64(%r13,%rax), %ymm6, %ymm8
|
||||
vaddpd 96(%r13,%rax), %ymm9, %ymm10
|
||||
vaddpd 128(%r13,%rax), %ymm11, %ymm12
|
||||
vaddpd 160(%r13,%rax), %ymm13, %ymm14
|
||||
vaddpd 192(%r13,%rax), %ymm15, %ymm1
|
||||
vaddpd 224(%r13,%rax), %ymm0, %ymm2
|
||||
vmovupd %ymm7, (%r12,%rax)
|
||||
vmovupd %ymm5, 32(%r12,%rax)
|
||||
vmovupd %ymm8, 64(%r12,%rax)
|
||||
vmovupd %ymm10, 96(%r12,%rax)
|
||||
vmovupd %ymm12, 128(%r12,%rax)
|
||||
vmovupd %ymm14, 160(%r12,%rax)
|
||||
vmovupd %ymm1, 192(%r12,%rax)
|
||||
vmovupd %ymm2, 224(%r12,%rax)
|
||||
addq $256, %rax
|
||||
cmpq %rax, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
19
examples/add/add.s.csx.icc.s
Normal file
19
examples/add/add.s.csx.icc.s
Normal file
@@ -0,0 +1,19 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%rcx,%rax,8), %zmm1 #78.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm3 #78.5
|
||||
vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
|
||||
vmovupd %zmm2, (%r14,%rax,8) #78.5
|
||||
vmovupd %zmm4, 64(%r14,%rax,8) #78.5
|
||||
addq $16, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
91
examples/add/add.s.tx2.clang.s
Normal file
91
examples/add/add.s.tx2.clang.s
Normal file
@@ -0,0 +1,91 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q4, q5, [x9, #-224]
|
||||
ldp q2, q3, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fadd v2.2d, v2.2d, v0.2d
|
||||
fadd v3.2d, v3.2d, v1.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fadd v0.2d, v6.2d, v4.2d
|
||||
fadd v1.2d, v7.2d, v5.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-160]
|
||||
ldp q6, q7, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fadd v6.2d, v6.2d, v4.2d
|
||||
fadd v7.2d, v7.2d, v5.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
fadd v4.2d, v18.2d, v16.2d
|
||||
fadd v5.2d, v19.2d, v17.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
ldp q16, q17, [x9, #-128]
|
||||
ldp q19, q20, [x9, #-96]
|
||||
ldp q18, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fadd v16.2d, v18.2d, v16.2d
|
||||
fadd v18.2d, v21.2d, v17.2d
|
||||
stp q16, q18, [x11, #-128]
|
||||
fadd v17.2d, v22.2d, v19.2d
|
||||
fadd v19.2d, v23.2d, v20.2d
|
||||
stp q17, q19, [x11, #-96]
|
||||
ldp q20, q21, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q22, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
stp q20, q21, [x11, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v23.2d, v27.2d, v23.2d
|
||||
stp q22, q23, [x11, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q30, q31, [x10, #32]
|
||||
fadd v24.2d, v28.2d, v24.2d
|
||||
fadd v25.2d, v29.2d, v25.2d
|
||||
stp q24, q25, [x11]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q8, q10, [x10, #64]
|
||||
fadd v26.2d, v30.2d, v26.2d
|
||||
fadd v27.2d, v31.2d, v27.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fadd v28.2d, v8.2d, v28.2d
|
||||
fadd v29.2d, v10.2d, v29.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldp q8, q10, [x9, #128]
|
||||
ldp q13, q14, [x10, #128]
|
||||
ldp q3, q0, [x9, #192]
|
||||
ldp q1, q6, [x10, #192]
|
||||
fadd v30.2d, v11.2d, v30.2d
|
||||
fadd v31.2d, v12.2d, v31.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q11, q12, [x9, #160]
|
||||
fadd v8.2d, v13.2d, v8.2d
|
||||
fadd v10.2d, v14.2d, v10.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fadd v1.2d, v1.2d, v3.2d
|
||||
ldp q3, q4, [x9, #224]
|
||||
fadd v0.2d, v6.2d, v0.2d
|
||||
stp q1, q0, [x11, #192]
|
||||
ldp q5, q6, [x10, #224]
|
||||
fadd v11.2d, v13.2d, v11.2d
|
||||
fadd v2.2d, v14.2d, v12.2d
|
||||
stp q11, q2, [x11, #160]
|
||||
fadd v3.2d, v5.2d, v3.2d
|
||||
fadd v4.2d, v6.2d, v4.2d
|
||||
stp q3, q4, [x11, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/add/add.s.tx2.gcc.s
Normal file
45
examples/add/add.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x0, x10, 16
|
||||
ldr q29, [x21, x10]
|
||||
ldr q30, [x20, x10]
|
||||
add x7, x10, 32
|
||||
ldr q31, [x21, x0]
|
||||
ldr q2, [x20, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q5, [x21, x7]
|
||||
ldr q1, [x20, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q4, [x21, x6]
|
||||
ldr q0, [x20, x6]
|
||||
add x2, x10, 112
|
||||
fadd v7.2d, v29.2d, v30.2d
|
||||
ldr q3, [x21, x5]
|
||||
ldr q9, [x20, x5]
|
||||
fadd v6.2d, v31.2d, v2.2d
|
||||
ldr q19, [x21, x4]
|
||||
ldr q18, [x20, x4]
|
||||
fadd v20.2d, v5.2d, v1.2d
|
||||
ldr q21, [x21, x11]
|
||||
ldr q17, [x20, x11]
|
||||
fadd v22.2d, v4.2d, v0.2d
|
||||
ldr q23, [x21, x2]
|
||||
ldr q16, [x20, x2]
|
||||
fadd v24.2d, v3.2d, v9.2d
|
||||
fadd v25.2d, v19.2d, v18.2d
|
||||
fadd v26.2d, v21.2d, v17.2d
|
||||
str q7, [x19, x10]
|
||||
add x10, x10, 128
|
||||
fadd v27.2d, v23.2d, v16.2d
|
||||
str q6, [x19, x0]
|
||||
str q20, [x19, x7]
|
||||
str q22, [x19, x6]
|
||||
str q24, [x19, x5]
|
||||
str q25, [x19, x4]
|
||||
str q26, [x19, x11]
|
||||
str q27, [x19, x2]
|
||||
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/add/add.s.zen.gcc.s
Normal file
30
examples/add/add.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%r13,%rax), %xmm0
|
||||
vmovups 16(%r13,%rax), %xmm3
|
||||
vmovups 32(%r13,%rax), %xmm4
|
||||
vmovups 48(%r13,%rax), %xmm6
|
||||
vmovups 64(%r13,%rax), %xmm9
|
||||
vmovups 80(%r13,%rax), %xmm11
|
||||
vmovups 96(%r13,%rax), %xmm13
|
||||
vmovups 112(%r13,%rax), %xmm15
|
||||
vaddpd (%r12,%rax), %xmm0, %xmm7
|
||||
vaddpd 16(%r12,%rax), %xmm3, %xmm2
|
||||
vaddpd 32(%r12,%rax), %xmm4, %xmm5
|
||||
vaddpd 48(%r12,%rax), %xmm6, %xmm8
|
||||
vaddpd 64(%r12,%rax), %xmm9, %xmm10
|
||||
vaddpd 80(%r12,%rax), %xmm11, %xmm12
|
||||
vaddpd 96(%r12,%rax), %xmm13, %xmm14
|
||||
vaddpd 112(%r12,%rax), %xmm15, %xmm1
|
||||
vmovups %xmm7, 0(%rbp,%rax)
|
||||
vmovups %xmm2, 16(%rbp,%rax)
|
||||
vmovups %xmm5, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm1, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/copy/copy.s.csx.gcc.s
Normal file
28
examples/copy/copy.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r12,%rcx), %ymm10
|
||||
vmovupd 32(%r12,%rcx), %ymm11
|
||||
vmovupd 64(%r12,%rcx), %ymm12
|
||||
vmovupd 96(%r12,%rcx), %ymm13
|
||||
vmovupd 128(%r12,%rcx), %ymm14
|
||||
vmovupd 160(%r12,%rcx), %ymm15
|
||||
vmovupd 192(%r12,%rcx), %ymm0
|
||||
vmovupd 224(%r12,%rcx), %ymm1
|
||||
vmovupd %ymm10, 0(%r13,%rcx)
|
||||
vmovupd %ymm11, 32(%r13,%rcx)
|
||||
vmovupd %ymm12, 64(%r13,%rcx)
|
||||
vmovupd %ymm13, 96(%r13,%rcx)
|
||||
vmovupd %ymm14, 128(%r13,%rcx)
|
||||
vmovupd %ymm15, 160(%r13,%rcx)
|
||||
vmovupd %ymm0, 192(%r13,%rcx)
|
||||
vmovupd %ymm1, 224(%r13,%rcx)
|
||||
addq $256, %rcx
|
||||
cmpq %rcx, %r10
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/copy/copy.s.csx.icc.s
Normal file
15
examples/copy/copy.s.csx.icc.s
Normal file
@@ -0,0 +1,15 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r14,%rax,8), %zmm1 #79.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #79.5
|
||||
addq $8, %rax #79.5
|
||||
cmpq %r12, %rax #79.5
|
||||
jb ..B1.39 # Prob 82% #79.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
42
examples/copy/copy.s.tx2.clang.s
Normal file
42
examples/copy/copy.s.tx2.clang.s
Normal file
@@ -0,0 +1,42 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
stp q0, q1, [x10, #-256]
|
||||
stp q2, q3, [x10, #-224]
|
||||
add x8, x8, #64 // =64
|
||||
ldp q0, q1, [x9]
|
||||
ldp q2, q3, [x9, #32]
|
||||
stp q0, q1, [x10]
|
||||
stp q2, q3, [x10, #32]
|
||||
ldp q0, q1, [x9, #-192]
|
||||
ldp q2, q3, [x9, #-160]
|
||||
stp q0, q1, [x10, #-192]
|
||||
stp q2, q3, [x10, #-160]
|
||||
ldp q0, q1, [x9, #64]
|
||||
ldp q2, q3, [x9, #96]
|
||||
stp q0, q1, [x10, #64]
|
||||
stp q2, q3, [x10, #96]
|
||||
ldp q0, q1, [x9, #-128]
|
||||
ldp q2, q3, [x9, #-96]
|
||||
stp q0, q1, [x10, #-128]
|
||||
stp q2, q3, [x10, #-96]
|
||||
ldp q0, q1, [x9, #128]
|
||||
ldp q2, q3, [x9, #160]
|
||||
stp q0, q1, [x10, #128]
|
||||
stp q2, q3, [x10, #160]
|
||||
ldp q0, q1, [x9, #-64]
|
||||
ldp q2, q3, [x9, #-32]
|
||||
stp q0, q1, [x10, #-64]
|
||||
stp q2, q3, [x10, #-32]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
add x9, x9, #512 // =512
|
||||
stp q0, q1, [x10, #192]
|
||||
stp q2, q3, [x10, #224]
|
||||
add x10, x10, #512 // =512
|
||||
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
29
examples/copy/copy.s.tx2.gcc.s
Normal file
29
examples/copy/copy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,29 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x16, x15, 16
|
||||
ldr q9, [x19, x15]
|
||||
add x30, x15, 32
|
||||
add x17, x15, 48
|
||||
ldr q16, [x19, x16]
|
||||
ldr q18, [x19, x30]
|
||||
add x18, x15, 64
|
||||
add x1, x15, 80
|
||||
ldr q17, [x19, x17]
|
||||
ldr q19, [x19, x18]
|
||||
add x3, x15, 96
|
||||
add x2, x15, 112
|
||||
ldr q20, [x19, x1]
|
||||
ldr q21, [x19, x3]
|
||||
str q9, [x20, x15]
|
||||
ldr q22, [x19, x2]
|
||||
add x15, x15, 128
|
||||
str q16, [x20, x16]
|
||||
str q18, [x20, x30]
|
||||
str q17, [x20, x17]
|
||||
str q19, [x20, x18]
|
||||
str q20, [x20, x1]
|
||||
str q21, [x20, x3]
|
||||
str q22, [x20, x2]
|
||||
cmp x23, x15
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/copy/copy.s.zen.gcc.s
Normal file
22
examples/copy/copy.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%rbp,%r10), %xmm9
|
||||
vmovups 16(%rbp,%r10), %xmm10
|
||||
vmovups 32(%rbp,%r10), %xmm11
|
||||
vmovups 48(%rbp,%r10), %xmm12
|
||||
vmovups 64(%rbp,%r10), %xmm13
|
||||
vmovups 80(%rbp,%r10), %xmm14
|
||||
vmovups 96(%rbp,%r10), %xmm15
|
||||
vmovups 112(%rbp,%r10), %xmm0
|
||||
vmovups %xmm9, (%r12,%r10)
|
||||
vmovups %xmm10, 16(%r12,%r10)
|
||||
vmovups %xmm11, 32(%r12,%r10)
|
||||
vmovups %xmm12, 48(%r12,%r10)
|
||||
vmovups %xmm13, 64(%r12,%r10)
|
||||
vmovups %xmm14, 80(%r12,%r10)
|
||||
vmovups %xmm15, 96(%r12,%r10)
|
||||
vmovups %xmm0, 112(%r12,%r10)
|
||||
subq $-128, %r10
|
||||
cmpq %r10, %r15
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
@@ -1,13 +0,0 @@
|
||||
|
||||
void daxpy(int N){
|
||||
void dummy(double*, double*);
|
||||
double a[N], b[N];
|
||||
double s;
|
||||
|
||||
//STARTLOOP
|
||||
for(int i=0; i<N; ++i)
|
||||
a[i] = a[i] + s * b[i];
|
||||
|
||||
dummy(&a[1], &b[1]);
|
||||
}
|
||||
|
||||
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd 0(%r13,%rsi), %ymm14
|
||||
vmovupd 32(%r13,%rsi), %ymm15
|
||||
vmovupd 64(%r13,%rsi), %ymm1
|
||||
vmovupd 96(%r13,%rsi), %ymm0
|
||||
vmovupd 128(%r13,%rsi), %ymm3
|
||||
vmovupd 160(%r13,%rsi), %ymm4
|
||||
vmovupd 192(%r13,%rsi), %ymm5
|
||||
vmovupd 224(%r13,%rsi), %ymm7
|
||||
vfmadd213pd (%r12,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7
|
||||
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
addq $256, %rsi
|
||||
cmpq %rsi, %r10
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r13,%rax,8), %zmm1 #77.5
|
||||
vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5
|
||||
vmovupd %zmm1, (%r14,%rax,8) #77.5
|
||||
addq $8, %rax #77.5
|
||||
cmpq %rbx, %rax #77.5
|
||||
jb ..B1.39 # Prob 82% #77.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
@@ -0,0 +1,90 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q1, q2, [x9, #-256]
|
||||
ldp q3, q0, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v1.2d, v4.2d, v31.2d
|
||||
fmla v2.2d, v5.2d, v31.2d
|
||||
stp q1, q2, [x9, #-256]
|
||||
fmla v3.2d, v6.2d, v31.2d
|
||||
fmla v0.2d, v7.2d, v31.2d
|
||||
stp q3, q0, [x9, #-224]
|
||||
ldp q5, q6, [x9, #-192]
|
||||
ldp q7, q4, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v5.2d, v16.2d, v31.2d
|
||||
fmla v6.2d, v17.2d, v31.2d
|
||||
stp q5, q6, [x9, #-192]
|
||||
fmla v7.2d, v18.2d, v31.2d
|
||||
fmla v4.2d, v19.2d, v31.2d
|
||||
stp q7, q4, [x9, #-160]
|
||||
ldp q19, q18, [x9, #-128]
|
||||
ldp q16, q17, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v18.2d, v21.2d, v31.2d
|
||||
fmla v16.2d, v22.2d, v31.2d
|
||||
ldp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v19.2d, v20.2d, v31.2d
|
||||
stp q19, q18, [x9, #-128]
|
||||
fmla v17.2d, v23.2d, v31.2d
|
||||
stp q16, q17, [x9, #-96]
|
||||
ldp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v21.2d, v24.2d, v31.2d
|
||||
fmla v22.2d, v25.2d, v31.2d
|
||||
stp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fmla v23.2d, v26.2d, v31.2d
|
||||
fmla v20.2d, v27.2d, v31.2d
|
||||
stp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9]
|
||||
ldp q28, q29, [x10, #32]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #32]
|
||||
ldp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #64]
|
||||
ldp q26, q27, [x9, #96]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #96]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #96]
|
||||
ldp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x10, #128]
|
||||
fmla v24.2d, v26.2d, v31.2d
|
||||
fmla v25.2d, v27.2d, v31.2d
|
||||
stp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x9, #160]
|
||||
ldp q1, q2, [x10, #160]
|
||||
fmla v26.2d, v1.2d, v31.2d
|
||||
fmla v27.2d, v2.2d, v31.2d
|
||||
stp q26, q27, [x9, #160]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x10, #192]
|
||||
fmla v0.2d, v2.2d, v31.2d
|
||||
fmla v1.2d, v3.2d, v31.2d
|
||||
stp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
ldp q4, q5, [x10, #224]
|
||||
fmla v2.2d, v4.2d, v31.2d
|
||||
fmla v3.2d, v5.2d, v31.2d
|
||||
stp q2, q3, [x9, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,41 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x5, x3
|
||||
ldr q23, [x10]
|
||||
ldr q24, [x5], 16
|
||||
mov x6, x10
|
||||
ldr q25, [x3, 16]
|
||||
ldr q26, [x3, 48]
|
||||
add x10, x10, 128
|
||||
add x3, x3, 128
|
||||
ldr q27, [x3, -64]
|
||||
ldr q28, [x3, -48]
|
||||
ldr q29, [x3, -32]
|
||||
ldr q30, [x3, -16]
|
||||
fmla v23.2d, v3.2d, v24.2d
|
||||
ldr q31, [x5, 16]
|
||||
str q23, [x6], 16
|
||||
ldr q0, [x10, -112]
|
||||
fmla v0.2d, v3.2d, v25.2d
|
||||
str q0, [x10, -112]
|
||||
ldr q2, [x6, 16]
|
||||
fmla v2.2d, v3.2d, v31.2d
|
||||
str q2, [x6, 16]
|
||||
ldr q5, [x10, -80]
|
||||
ldr q4, [x10, -64]
|
||||
ldr q6, [x10, -48]
|
||||
ldr q1, [x10, -32]
|
||||
ldr q7, [x10, -16]
|
||||
fmla v5.2d, v3.2d, v26.2d
|
||||
fmla v4.2d, v3.2d, v27.2d
|
||||
fmla v6.2d, v3.2d, v28.2d
|
||||
fmla v1.2d, v3.2d, v29.2d
|
||||
fmla v7.2d, v3.2d, v30.2d
|
||||
str q5, [x10, -80]
|
||||
str q4, [x10, -64]
|
||||
str q6, [x10, -48]
|
||||
str q1, [x10, -32]
|
||||
str q7, [x10, -16]
|
||||
cmp x23, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups (%r12,%rax), %xmm12
|
||||
vmovups 16(%r12,%rax), %xmm13
|
||||
vmovups 32(%r12,%rax), %xmm14
|
||||
vmovups 48(%r12,%rax), %xmm15
|
||||
vmovups 64(%r12,%rax), %xmm1
|
||||
vmovups 80(%r12,%rax), %xmm0
|
||||
vmovups 96(%r12,%rax), %xmm4
|
||||
vmovups 112(%r12,%rax), %xmm5
|
||||
vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5
|
||||
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %r15, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
67
examples/gs/gs.s.csx.gcc.s
Normal file
67
examples/gs/gs.s.csx.gcc.s
Normal file
@@ -0,0 +1,67 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L31:
|
||||
vmovsd (%rax,%rsi,8), %xmm7
|
||||
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 8(%rax), %xmm7, %xmm10
|
||||
leaq 8(%rax), %rdx
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, (%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd 16(%rax), %xmm14, %xmm15
|
||||
leaq 16(%rax), %rdx
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 8(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
vaddsd 24(%rax), %xmm2, %xmm4
|
||||
leaq 24(%rax), %rdx
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 32(%rax), %xmm7, %xmm10
|
||||
leaq 32(%rax), %rdx
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, 24(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd 40(%rax), %xmm14, %xmm15
|
||||
leaq 40(%rax), %rdx
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 32(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
vaddsd 48(%rax), %xmm2, %xmm4
|
||||
leaq 48(%rax), %rdx
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 40(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 56(%rax), %xmm7, %xmm10
|
||||
leaq 56(%rax), %rdx
|
||||
addq $64, %rax
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, -16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd (%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm8
|
||||
vmovsd %xmm8, -8(%rax)
|
||||
cmpq %r8, %rax
|
||||
jne .L31
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
36
examples/gs/gs.s.csx.icc.s
Normal file
36
examples/gs/gs.s.csx.icc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.58: # Preds ..B1.58 ..B1.57
|
||||
# Execution count [9.36e+01]
|
||||
vmovsd 8(%r11,%r10), %xmm2 #55.35
|
||||
incq %r15 #54.9
|
||||
vaddsd 16(%r11,%r12), %xmm2, %xmm3 #55.12
|
||||
vaddsd 8(%r11,%rbx), %xmm3, %xmm4 #55.12
|
||||
vaddsd %xmm1, %xmm4, %xmm1 #55.12
|
||||
vmulsd %xmm1, %xmm0, %xmm5 #55.12
|
||||
vmovsd %xmm5, 8(%r11,%r12) #55.12
|
||||
vaddsd 16(%r11,%r10), %xmm5, %xmm6 #55.48
|
||||
vaddsd 24(%r11,%r12), %xmm6, %xmm7 #55.63
|
||||
vaddsd 16(%r11,%rbx), %xmm7, %xmm8 #55.79
|
||||
vmulsd %xmm8, %xmm0, %xmm9 #55.12
|
||||
vmovsd %xmm9, 16(%r11,%r12) #55.12
|
||||
vaddsd 24(%r11,%r10), %xmm9, %xmm10 #55.48
|
||||
vaddsd 32(%r11,%r12), %xmm10, %xmm11 #55.63
|
||||
vaddsd 24(%r11,%rbx), %xmm11, %xmm12 #55.79
|
||||
vmulsd %xmm12, %xmm0, %xmm13 #55.12
|
||||
vmovsd %xmm13, 24(%r11,%r12) #55.12
|
||||
vaddsd 32(%r11,%r10), %xmm13, %xmm14 #55.48
|
||||
vaddsd 40(%r11,%r12), %xmm14, %xmm15 #55.63
|
||||
vaddsd 32(%r11,%rbx), %xmm15, %xmm16 #55.79
|
||||
vmulsd %xmm16, %xmm0, %xmm1 #55.12
|
||||
vmovsd %xmm1, 32(%r11,%r12) #55.12
|
||||
addq $32, %r11 #54.9
|
||||
cmpq %r14, %r15 #54.9
|
||||
jb ..B1.58 # Prob 28% #54.9
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
|
||||
19
examples/gs/gs.s.tx2.clang.s
Normal file
19
examples/gs/gs.s.tx2.clang.s
Normal file
@@ -0,0 +1,19 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB0_62: // %L.LB1_398.1
|
||||
// Parent Loop BB0_50 Depth=1
|
||||
// Parent Loop BB0_55 Depth=2
|
||||
// Parent Loop BB0_59 Depth=3
|
||||
// => This Inner Loop Header: Depth=4
|
||||
ldr d1, [x7], #8
|
||||
fadd d0, d1, d0
|
||||
ldr d2, [x22]
|
||||
ldr d3, [x23], #8
|
||||
fadd d2, d2, d3
|
||||
fadd d0, d0, d2
|
||||
sub w26, w26, #1 // =1
|
||||
fmul d0, d0, d9
|
||||
stur d0, [x22, #-8]
|
||||
add x22, x22, #8 // =8
|
||||
cmp w26, #2 // =2
|
||||
b.gt .LBB0_62
|
||||
// OSACA-END
|
||||
41
examples/gs/gs.s.tx2.gcc.s
Normal file
41
examples/gs/gs.s.tx2.gcc.s
Normal file
@@ -0,0 +1,41 @@
|
||||
// OSACA-BEGIN
|
||||
.L20:
|
||||
ldr d31, [x15, x18, lsl 3]
|
||||
ldr d0, [x15, 8]
|
||||
mov x14, x15
|
||||
add x16, x15, 24
|
||||
ldr d2, [x15, x30, lsl 3]
|
||||
add x15, x15, 32
|
||||
fadd d1, d31, d0
|
||||
fadd d3, d1, d30
|
||||
fadd d4, d3, d2
|
||||
fmul d5, d4, d9
|
||||
str d5, [x14], 8
|
||||
ldr d6, [x14, x18, lsl 3]
|
||||
ldr d16, [x14, 8]
|
||||
add x13, x14, 8
|
||||
ldr d7, [x14, x30, lsl 3]
|
||||
fadd d17, d6, d16
|
||||
fadd d18, d17, d5
|
||||
fadd d19, d18, d7
|
||||
fmul d20, d19, d9
|
||||
str d20, [x15, -24]
|
||||
ldr d21, [x13, x18, lsl 3]
|
||||
ldr d23, [x14, 16]
|
||||
ldr d22, [x13, x30, lsl 3]
|
||||
fadd d24, d21, d23
|
||||
fadd d25, d24, d20
|
||||
fadd d26, d25, d22
|
||||
fmul d27, d26, d9
|
||||
str d27, [x14, 8]
|
||||
ldr d30, [x15]
|
||||
ldr d28, [x16, x18, lsl 3]
|
||||
ldr d29, [x16, x30, lsl 3]
|
||||
fadd d31, d28, d30
|
||||
fadd d2, d31, d27
|
||||
fadd d0, d2, d29
|
||||
fmul d30, d0, d9
|
||||
str d30, [x15, -8]
|
||||
cmp x7, x15
|
||||
bne .L20
|
||||
// OSACA-END
|
||||
61
examples/gs/gs.s.zen.gcc.s
Normal file
61
examples/gs/gs.s.zen.gcc.s
Normal file
@@ -0,0 +1,61 @@
|
||||
# OSACA-BEGIN
|
||||
.L32:
|
||||
vmovsd (%rax,%rsi,8), %xmm7
|
||||
leaq 8(%rax), %rdx
|
||||
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 8(%rax), %xmm7, %xmm10
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, (%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
leaq 16(%rax), %rdx
|
||||
vaddsd 16(%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 8(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
leaq 24(%rax), %rdx
|
||||
vaddsd 24(%rax), %xmm2, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
leaq 32(%rax), %rdx
|
||||
vaddsd 32(%rax), %xmm7, %xmm10
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, 24(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
leaq 40(%rax), %rdx
|
||||
vaddsd 40(%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 32(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
leaq 48(%rax), %rdx
|
||||
vaddsd 48(%rax), %xmm2, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 40(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
leaq 56(%rax), %rdx
|
||||
vaddsd 56(%rax), %xmm7, %xmm10
|
||||
addq $64, %rax
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, -16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd (%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm8
|
||||
vmovsd %xmm8, -8(%rax)
|
||||
cmpq %r8, %rax
|
||||
jne .L32
|
||||
# OSACA-END
|
||||
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
@@ -0,0 +1,40 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L21:
|
||||
vmovupd (%r8,%rax), %ymm11
|
||||
vmovupd (%rsi,%rax), %ymm13
|
||||
vaddpd (%r9,%rax), %ymm11, %ymm12
|
||||
vaddpd (%rdi,%rax), %ymm13, %ymm14
|
||||
vmovupd 32(%r8,%rax), %ymm1
|
||||
vmovupd 32(%rsi,%rax), %ymm2
|
||||
vaddpd %ymm14, %ymm12, %ymm15
|
||||
vaddpd 32(%r9,%rax), %ymm1, %ymm5
|
||||
vaddpd 32(%rdi,%rax), %ymm2, %ymm7
|
||||
vmulpd %ymm8, %ymm15, %ymm0
|
||||
vmovupd 64(%r8,%rax), %ymm10
|
||||
vaddpd %ymm7, %ymm5, %ymm6
|
||||
vmovupd 64(%rsi,%rax), %ymm12
|
||||
vmovupd 96(%rsi,%rax), %ymm5
|
||||
vmovupd %ymm0, (%rdx,%rax)
|
||||
vmovupd 96(%r8,%rax), %ymm0
|
||||
vaddpd 64(%r9,%rax), %ymm10, %ymm11
|
||||
vaddpd 64(%rdi,%rax), %ymm12, %ymm13
|
||||
vaddpd 96(%r9,%rax), %ymm0, %ymm1
|
||||
vaddpd 96(%rdi,%rax), %ymm5, %ymm2
|
||||
vaddpd %ymm13, %ymm11, %ymm14
|
||||
vmulpd %ymm8, %ymm6, %ymm9
|
||||
vaddpd %ymm2, %ymm1, %ymm7
|
||||
vmulpd %ymm8, %ymm14, %ymm15
|
||||
vmulpd %ymm8, %ymm7, %ymm6
|
||||
vmovupd %ymm9, 32(%rdx,%rax)
|
||||
vmovupd %ymm15, 64(%rdx,%rax)
|
||||
vmovupd %ymm6, 96(%rdx,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rax, %r15
|
||||
jne .L21
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
@@ -0,0 +1,46 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L28:
|
||||
movupd 16(%r8,%rax), %xmm11
|
||||
movupd 16(%rdi,%rax), %xmm12
|
||||
movupd 16(%rsi,%rax), %xmm13
|
||||
addpd %xmm11, %xmm15
|
||||
addpd %xmm13, %xmm12
|
||||
movupd 32(%rdi,%rax), %xmm14
|
||||
movupd 32(%rsi,%rax), %xmm0
|
||||
addpd %xmm15, %xmm12
|
||||
movupd 32(%r8,%rax), %xmm15
|
||||
addpd %xmm0, %xmm14
|
||||
addpd %xmm15, %xmm11
|
||||
movupd 48(%rdi,%rax), %xmm1
|
||||
movupd 48(%rsi,%rax), %xmm7
|
||||
addpd %xmm11, %xmm14
|
||||
addpd %xmm7, %xmm1
|
||||
mulpd %xmm2, %xmm12
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm12, 16(%rcx,%rax)
|
||||
movups %xmm14, 32(%rcx,%rax)
|
||||
movupd 48(%r8,%rax), %xmm14
|
||||
addpd %xmm14, %xmm15
|
||||
addpd %xmm15, %xmm1
|
||||
mulpd %xmm2, %xmm1
|
||||
movups %xmm1, 48(%rcx,%rax)
|
||||
addq $64, %rax
|
||||
.L21:
|
||||
movupd (%r8,%rax), %xmm15
|
||||
movupd (%rdi,%rax), %xmm0
|
||||
movupd (%rsi,%rax), %xmm1
|
||||
addpd %xmm15, %xmm14
|
||||
addpd %xmm1, %xmm0
|
||||
leaq 16(%rax), %r10
|
||||
addpd %xmm0, %xmm14
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm14, (%rcx,%rax)
|
||||
cmpq %r10, %r14
|
||||
jne .L28
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
@@ -0,0 +1,37 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.47 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5
|
||||
vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5
|
||||
vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5
|
||||
vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmulpd %ymm9, %ymm0, %ymm10 #94.5
|
||||
vmulpd %ymm14, %ymm0, %ymm15 #94.5
|
||||
vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm15, 10080(%r9,%rcx,8) #94.5
|
||||
vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5
|
||||
addq $16, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
@@ -0,0 +1,69 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.63 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.48: # Preds ..B1.47
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.51: # Preds ..B1.48
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.52: # Preds ..B1.51
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.55: # Preds ..B1.52
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.56: # Preds ..B1.55
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.59: # Preds ..B1.56
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.60: # Preds ..B1.59
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.63: # Preds ..B1.60
|
||||
# Execution count [1.15e+04]
|
||||
vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5
|
||||
addq $32, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
@@ -0,0 +1,40 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.42: # Preds ..B1.42 ..B1.41
|
||||
# Execution count [1.15e+04]
|
||||
movups 10016(%r8,%rcx,8), %xmm0 #94.5
|
||||
addpd 16(%r12,%rcx,8), %xmm0 #94.5
|
||||
addpd 20032(%r10,%rcx,8), %xmm0 #94.5
|
||||
movups 10032(%r8,%rcx,8), %xmm2 #94.5
|
||||
movups 32(%r12,%rcx,8), %xmm1 #94.5
|
||||
addpd %xmm2, %xmm0 #94.5
|
||||
addpd %xmm1, %xmm2 #94.5
|
||||
mulpd %xmm7, %xmm0 #94.5
|
||||
addpd 20048(%r10,%rcx,8), %xmm2 #94.5
|
||||
movups 10048(%r8,%rcx,8), %xmm4 #94.5
|
||||
movups 48(%r12,%rcx,8), %xmm3 #94.5
|
||||
addpd %xmm4, %xmm2 #94.5
|
||||
addpd %xmm3, %xmm4 #94.5
|
||||
mulpd %xmm7, %xmm2 #94.5
|
||||
addpd 20064(%r10,%rcx,8), %xmm4 #94.5
|
||||
movups 10064(%r8,%rcx,8), %xmm6 #94.5
|
||||
movups 64(%r12,%rcx,8), %xmm5 #94.5
|
||||
addpd %xmm6, %xmm4 #94.5
|
||||
addpd %xmm5, %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm4 #94.5
|
||||
addpd 20080(%r10,%rcx,8), %xmm6 #94.5
|
||||
addpd 10080(%r8,%rcx,8), %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm6 #94.5
|
||||
movups %xmm0, 10016(%r9,%rcx,8) #94.5
|
||||
movups %xmm2, 10032(%r9,%rcx,8) #94.5
|
||||
movups %xmm4, 10048(%r9,%rcx,8) #94.5
|
||||
movups %xmm6, 10064(%r9,%rcx,8) #94.5
|
||||
addq $8, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.42 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
@@ -0,0 +1,131 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_16 Depth=1
|
||||
// Parent Loop BB1_19 Depth=2
|
||||
// Parent Loop BB1_24 Depth=3
|
||||
// => This Inner Loop Header: Depth=4
|
||||
add x0, x5, x16
|
||||
add x18, x21, x16
|
||||
ldp q4, q5, [x0, #16]
|
||||
ldp q6, q7, [x0, #48]
|
||||
ldur q0, [x18, #8]
|
||||
ldur q1, [x18, #24]
|
||||
ldur q2, [x18, #40]
|
||||
ldur q3, [x18, #56]
|
||||
add x1, x28, x16
|
||||
add x15, x15, #32 // =32
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v4.2d, v5.2d, v1.2d
|
||||
fadd v5.2d, v6.2d, v2.2d
|
||||
fadd v6.2d, v7.2d, v3.2d
|
||||
ldp q7, q16, [x1, #16]
|
||||
fadd v1.2d, v7.2d, v1.2d
|
||||
ldp q17, q18, [x1, #48]
|
||||
ldur q19, [x18, #72]
|
||||
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v1.2d, v16.2d, v2.2d
|
||||
fadd v2.2d, v17.2d, v3.2d
|
||||
fadd v3.2d, v18.2d, v19.2d
|
||||
ldp q16, q17, [x0, #80]
|
||||
ldp q18, q19, [x0, #112]
|
||||
fadd v1.2d, v4.2d, v1.2d
|
||||
fadd v2.2d, v5.2d, v2.2d
|
||||
fadd v3.2d, v6.2d, v3.2d
|
||||
ldur q4, [x18, #72]
|
||||
ldur q5, [x18, #88]
|
||||
ldur q6, [x18, #104]
|
||||
ldur q7, [x18, #120]
|
||||
fadd v4.2d, v16.2d, v4.2d
|
||||
fadd v16.2d, v17.2d, v5.2d
|
||||
fadd v17.2d, v18.2d, v6.2d
|
||||
fadd v18.2d, v19.2d, v7.2d
|
||||
ldp q19, q20, [x1, #80]
|
||||
fadd v5.2d, v19.2d, v5.2d
|
||||
ldp q21, q22, [x1, #112]
|
||||
ldur q23, [x18, #136]
|
||||
fadd v4.2d, v4.2d, v5.2d
|
||||
fadd v5.2d, v20.2d, v6.2d
|
||||
fadd v6.2d, v21.2d, v7.2d
|
||||
fadd v7.2d, v22.2d, v23.2d
|
||||
ldp q20, q21, [x0, #144]
|
||||
ldp q22, q23, [x0, #176]
|
||||
fadd v5.2d, v16.2d, v5.2d
|
||||
fadd v6.2d, v17.2d, v6.2d
|
||||
fadd v7.2d, v18.2d, v7.2d
|
||||
ldur q16, [x18, #136]
|
||||
ldur q17, [x18, #152]
|
||||
ldur q18, [x18, #168]
|
||||
ldur q19, [x18, #184]
|
||||
fadd v16.2d, v20.2d, v16.2d
|
||||
fadd v20.2d, v21.2d, v17.2d
|
||||
fadd v21.2d, v22.2d, v18.2d
|
||||
fadd v22.2d, v23.2d, v19.2d
|
||||
ldp q23, q24, [x1, #144]
|
||||
fadd v17.2d, v23.2d, v17.2d
|
||||
ldp q25, q26, [x1, #176]
|
||||
fadd v16.2d, v16.2d, v17.2d
|
||||
fadd v17.2d, v24.2d, v18.2d
|
||||
fadd v18.2d, v25.2d, v19.2d
|
||||
ldp q24, q25, [x0, #208]
|
||||
ldur q23, [x18, #200]
|
||||
fadd v17.2d, v20.2d, v17.2d
|
||||
fadd v18.2d, v21.2d, v18.2d
|
||||
ldur q20, [x18, #200]
|
||||
ldur q21, [x18, #216]
|
||||
fadd v19.2d, v26.2d, v23.2d
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v24.2d, v25.2d, v21.2d
|
||||
ldp q25, q26, [x1, #208]
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
fadd v20.2d, v20.2d, v21.2d
|
||||
ldp q21, q25, [x0, #240]
|
||||
fadd v19.2d, v22.2d, v19.2d
|
||||
ldur q22, [x18, #232]
|
||||
fadd v21.2d, v21.2d, v22.2d
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v22.2d, v24.2d, v22.2d
|
||||
ldp q24, q26, [x1, #240]
|
||||
ldur q23, [x18, #248]
|
||||
fadd v25.2d, v25.2d, v23.2d
|
||||
fadd v23.2d, v24.2d, v23.2d
|
||||
add x18, x18, #264 // =264
|
||||
fmul v0.2d, v0.2d, v28.2d
|
||||
fmul v1.2d, v1.2d, v28.2d
|
||||
fmul v2.2d, v2.2d, v28.2d
|
||||
fmul v5.2d, v5.2d, v28.2d
|
||||
fadd v21.2d, v21.2d, v23.2d
|
||||
ldr q23, [x18]
|
||||
add x18, x25, x16
|
||||
stur q0, [x18, #8]
|
||||
stur q1, [x18, #24]
|
||||
fmul v3.2d, v3.2d, v28.2d
|
||||
stur q2, [x18, #40]
|
||||
fadd v23.2d, v26.2d, v23.2d
|
||||
stur q5, [x18, #88]
|
||||
fmul v4.2d, v4.2d, v28.2d
|
||||
stur q3, [x18, #56]
|
||||
fmul v6.2d, v6.2d, v28.2d
|
||||
stur q4, [x18, #72]
|
||||
fmul v0.2d, v7.2d, v28.2d
|
||||
stur q6, [x18, #104]
|
||||
fmul v1.2d, v16.2d, v28.2d
|
||||
stur q0, [x18, #120]
|
||||
fmul v2.2d, v17.2d, v28.2d
|
||||
stur q1, [x18, #136]
|
||||
fmul v4.2d, v19.2d, v28.2d
|
||||
stur q2, [x18, #152]
|
||||
fadd v5.2d, v25.2d, v23.2d
|
||||
stur q4, [x18, #184]
|
||||
fmul v3.2d, v18.2d, v28.2d
|
||||
stur q3, [x18, #168]
|
||||
fmul v6.2d, v20.2d, v28.2d
|
||||
stur q6, [x18, #200]
|
||||
fmul v0.2d, v22.2d, v28.2d
|
||||
stur q0, [x18, #216]
|
||||
fmul v1.2d, v21.2d, v28.2d
|
||||
stur q1, [x18, #232]
|
||||
add x16, x16, #256 // =256
|
||||
fmul v2.2d, v5.2d, v28.2d
|
||||
stur q2, [x18, #248]
|
||||
adds x17, x17, #4 // =4
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
@@ -0,0 +1,43 @@
|
||||
// OSACA-BEGIN
|
||||
.L93:
|
||||
add x5, x0, 16
|
||||
ldr q2, [x14, x0]
|
||||
ldr q5, [x25, x0]
|
||||
add x7, x0, 32
|
||||
ldr q13, [x22, x0]
|
||||
ldr q4, [x25, x5]
|
||||
add x6, x0, 48
|
||||
ldr x9, [sp, 144]
|
||||
ldr q19, [x22, x5]
|
||||
ldr q7, [x14, x5]
|
||||
ldr q6, [x14, x7]
|
||||
ldr q3, [x25, x7]
|
||||
ldr q18, [x22, x7]
|
||||
fadd v17.2d, v2.2d, v30.2d
|
||||
ldr q16, [x14, x6]
|
||||
ldr q20, [x25, x6]
|
||||
fadd v23.2d, v5.2d, v13.2d
|
||||
ldr q22, [x22, x6]
|
||||
fadd v24.2d, v4.2d, v19.2d
|
||||
fadd v25.2d, v7.2d, v2.2d
|
||||
fadd v27.2d, v6.2d, v7.2d
|
||||
fadd v26.2d, v3.2d, v18.2d
|
||||
fadd v28.2d, v16.2d, v6.2d
|
||||
mov v30.16b, v16.16b
|
||||
fadd v29.2d, v20.2d, v22.2d
|
||||
fadd v31.2d, v23.2d, v17.2d
|
||||
fadd v0.2d, v24.2d, v25.2d
|
||||
fadd v2.2d, v26.2d, v27.2d
|
||||
fadd v1.2d, v29.2d, v28.2d
|
||||
fmul v5.2d, v31.2d, v21.2d
|
||||
fmul v13.2d, v0.2d, v21.2d
|
||||
fmul v4.2d, v2.2d, v21.2d
|
||||
fmul v19.2d, v1.2d, v21.2d
|
||||
str q5, [x28, x0]
|
||||
add x0, x0, 64
|
||||
str q13, [x28, x5]
|
||||
str q4, [x28, x7]
|
||||
str q19, [x28, x6]
|
||||
cmp x9, x0
|
||||
bne .L93
|
||||
// OSACA-END
|
||||
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# OSACA-BEGIN
|
||||
.L28:
|
||||
vmovups (%r10,%rcx), %xmm5
|
||||
vmovups 32(%r10,%rax), %xmm13
|
||||
vmovups (%rdi,%rcx), %xmm1
|
||||
vmovups 32(%rdi,%rax), %xmm14
|
||||
vmovups 48(%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rcx), %xmm1, %xmm10
|
||||
vaddpd 32(%r8,%rax), %xmm14, %xmm15
|
||||
vaddpd 48(%r8,%rax), %xmm9, %xmm1
|
||||
vaddpd %xmm5, %xmm8, %xmm8
|
||||
vaddpd %xmm13, %xmm5, %xmm6
|
||||
vmovups 48(%r10,%rax), %xmm5
|
||||
vaddpd %xmm8, %xmm10, %xmm11
|
||||
vaddpd %xmm6, %xmm15, %xmm0
|
||||
vmulpd %xmm2, %xmm11, %xmm12
|
||||
vaddpd %xmm5, %xmm13, %xmm4
|
||||
vmulpd %xmm2, %xmm0, %xmm7
|
||||
vaddpd %xmm4, %xmm1, %xmm10
|
||||
vmovups %xmm12, (%rsi,%rcx)
|
||||
vmovups %xmm7, 32(%rsi,%rax)
|
||||
vmulpd %xmm2, %xmm10, %xmm8
|
||||
vmovups %xmm8, 48(%rsi,%rax)
|
||||
addq $64, %rax
|
||||
.L21:
|
||||
vmovups (%r10,%rax), %xmm8
|
||||
leaq 16(%rax), %rcx
|
||||
vmovups (%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rax), %xmm9, %xmm10
|
||||
vaddpd %xmm8, %xmm5, %xmm11
|
||||
vaddpd %xmm11, %xmm10, %xmm12
|
||||
vmulpd %xmm2, %xmm12, %xmm13
|
||||
vmovups %xmm13, (%rsi,%rax)
|
||||
cmpq %rcx, %r14
|
||||
jne .L28
|
||||
# OSACA-END
|
||||
@@ -1,13 +0,0 @@
|
||||
|
||||
void scale(int N){
|
||||
void dummy(double*, double*);
|
||||
double a[N], b[N];
|
||||
double s;
|
||||
|
||||
//STARTLOOP
|
||||
for(int i=0; i<N; ++i){
|
||||
a[i] = s * b[i];
|
||||
}
|
||||
|
||||
dummy(&a[1],&b[1]);
|
||||
}
|
||||
44
examples/striad/striad.s.csx.gcc.s
Normal file
44
examples/striad/striad.s.csx.gcc.s
Normal file
@@ -0,0 +1,44 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r15,%rax), %ymm5
|
||||
vmovupd 0(%r13,%rax), %ymm6
|
||||
vmovupd 32(%r15,%rax), %ymm8
|
||||
vmovupd 32(%r13,%rax), %ymm7
|
||||
vmovupd 64(%r15,%rax), %ymm9
|
||||
vmovupd 64(%r13,%rax), %ymm10
|
||||
vmovupd 96(%r15,%rax), %ymm11
|
||||
vmovupd 96(%r13,%rax), %ymm12
|
||||
vmovupd 128(%r15,%rax), %ymm13
|
||||
vmovupd 128(%r13,%rax), %ymm14
|
||||
vmovupd 160(%r15,%rax), %ymm15
|
||||
vmovupd 160(%r13,%rax), %ymm2
|
||||
vmovupd 192(%r15,%rax), %ymm0
|
||||
vmovupd 192(%r13,%rax), %ymm1
|
||||
vmovupd 224(%r15,%rax), %ymm3
|
||||
vmovupd 224(%r13,%rax), %ymm4
|
||||
vfmadd132pd (%r14,%rax), %ymm6, %ymm5
|
||||
vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8
|
||||
vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9
|
||||
vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11
|
||||
vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13
|
||||
vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15
|
||||
vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0
|
||||
vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3
|
||||
vmovupd %ymm5, (%r12,%rax)
|
||||
vmovupd %ymm8, 32(%r12,%rax)
|
||||
vmovupd %ymm9, 64(%r12,%rax)
|
||||
vmovupd %ymm11, 96(%r12,%rax)
|
||||
vmovupd %ymm13, 128(%r12,%rax)
|
||||
vmovupd %ymm15, 160(%r12,%rax)
|
||||
vmovupd %ymm0, 192(%r12,%rax)
|
||||
vmovupd %ymm3, 224(%r12,%rax)
|
||||
addq $256, %rax
|
||||
cmpq %rax, %r8
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
21
examples/striad/striad.s.csx.icc.s
Normal file
21
examples/striad/striad.s.csx.icc.s
Normal file
@@ -0,0 +1,21 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.41: # Preds ..B1.41 ..B1.40
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%rcx,%rax,8), %zmm2 #80.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm4 #80.5
|
||||
vmovups (%r14,%rax,8), %zmm1 #80.5
|
||||
vmovups 64(%r14,%rax,8), %zmm3 #80.5
|
||||
vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5
|
||||
vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5
|
||||
vmovupd %zmm2, (%r13,%rax,8) #80.5
|
||||
vmovupd %zmm4, 64(%r13,%rax,8) #80.5
|
||||
addq $16, %rax #80.5
|
||||
cmpq %r12, %rax #80.5
|
||||
jb ..B1.41 # Prob 82% #80.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
112
examples/striad/striad.s.tx2.clang.s
Normal file
112
examples/striad/striad.s.tx2.clang.s
Normal file
@@ -0,0 +1,112 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
ldp q16, q17, [x11, #-256]
|
||||
ldp q18, q19, [x11, #-224]
|
||||
fmla v0.2d, v16.2d, v4.2d
|
||||
fmla v1.2d, v17.2d, v5.2d
|
||||
stp q1, q0, [sp, #96] // 32-byte Folded Spill
|
||||
fmla v2.2d, v18.2d, v6.2d
|
||||
fmla v3.2d, v19.2d, v7.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
ldp q20, q21, [x11, #-192]
|
||||
ldp q22, q23, [x11, #-160]
|
||||
fmla v4.2d, v20.2d, v16.2d
|
||||
stp q3, q4, [x12, #-208]
|
||||
fmla v5.2d, v21.2d, v17.2d
|
||||
fmla v6.2d, v22.2d, v18.2d
|
||||
stp q5, q6, [x12, #-176]
|
||||
fmla v7.2d, v23.2d, v19.2d
|
||||
ldp q16, q18, [x9, #-128]
|
||||
ldp q17, q19, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
ldp q24, q25, [x11, #-128]
|
||||
ldp q26, q27, [x11, #-96]
|
||||
fmla v16.2d, v24.2d, v20.2d
|
||||
stp q7, q16, [x12, #-144]
|
||||
fmla v18.2d, v25.2d, v21.2d
|
||||
fmla v17.2d, v26.2d, v22.2d
|
||||
stp q18, q17, [x12, #-112]
|
||||
fmla v19.2d, v27.2d, v23.2d
|
||||
ldp q22, q23, [x9, #-64]
|
||||
ldp q20, q21, [x9, #-32]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
ldp q28, q29, [x11, #-64]
|
||||
ldp q30, q31, [x11, #-32]
|
||||
fmla v22.2d, v28.2d, v24.2d
|
||||
stp q19, q22, [x12, #-80]
|
||||
fmla v23.2d, v29.2d, v25.2d
|
||||
fmla v20.2d, v30.2d, v26.2d
|
||||
stp q23, q20, [x12, #-48]
|
||||
fmla v21.2d, v31.2d, v27.2d
|
||||
stur q21, [x12, #-16]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q28, q29, [x10]
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldp q8, q10, [x11]
|
||||
ldp q11, q12, [x11, #32]
|
||||
fmla v24.2d, v8.2d, v28.2d
|
||||
fmla v25.2d, v10.2d, v29.2d
|
||||
stp q24, q25, [x12]
|
||||
fmla v26.2d, v11.2d, v30.2d
|
||||
fmla v27.2d, v12.2d, v31.2d
|
||||
stp q26, q27, [x12, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
ldp q13, q14, [x11, #64]
|
||||
ldp q15, q9, [x11, #96]
|
||||
fmla v28.2d, v13.2d, v8.2d
|
||||
fmla v29.2d, v14.2d, v10.2d
|
||||
stp q28, q29, [x12, #64]
|
||||
fmla v30.2d, v15.2d, v11.2d
|
||||
fmla v31.2d, v9.2d, v12.2d
|
||||
stp q30, q31, [x12, #96]
|
||||
ldp q8, q9, [x9, #128]
|
||||
ldp q12, q13, [x10, #128]
|
||||
ldp q14, q15, [x11, #128]
|
||||
ldp q10, q11, [x9, #160]
|
||||
fmla v8.2d, v14.2d, v12.2d
|
||||
ldp q12, q14, [x10, #160]
|
||||
fmla v9.2d, v15.2d, v13.2d
|
||||
stp q8, q9, [x12, #128]
|
||||
ldp q13, q15, [x11, #160]
|
||||
fmla v10.2d, v13.2d, v12.2d
|
||||
fmla v11.2d, v15.2d, v14.2d
|
||||
stp q10, q11, [x12, #160]
|
||||
ldp q12, q13, [x9, #192]
|
||||
ldp q14, q15, [x10, #192]
|
||||
ldp q0, q1, [x11, #192]
|
||||
fmla v12.2d, v0.2d, v14.2d
|
||||
ldr q0, [sp, #112] // 16-byte Folded Reload
|
||||
stur q0, [x12, #-256]
|
||||
ldr q0, [sp, #96] // 16-byte Folded Reload
|
||||
stp q0, q2, [x12, #-240]
|
||||
ldp q0, q2, [x9, #224]
|
||||
ldp q3, q4, [x10, #224]
|
||||
ldp q5, q6, [x11, #224]
|
||||
fmla v13.2d, v1.2d, v15.2d
|
||||
stp q12, q13, [x12, #192]
|
||||
fmla v0.2d, v5.2d, v3.2d
|
||||
fmla v2.2d, v6.2d, v4.2d
|
||||
stp q0, q2, [x12, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x12, x12, #512 // =512
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x13, x13, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
53
examples/striad/striad.s.tx2.gcc.s
Normal file
53
examples/striad/striad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,53 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x12, x11, 16
|
||||
ldr q29, [x22, x11]
|
||||
ldr q30, [x20, x11]
|
||||
add x7, x11, 32
|
||||
ldr q31, [x21, x11]
|
||||
ldr q7, [x22, x12]
|
||||
add x6, x11, 48
|
||||
add x5, x11, 64
|
||||
ldr q6, [x20, x12]
|
||||
ldr q2, [x21, x12]
|
||||
add x8, x11, 80
|
||||
add x0, x11, 96
|
||||
ldr q9, [x22, x7]
|
||||
ldr q5, [x20, x7]
|
||||
add x13, x11, 112
|
||||
ldr q1, [x21, x7]
|
||||
ldr q16, [x22, x6]
|
||||
ldr q4, [x20, x6]
|
||||
ldr q0, [x21, x6]
|
||||
fmla v30.2d, v29.2d, v31.2d
|
||||
ldr q23, [x22, x5]
|
||||
ldr q3, [x20, x5]
|
||||
fmla v6.2d, v7.2d, v2.2d
|
||||
ldr q22, [x21, x5]
|
||||
ldr q21, [x22, x8]
|
||||
ldr q24, [x20, x8]
|
||||
ldr q20, [x21, x8]
|
||||
fmla v5.2d, v9.2d, v1.2d
|
||||
ldr q19, [x22, x0]
|
||||
ldr q25, [x20, x0]
|
||||
fmla v4.2d, v16.2d, v0.2d
|
||||
ldr q18, [x21, x0]
|
||||
ldr q17, [x22, x13]
|
||||
ldr q26, [x20, x13]
|
||||
ldr q27, [x21, x13]
|
||||
fmla v3.2d, v23.2d, v22.2d
|
||||
fmla v24.2d, v21.2d, v20.2d
|
||||
str q30, [x19, x11]
|
||||
add x11, x11, 128
|
||||
str q6, [x19, x12]
|
||||
fmla v25.2d, v19.2d, v18.2d
|
||||
str q5, [x19, x7]
|
||||
fmla v26.2d, v17.2d, v27.2d
|
||||
str q4, [x19, x6]
|
||||
str q3, [x19, x5]
|
||||
str q24, [x19, x8]
|
||||
str q25, [x19, x0]
|
||||
str q26, [x19, x13]
|
||||
cmp x25, x11
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/striad/striad.s.zen.gcc.s
Normal file
38
examples/striad/striad.s.zen.gcc.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups (%r14,%rax), %xmm0
|
||||
vmovups (%r12,%rax), %xmm5
|
||||
vmovups 16(%r14,%rax), %xmm3
|
||||
vmovups 16(%r12,%rax), %xmm6
|
||||
vmovups 32(%r14,%rax), %xmm4
|
||||
vmovups 32(%r12,%rax), %xmm7
|
||||
vmovups 48(%r14,%rax), %xmm8
|
||||
vmovups 48(%r12,%rax), %xmm9
|
||||
vmovups 64(%r14,%rax), %xmm10
|
||||
vmovups 64(%r12,%rax), %xmm11
|
||||
vmovups 80(%r14,%rax), %xmm12
|
||||
vmovups 80(%r12,%rax), %xmm13
|
||||
vmovups 96(%r14,%rax), %xmm14
|
||||
vmovups 96(%r12,%rax), %xmm15
|
||||
vmovups 112(%r14,%rax), %xmm2
|
||||
vmovups 112(%r12,%rax), %xmm1
|
||||
vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0
|
||||
vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3
|
||||
vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4
|
||||
vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8
|
||||
vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10
|
||||
vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12
|
||||
vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14
|
||||
vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2
|
||||
vmovups %xmm0, 0(%rbp,%rax)
|
||||
vmovups %xmm3, 16(%rbp,%rax)
|
||||
vmovups %xmm4, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm2, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rcx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
@@ -0,0 +1,46 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
# LLVM-MCA-BEGIN
|
||||
.L19:
|
||||
vmovupd (%rcx), %ymm4
|
||||
vmovupd 32(%rcx), %ymm13
|
||||
vaddsd %xmm4, %xmm0, %xmm6
|
||||
vunpckhpd %xmm4, %xmm4, %xmm3
|
||||
vextractf64x2 $0x1, %ymm4, %xmm8
|
||||
vaddsd %xmm6, %xmm3, %xmm7
|
||||
vunpckhpd %xmm8, %xmm8, %xmm11
|
||||
vunpckhpd %xmm13, %xmm13, %xmm1
|
||||
vaddsd %xmm7, %xmm8, %xmm10
|
||||
vextractf64x2 $0x1, %ymm13, %xmm2
|
||||
vunpckhpd %xmm2, %xmm2, %xmm3
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmovupd 64(%rcx), %ymm8
|
||||
vmovupd 96(%rcx), %ymm5
|
||||
vaddsd %xmm13, %xmm12, %xmm0
|
||||
vunpckhpd %xmm8, %xmm8, %xmm12
|
||||
vextractf64x2 $0x1, %ymm8, %xmm14
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vunpckhpd %xmm14, %xmm14, %xmm0
|
||||
vextractf64x2 $0x1, %ymm5, %xmm9
|
||||
vaddsd %xmm4, %xmm2, %xmm6
|
||||
subq $-128, %rcx
|
||||
vaddsd %xmm3, %xmm6, %xmm7
|
||||
vaddsd %xmm8, %xmm7, %xmm11
|
||||
vunpckhpd %xmm5, %xmm5, %xmm7
|
||||
vaddsd %xmm11, %xmm12, %xmm13
|
||||
vunpckhpd %xmm9, %xmm9, %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm1
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm3
|
||||
vaddsd %xmm3, %xmm7, %xmm8
|
||||
vaddsd %xmm8, %xmm9, %xmm11
|
||||
vaddsd %xmm12, %xmm11, %xmm0
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
@@ -0,0 +1,20 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vaddpd (%rcx), %ymm3, %ymm4
|
||||
addq $256, %rcx
|
||||
vaddpd -224(%rcx), %ymm4, %ymm5
|
||||
vaddpd -192(%rcx), %ymm5, %ymm6
|
||||
vaddpd -160(%rcx), %ymm6, %ymm8
|
||||
vaddpd -128(%rcx), %ymm8, %ymm9
|
||||
vaddpd -96(%rcx), %ymm9, %ymm10
|
||||
vaddpd -64(%rcx), %ymm10, %ymm11
|
||||
vaddpd -32(%rcx), %ymm11, %ymm3
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5
|
||||
vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5
|
||||
vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5
|
||||
addq $32, %rax #76.5
|
||||
cmpq %r14, %rax #76.5
|
||||
jb ..B1.38 # Prob 82% #76.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
@@ -0,0 +1,57 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q4, q5, [x9, #-256]
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v1.2d, v5.2d, v1.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-128]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q6, q7, [x9, #-224]
|
||||
fadd v2.2d, v6.2d, v2.2d
|
||||
fadd v3.2d, v7.2d, v3.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q18, q19, [x9, #-96]
|
||||
ldp q16, q17, [x9]
|
||||
add x8, x8, #64 // =64
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
ldp q6, q7, [x9, #-32]
|
||||
ldp q18, q19, [x9, #32]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
ldp q4, q5, [x9, #-64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #128]
|
||||
fadd v0.2d, v0.2d, v16.2d
|
||||
fadd v1.2d, v1.2d, v17.2d
|
||||
ldp q16, q17, [x9, #192]
|
||||
ldp q18, q19, [x9, #96]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q18, q19, [x9, #160]
|
||||
fadd v2.2d, v2.2d, v18.2d
|
||||
fadd v3.2d, v3.2d, v19.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q18, q19, [x9, #224]
|
||||
add x9, x9, #512 // =512
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
adds x10, x10, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
@@ -0,0 +1,47 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
ldr q4, [x17], 16
|
||||
ldr q5, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q3, [x16, -80]
|
||||
ldr q2, [x16, -64]
|
||||
ldr q0, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q7, [x16, -16]
|
||||
dup d16, v4.d[0]
|
||||
dup d6, v4.d[1]
|
||||
ldr q4, [x17, 16]
|
||||
dup d22, v5.d[0]
|
||||
dup d5, v5.d[1]
|
||||
dup d20, v3.d[0]
|
||||
dup d3, v3.d[1]
|
||||
dup d19, v2.d[0]
|
||||
dup d2, v2.d[1]
|
||||
dup d21, v4.d[0]
|
||||
dup d4, v4.d[1]
|
||||
fadd d10, d8, d16
|
||||
dup d18, v0.d[0]
|
||||
dup d0, v0.d[1]
|
||||
dup d8, v1.d[0]
|
||||
dup d1, v1.d[1]
|
||||
dup d17, v7.d[0]
|
||||
dup d7, v7.d[1]
|
||||
fadd d23, d6, d10
|
||||
fadd d24, d23, d22
|
||||
fadd d25, d5, d24
|
||||
fadd d26, d25, d21
|
||||
fadd d27, d4, d26
|
||||
fadd d28, d27, d20
|
||||
fadd d29, d3, d28
|
||||
fadd d30, d29, d19
|
||||
fadd d31, d2, d30
|
||||
fadd d16, d31, d18
|
||||
fadd d6, d0, d16
|
||||
fadd d22, d6, d8
|
||||
fadd d5, d1, d22
|
||||
fadd d20, d5, d17
|
||||
fadd d8, d7, d20
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
@@ -0,0 +1,23 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
ldr q10, [x17], 16
|
||||
ldr q16, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q17, [x16, -80]
|
||||
ldr q18, [x16, -64]
|
||||
ldr q19, [x16, -48]
|
||||
ldr q20, [x16, -32]
|
||||
ldr q21, [x16, -16]
|
||||
fadd v22.2d, v1.2d, v10.2d
|
||||
ldr q23, [x17, 16]
|
||||
fadd v24.2d, v22.2d, v16.2d
|
||||
fadd v25.2d, v24.2d, v23.2d
|
||||
fadd v26.2d, v25.2d, v17.2d
|
||||
fadd v27.2d, v26.2d, v18.2d
|
||||
fadd v28.2d, v27.2d, v19.2d
|
||||
fadd v29.2d, v28.2d, v20.2d
|
||||
fadd v1.2d, v29.2d, v21.2d
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovsd (%r10), %xmm8
|
||||
vmovsd 8(%r10), %xmm10
|
||||
subq $-128, %r10
|
||||
vmovsd -112(%r10), %xmm12
|
||||
vmovsd -104(%r10), %xmm14
|
||||
vmovsd -96(%r10), %xmm1
|
||||
vmovsd -88(%r10), %xmm2
|
||||
vmovsd -80(%r10), %xmm3
|
||||
vmovsd -72(%r10), %xmm6
|
||||
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vmovsd -64(%r10), %xmm8
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vmovsd -56(%r10), %xmm10
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vmovsd -48(%r10), %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vmovsd -40(%r10), %xmm14
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vmovsd -32(%r10), %xmm1
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vmovsd -24(%r10), %xmm2
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
vmovsd -16(%r10), %xmm3
|
||||
vaddsd %xmm5, %xmm6, %xmm7
|
||||
vmovsd -8(%r10), %xmm6
|
||||
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
vaddsd %xmm5, %xmm6, %xmm7
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
@@ -0,0 +1,14 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vaddpd (%r10), %xmm3, %xmm1
|
||||
subq $-128, %r10
|
||||
vaddpd -112(%r10), %xmm1, %xmm4
|
||||
vaddpd -96(%r10), %xmm4, %xmm5
|
||||
vaddpd -80(%r10), %xmm5, %xmm6
|
||||
vaddpd -64(%r10), %xmm6, %xmm8
|
||||
vaddpd -48(%r10), %xmm8, %xmm9
|
||||
vaddpd -32(%r10), %xmm9, %xmm10
|
||||
vaddpd -16(%r10), %xmm10, %xmm3
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,199 +0,0 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
|
||||
# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
|
||||
.file "taxCalc.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
# -- Begin main
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl main
|
||||
# --- main(void)
|
||||
main:
|
||||
..B1.1: # Preds ..B1.0
|
||||
.cfi_startproc
|
||||
..___tag_value_main.1:
|
||||
..L2:
|
||||
#4.15
|
||||
pushq %rbp #4.15
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #4.15
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-128, %rsp #4.15
|
||||
subq $4096, %rsp #4.15
|
||||
movl $104446, %esi #4.15
|
||||
movl $3, %edi #4.15
|
||||
call __intel_new_feature_proc_init #4.15
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.10: # Preds ..B1.1
|
||||
vstmxcsr (%rsp) #4.15
|
||||
movl $.2.3_2_kmpc_loc_struct_pack.3, %edi #4.15
|
||||
xorl %esi, %esi #4.15
|
||||
orl $32832, (%rsp) #4.15
|
||||
xorl %eax, %eax #4.15
|
||||
vldmxcsr (%rsp) #4.15
|
||||
..___tag_value_main.6:
|
||||
call __kmpc_begin #4.15
|
||||
..___tag_value_main.7:
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.2: # Preds ..B1.10
|
||||
movl $il0_peep_printf_format_0, %edi #5.5
|
||||
call puts #5.5
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.3: # Preds ..B1.2
|
||||
vmovss .L_2il0floatpacket.0(%rip), %xmm0 #8.15
|
||||
xorl %eax, %eax #11.5
|
||||
vxorps %xmm1, %xmm1, %xmm1 #9.5
|
||||
vmovss %xmm1, (%rsp) #9.5
|
||||
|
||||
movl $111,%ebx #IACA START
|
||||
.byte 100,103,144 #IACA START
|
||||
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
|
||||
|
||||
..B1.4: # Preds ..B1.4 ..B1.3
|
||||
lea 1(%rax,%rax), %edx #12.9
|
||||
vcvtsi2ss %edx, %xmm2, %xmm2 #12.27
|
||||
vmulss %xmm2, %xmm0, %xmm3 #12.29
|
||||
lea 2(%rax,%rax), %ecx #12.9
|
||||
vaddss %xmm3, %xmm1, %xmm4 #12.29
|
||||
vxorps %xmm1, %xmm1, %xmm1 #12.27
|
||||
vcvtsi2ss %ecx, %xmm1, %xmm1 #12.27
|
||||
vmulss %xmm1, %xmm0, %xmm5 #12.29
|
||||
vmovss %xmm4, 4(%rsp,%rax,8) #12.9
|
||||
vaddss %xmm5, %xmm4, %xmm1 #12.29
|
||||
vmovss %xmm1, 8(%rsp,%rax,8) #12.9
|
||||
incq %rax #11.5
|
||||
cmpq $499, %rax #11.5
|
||||
jb ..B1.4 # Prob 99% #11.5
|
||||
|
||||
movl $222,%ebx #IACA END
|
||||
.byte 100,103,144 #IACA END
|
||||
|
||||
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
|
||||
..B1.5: # Preds ..B1.4
|
||||
vmovss 3992(%rsp), %xmm0 #12.18
|
||||
movl $il0_peep_printf_format_1, %edi #15.5
|
||||
vaddss .L_2il0floatpacket.1(%rip), %xmm0, %xmm1 #12.29
|
||||
vmovss %xmm1, 3996(%rsp) #12.9
|
||||
call puts #15.5
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.6: # Preds ..B1.5
|
||||
movl $.2.3_2_kmpc_loc_struct_pack.14, %edi #16.12
|
||||
xorl %eax, %eax #16.12
|
||||
..___tag_value_main.8:
|
||||
call __kmpc_end #16.12
|
||||
..___tag_value_main.9:
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.7: # Preds ..B1.6
|
||||
xorl %eax, %eax #16.12
|
||||
movq %rbp, %rsp #16.12
|
||||
popq %rbp #16.12
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #16.12
|
||||
.align 16,0x90
|
||||
.cfi_endproc
|
||||
# LOE
|
||||
# mark_end;
|
||||
.type main,@function
|
||||
.size main,.-main
|
||||
.data
|
||||
.align 4
|
||||
.align 4
|
||||
.2.3_2_kmpc_loc_struct_pack.3:
|
||||
.long 0
|
||||
.long 2
|
||||
.long 0
|
||||
.long 0
|
||||
.quad .2.3_2__kmpc_loc_pack.2
|
||||
.align 4
|
||||
.2.3_2__kmpc_loc_pack.2:
|
||||
.byte 59
|
||||
.byte 117
|
||||
.byte 110
|
||||
.byte 107
|
||||
.byte 110
|
||||
.byte 111
|
||||
.byte 119
|
||||
.byte 110
|
||||
.byte 59
|
||||
.byte 109
|
||||
.byte 97
|
||||
.byte 105
|
||||
.byte 110
|
||||
.byte 59
|
||||
.byte 52
|
||||
.byte 59
|
||||
.byte 52
|
||||
.byte 59
|
||||
.byte 59
|
||||
.space 1, 0x00 # pad
|
||||
.align 4
|
||||
.2.3_2_kmpc_loc_struct_pack.14:
|
||||
.long 0
|
||||
.long 2
|
||||
.long 0
|
||||
.long 0
|
||||
.quad .2.3_2__kmpc_loc_pack.13
|
||||
.align 4
|
||||
.2.3_2__kmpc_loc_pack.13:
|
||||
.byte 59
|
||||
.byte 117
|
||||
.byte 110
|
||||
.byte 107
|
||||
.byte 110
|
||||
.byte 111
|
||||
.byte 119
|
||||
.byte 110
|
||||
.byte 59
|
||||
.byte 109
|
||||
.byte 97
|
||||
.byte 105
|
||||
.byte 110
|
||||
.byte 59
|
||||
.byte 49
|
||||
.byte 54
|
||||
.byte 59
|
||||
.byte 49
|
||||
.byte 54
|
||||
.byte 59
|
||||
.byte 59
|
||||
.section .rodata.str1.4, "aMS",@progbits,1
|
||||
.align 4
|
||||
.align 4
|
||||
il0_peep_printf_format_0:
|
||||
.long 1128354639
|
||||
.long 1702109249
|
||||
.long 1931506803
|
||||
.long 1953653108
|
||||
.byte 0
|
||||
.space 3, 0x00 # pad
|
||||
.align 4
|
||||
il0_peep_printf_format_1:
|
||||
.long 1128354639
|
||||
.long 1702109249
|
||||
.long 1696625779
|
||||
.word 25710
|
||||
.byte 0
|
||||
.data
|
||||
# -- End main
|
||||
.section .rodata, "a"
|
||||
.align 4
|
||||
.align 4
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x3e428f5c
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,4
|
||||
.align 4
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x433dcf5c
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,4
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
// -- Begin DWARF2 SEGMENT .eh_frame
|
||||
.section .eh_frame,"a",@progbits
|
||||
.eh_frame_seg:
|
||||
.align 8
|
||||
# End
|
||||
@@ -1,18 +0,0 @@
|
||||
#include <stdio.h>
|
||||
//#include "iacaMarks.h"
|
||||
|
||||
int main(void){
|
||||
printf("OSACA test start\n");
|
||||
int i = 1;
|
||||
float arr[1000];
|
||||
float tax = 0.19;
|
||||
arr[0] = 0;
|
||||
//STARTLOOP
|
||||
while(i < 1000){
|
||||
arr[i] = arr[i-1]+i*tax;
|
||||
i += 1;
|
||||
}
|
||||
|
||||
printf("OSACA test end\n");
|
||||
return 0;
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
|
||||
void triad(int N){
|
||||
void dummy(double*);
|
||||
double a[N], b[N], c[N], d[N];
|
||||
double s;
|
||||
|
||||
//STARTLOOP
|
||||
for(int i=0; i<N; ++i)
|
||||
a[i] = b[i] + c[i] * d[i];
|
||||
|
||||
dummy(&a[1]);
|
||||
}
|
||||
36
examples/triad/triad.s.csx.gcc.s
Normal file
36
examples/triad/triad.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r14,%rsi), %ymm14
|
||||
vmovupd 32(%r14,%rsi), %ymm15
|
||||
vmovupd 64(%r14,%rsi), %ymm1
|
||||
vmovupd 96(%r14,%rsi), %ymm0
|
||||
vmovupd 128(%r14,%rsi), %ymm3
|
||||
vmovupd 160(%r14,%rsi), %ymm4
|
||||
vmovupd 192(%r14,%rsi), %ymm5
|
||||
vmovupd 224(%r14,%rsi), %ymm7
|
||||
vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7
|
||||
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
addq $256, %rsi
|
||||
cmpq %rsi, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/triad/triad.s.csx.icc.s
Normal file
16
examples/triad/triad.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r13,%rax,8), %zmm1 #78.5
|
||||
vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5
|
||||
vmovupd %zmm1, (%r14,%rax,8) #78.5
|
||||
addq $8, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
118
examples/triad/triad.s.tx2.clang.s
Normal file
118
examples/triad/triad.s.tx2.clang.s
Normal file
@@ -0,0 +1,118 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q2, q3, [x9, #-256]
|
||||
ldp q0, q1, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v2.2d, v4.2d, v16.2d
|
||||
fmla v3.2d, v5.2d, v16.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fmla v0.2d, v6.2d, v16.2d
|
||||
fmla v1.2d, v7.2d, v16.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
ldp q6, q7, [x9, #-192]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v6.2d, v16.2d, v20.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q4, q5, [x9, #-160]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v7.2d, v17.2d, v16.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v4.2d, v18.2d, v16.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v5.2d, v19.2d, v16.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
ldp q17, q19, [x9, #-128]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v17.2d, v20.2d, v24.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q16, q18, [x9, #-96]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v19.2d, v21.2d, v20.2d
|
||||
stp q17, q19, [x11, #-128]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v16.2d, v22.2d, v20.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v18.2d, v23.2d, v20.2d
|
||||
stp q16, q18, [x11, #-96]
|
||||
ldp q20, q22, [x9, #-64]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v20.2d, v24.2d, v28.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q21, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v22.2d, v25.2d, v24.2d
|
||||
stp q20, q22, [x11, #-64]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v21.2d, v26.2d, v24.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q28, q29, [x10]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v23.2d, v27.2d, v24.2d
|
||||
stp q21, q23, [x11, #-32]
|
||||
ldp q24, q25, [x9]
|
||||
fmla v24.2d, v28.2d, v8.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fmla v25.2d, v29.2d, v28.2d
|
||||
stp q24, q25, [x11]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v26.2d, v30.2d, v28.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q13, q14, [x10, #128]
|
||||
ldr q2, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q1, q3, [x10, #192]
|
||||
fmla v27.2d, v31.2d, v28.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
fmla v28.2d, v8.2d, v9.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q6, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q5, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v29.2d, v10.2d, v8.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v30.2d, v11.2d, v8.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
add x8, x8, #64 // =64
|
||||
fmla v31.2d, v12.2d, v8.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q8, q10, [x9, #128]
|
||||
fmla v8.2d, v13.2d, v9.2d
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q11, q12, [x9, #160]
|
||||
fmla v10.2d, v14.2d, v9.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fmla v12.2d, v14.2d, v2.2d
|
||||
ldp q2, q0, [x9, #192]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v2.2d, v1.2d, v6.2d
|
||||
ldp q1, q4, [x9, #224]
|
||||
fmla v0.2d, v3.2d, v5.2d
|
||||
stp q2, q0, [x11, #192]
|
||||
ldp q3, q5, [x10, #224]
|
||||
fmla v11.2d, v13.2d, v9.2d
|
||||
stp q11, q12, [x11, #160]
|
||||
fmla v1.2d, v3.2d, v16.2d
|
||||
fmla v4.2d, v5.2d, v16.2d
|
||||
stp q1, q4, [x11, #224]
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/triad/triad.s.tx2.gcc.s
Normal file
45
examples/triad/triad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x0, x10, 16
|
||||
ldr q23, [x20, x10]
|
||||
ldr q24, [x21, x10]
|
||||
add x7, x10, 32
|
||||
ldr q25, [x20, x0]
|
||||
ldr q26, [x21, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q27, [x20, x7]
|
||||
ldr q28, [x21, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q29, [x20, x6]
|
||||
ldr q30, [x21, x6]
|
||||
add x2, x10, 112
|
||||
fmla v23.2d, v3.2d, v24.2d
|
||||
ldr q31, [x20, x5]
|
||||
ldr q4, [x21, x5]
|
||||
fmla v25.2d, v3.2d, v26.2d
|
||||
ldr q2, [x20, x4]
|
||||
ldr q5, [x21, x4]
|
||||
fmla v27.2d, v3.2d, v28.2d
|
||||
ldr q1, [x20, x11]
|
||||
ldr q6, [x21, x11]
|
||||
fmla v29.2d, v3.2d, v30.2d
|
||||
ldr q0, [x20, x2]
|
||||
ldr q7, [x21, x2]
|
||||
fmla v31.2d, v3.2d, v4.2d
|
||||
fmla v2.2d, v3.2d, v5.2d
|
||||
fmla v1.2d, v3.2d, v6.2d
|
||||
str q23, [x19, x10]
|
||||
add x10, x10, 128
|
||||
fmla v0.2d, v3.2d, v7.2d
|
||||
str q25, [x19, x0]
|
||||
str q27, [x19, x7]
|
||||
str q29, [x19, x6]
|
||||
str q31, [x19, x5]
|
||||
str q2, [x19, x4]
|
||||
str q1, [x19, x11]
|
||||
str q0, [x19, x2]
|
||||
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/triad/triad.s.zen.gcc.s
Normal file
30
examples/triad/triad.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%r13,%rax), %xmm12
|
||||
vmovups 16(%r13,%rax), %xmm13
|
||||
vmovups 32(%r13,%rax), %xmm14
|
||||
vmovups 48(%r13,%rax), %xmm15
|
||||
vmovups 64(%r13,%rax), %xmm1
|
||||
vmovups 80(%r13,%rax), %xmm0
|
||||
vmovups 96(%r13,%rax), %xmm4
|
||||
vmovups 112(%r13,%rax), %xmm5
|
||||
vfmadd213pd (%r12,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5
|
||||
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/update/update.s.csx.gcc.s
Normal file
28
examples/update/update.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmulpd (%rcx), %ymm3, %ymm12
|
||||
vmulpd 32(%rcx), %ymm3, %ymm13
|
||||
vmulpd 64(%rcx), %ymm3, %ymm14
|
||||
vmulpd 96(%rcx), %ymm3, %ymm15
|
||||
vmulpd 128(%rcx), %ymm3, %ymm0
|
||||
vmulpd 160(%rcx), %ymm3, %ymm1
|
||||
vmulpd 192(%rcx), %ymm3, %ymm7
|
||||
vmulpd 224(%rcx), %ymm3, %ymm4
|
||||
vmovupd %ymm12, (%rcx)
|
||||
vmovupd %ymm13, 32(%rcx)
|
||||
vmovupd %ymm14, 64(%rcx)
|
||||
vmovupd %ymm15, 96(%rcx)
|
||||
vmovupd %ymm0, 128(%rcx)
|
||||
vmovupd %ymm1, 160(%rcx)
|
||||
vmovupd %ymm7, 192(%rcx)
|
||||
vmovupd %ymm4, 224(%rcx)
|
||||
addq $256, %rcx
|
||||
cmpq %r15, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/update/update.s.csx.icc.s
Normal file
17
examples/update/update.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5
|
||||
vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #75.5
|
||||
vmovupd %zmm2, 64(%r13,%rax,8) #75.5
|
||||
addq $16, %rax #75.5
|
||||
cmpq %r14, %rax #75.5
|
||||
jb ..B1.38 # Prob 82% #75.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/update/update.s.tx2.clang.s
Normal file
15
examples/update/update.s.tx2.clang.s
Normal file
@@ -0,0 +1,15 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_32: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x8]
|
||||
ldp q2, q3, [x8, #-32]
|
||||
fmul v2.2d, v2.2d, v26.2d
|
||||
fmul v3.2d, v3.2d, v26.2d
|
||||
stp q2, q3, [x8, #-32]
|
||||
fmul v0.2d, v0.2d, v26.2d
|
||||
fmul v1.2d, v1.2d, v26.2d
|
||||
stp q0, q1, [x8], #64
|
||||
adds x9, x9, #1 // =1
|
||||
b.ne .LBB1_32
|
||||
// OSACA-END
|
||||
31
examples/update/update.s.tx2.gcc.s
Normal file
31
examples/update/update.s.tx2.gcc.s
Normal file
@@ -0,0 +1,31 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
ldr q23, [x16]
|
||||
mov x17, x16
|
||||
add x16, x16, 128
|
||||
fmul v24.2d, v23.2d, v2.2d
|
||||
str q24, [x17], 16
|
||||
ldr q25, [x16, -112]
|
||||
fmul v26.2d, v25.2d, v2.2d
|
||||
str q26, [x16, -112]
|
||||
ldr q27, [x17, 16]
|
||||
fmul v28.2d, v27.2d, v2.2d
|
||||
str q28, [x17, 16]
|
||||
ldr q29, [x16, -80]
|
||||
ldr q30, [x16, -64]
|
||||
ldr q31, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q0, [x16, -16]
|
||||
fmul v5.2d, v29.2d, v2.2d
|
||||
fmul v4.2d, v30.2d, v2.2d
|
||||
fmul v3.2d, v31.2d, v2.2d
|
||||
fmul v6.2d, v1.2d, v2.2d
|
||||
fmul v7.2d, v0.2d, v2.2d
|
||||
str q5, [x16, -80]
|
||||
str q4, [x16, -64]
|
||||
str q3, [x16, -48]
|
||||
str q6, [x16, -32]
|
||||
str q7, [x16, -16]
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/update/update.s.zen.gcc.s
Normal file
22
examples/update/update.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmulpd (%r10), %xmm3, %xmm11
|
||||
subq $-128, %r10
|
||||
vmulpd -112(%r10), %xmm3, %xmm12
|
||||
vmulpd -96(%r10), %xmm3, %xmm13
|
||||
vmulpd -80(%r10), %xmm3, %xmm14
|
||||
vmulpd -64(%r10), %xmm3, %xmm15
|
||||
vmulpd -48(%r10), %xmm3, %xmm0
|
||||
vmovups %xmm11, -128(%r10)
|
||||
vmulpd -32(%r10), %xmm3, %xmm7
|
||||
vmovups %xmm12, -112(%r10)
|
||||
vmulpd -16(%r10), %xmm3, %xmm1
|
||||
vmovups %xmm13, -96(%r10)
|
||||
vmovups %xmm14, -80(%r10)
|
||||
vmovups %xmm15, -64(%r10)
|
||||
vmovups %xmm0, -48(%r10)
|
||||
vmovups %xmm7, -32(%r10)
|
||||
vmovups %xmm1, -16(%r10)
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
@@ -1,2 +1,11 @@
|
||||
"""Open Source Architecture Code Analyzer"""
|
||||
|
||||
name = "osaca"
|
||||
__version__ = '0.2.0'
|
||||
__version__ = "0.6.1"
|
||||
|
||||
# To trigger travis deployment to pypi, do the following:
|
||||
# 1. Increment __version__
|
||||
# 2. commit to RRZE-HPC/osaca's master branch
|
||||
# 3. wait for GitHub Actions to complete successfully (unless already tested)
|
||||
# 4. tag commit with 'v{}'.format(__version__) (`git tag vX.Y.Z`)
|
||||
# 5. push tag to github (`git push origin vX.Y.Z` or push all tags with `git push --tags`)
|
||||
|
||||
4
osaca/__main__.py
Normal file
4
osaca/__main__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
from .osaca import main
|
||||
|
||||
main()
|
||||
@@ -1,41 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from param import Register, MemAddr, Parameter
|
||||
from testcase import Testcase
|
||||
|
||||
# Choose out of various operands
|
||||
reg8 = Register('al')
|
||||
reg16 = Register('ax')
|
||||
reg32 = Register('eax')
|
||||
reg64 = Register('rax')
|
||||
xmm = Register('xmm0')
|
||||
ymm = Register('ymm0')
|
||||
zmm = Register('zmm0')
|
||||
mem0 = MemAddr('(%rax, %esi, 4)')
|
||||
imd1 = Parameter('IMD')
|
||||
|
||||
|
||||
# -----------------------------------------------
|
||||
# -USER INPUT------------------------------------
|
||||
# -----------------------------------------------
|
||||
# Enter your mnemonic
|
||||
mnemonic = 'add'
|
||||
|
||||
# Define your operands. If you don't need it, just type in None
|
||||
dst = mem0
|
||||
op1 = imd1
|
||||
op2 = None
|
||||
|
||||
# Define the number of instructions per loop (default: 12)
|
||||
per_loop = '32'
|
||||
|
||||
# -----------------------------------------------
|
||||
# -----------------------------------------------
|
||||
|
||||
# Start
|
||||
operands = [x for x in [dst, op1, op2] if x is not None]
|
||||
opListStr = ', '.join([str(x) for x in operands])
|
||||
print('Create Testcase for {} {}'.format(mnemonic, opListStr), end='')
|
||||
tc = Testcase(mnemonic, operands, per_loop)
|
||||
tc.write_testcase()
|
||||
print(' --------> SUCCEEDED')
|
||||
34
osaca/data/_build_cache.py
Executable file
34
osaca/data/_build_cache.py
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
from glob import glob
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
# Pre-build the machine-model caches by loading every architecture and ISA
# description that ships next to this script (osaca/data/).
#
# Make the package root importable independently of the current working
# directory: this script sits two directory levels below it. The previous
# "../.." entry was resolved against the CWD and silently pointed somewhere
# else whenever the script was started from another directory.
sys.path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))
)

try:
    from osaca.semantics.hw_model import MachineModel
except ModuleNotFoundError:
    # Best effort only: a missing dependency must not break the caller, so
    # report it and leave with a success status.
    print(
        "Unable to import MachineModel, probably some dependency is not yet installed. SKIPPING. "
        "First run of OSACA may take a while to build caches, subsequent runs will be as fast as "
        "ever."
    )
    sys.exit()

print("Building cache: ", end="")
sys.stdout.flush()

# (glob pattern relative to this directory, progress marker printed per file):
# architectures first, then ISAs.
for pattern, marker in (("*.yml", "."), ("isa/*.yml", "+")):
    for yaml_path in glob(os.path.join(os.path.dirname(__file__), pattern)):
        # Instantiating the model is what triggers loading of the YAML file;
        # the object itself is not needed here.
        MachineModel(path_to_yaml=yaml_path)
        print(marker, end="")
        sys.stdout.flush()

print()
|
||||
2617
osaca/data/a64fx.yml
Normal file
2617
osaca/data/a64fx.yml
Normal file
File diff suppressed because it is too large
Load Diff
4180
osaca/data/a72.yml
Normal file
4180
osaca/data/a72.yml
Normal file
File diff suppressed because it is too large
Load Diff
401
osaca/data/a72/mapping_pmevo.json
Normal file
401
osaca/data/a72/mapping_pmevo.json
Normal file
File diff suppressed because one or more lines are too long
808
osaca/data/a72/template.yml
Normal file
808
osaca/data/a72/template.yml
Normal file
@@ -0,0 +1,808 @@
|
||||
osaca_version: 0.3.11
|
||||
micro_architecture: Cortex A-72
|
||||
arch_code: a72
|
||||
isa: aarch64
|
||||
hidden_loads: false
|
||||
ports: ['0', '1', '2', '3', '4', '5', '6', '7']
|
||||
port_model_scheme: |
|
||||
+-------------------------------------------------------------------------------------+
|
||||
| scheduler |
|
||||
+-------------------------------------------------------------------------------------+
|
||||
0 |I 1 |L 2 |M 3 |S 4 |F1 5 |I 6 |F0 7 |B
|
||||
\/ \/ \/ \/ \/ \/ \/ \/
|
||||
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
|
||||
|INT ALU| | LOAD | | MUL | | STORE | | ASIMD | |INT ALU| | ASIMD | | Branch|
|
||||
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
|
||||
+-------+ +-------+ +-----------+ +-------+ +---------+
|
||||
| AGU | | DIV | | FP ALU | | AGU | |ASIMD MUL|
|
||||
+-------+ +-------+ +-----------+ +-------+ +---------+
|
||||
+-------+ +-----------+ +---------+
|
||||
| SHIFT | | FP MUL | | FP ALU |
|
||||
+-------+ +-----------+ +---------+
|
||||
+-------+ +-----------+ +---------+
|
||||
| CRC | | FP DIV | | FP MUL |
|
||||
+-------+ +-----------+ +---------+
|
||||
+-------+ +-----------+ +---------+
|
||||
| USAD | | FP SQRT | | FP DIV |
|
||||
+-------+ +-----------+ +---------+
|
||||
+-----------+ +---------+
|
||||
|ASIMD SHIFT| | FP CONV |
|
||||
+-----------+ +---------+
|
||||
+---------+
|
||||
| CRYPTO |
|
||||
+---------+
|
||||
# The port pressures do not always correctly match this schema, because most
|
||||
# instructions are imported from an experimentally determined mapping, which
|
||||
# is not always correct.
|
||||
load_latency: {x: 4.0, s: 5.0, d: 5.0, h: 6.0, q: 6.0}
|
||||
load_throughput: []
|
||||
load_throughput_default: [[1, '1']]
|
||||
store_throughput: []
|
||||
store_throughput_default: [[2, '3']]
|
||||
instruction_forms:
|
||||
|
||||
# Branch
|
||||
- name: b
|
||||
operands:
|
||||
- class: identifier
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '7']]
|
||||
throughput: 1.0
|
||||
- name: bne
|
||||
operands:
|
||||
- class: identifier
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '7']]
|
||||
throughput: 1.0
|
||||
- name: b.ne
|
||||
operands:
|
||||
- class: identifier
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '7']]
|
||||
throughput: 1.0
|
||||
- name: br
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '7']]
|
||||
throughput: 1.0
|
||||
- name: ret
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '7']]
|
||||
throughput: 1.0
|
||||
- name: bl
|
||||
operands:
|
||||
- class: identifier
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '05'], [1, '7']]
|
||||
throughput: 1.0
|
||||
- name: blr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '05'], [1, '7']]
|
||||
throughput: 1.0
|
||||
|
||||
# Load GPR
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Load FP d
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1'], [2, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1'], [2, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Load FP q
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 6.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 6.0
|
||||
port_pressure: [[1, '1'], [2, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 6.0
|
||||
port_pressure: [[1, '1'], [2, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Store GPR
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3']]
|
||||
throughput: 1.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Store FP d
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3'], [1, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Store FP q
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[2, '3']]
|
||||
throughput: 2.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[2, '3'], [1, '05']]
|
||||
throughput: 2.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 2.0
|
||||
port_pressure: [[2, '3'], [1, '05']]
|
||||
throughput: 2.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[2, '3'], [1, '05']]
|
||||
throughput: 2.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[2, '3'], [2, '05']]
|
||||
throughput: 2.0
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 4.0
|
||||
port_pressure: [[2, '3'], [2, '05']]
|
||||
throughput: 2.0
|
||||
|
||||
# Load unscaled GPR
|
||||
- name: ldur
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: '*'
|
||||
pre_indexed: '*'
|
||||
latency: 4.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
|
||||
# Load unscaled FP q
|
||||
- name: ldur
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: '*'
|
||||
pre_indexed: '*'
|
||||
latency: 5.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
|
||||
# Store unscaled GPR
|
||||
- name: stur
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: '*'
|
||||
pre_indexed: '*'
|
||||
latency: 1.0
|
||||
port_pressure: [[1, '3']]
|
||||
throughput: 1.0
|
||||
|
||||
# Store unscaled FP q
|
||||
- name: stur
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: '*'
|
||||
pre_indexed: '*'
|
||||
latency: 2.0
|
||||
port_pressure: [[2, '3']]
|
||||
throughput: 2.0
|
||||
|
||||
# Load pair GPR
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[1, '1']]
|
||||
throughput: 1.0
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 4.0
|
||||
port_pressure: [[1, '1'], [1, '05']]
|
||||
throughput: 1.0
|
||||
|
||||
# Load pair FP q
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 6.0
|
||||
port_pressure: [[2, '1']]
|
||||
throughput: 2.0
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 6.0
|
||||
port_pressure: [[2, '1'], [1, '05']]
|
||||
throughput: 2.0
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 6.0
|
||||
port_pressure: [[2, '1'], [1, '05']]
|
||||
throughput: 2.0
|
||||
|
||||
# Store pair GPR
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 2.0
|
||||
port_pressure: [[2, '3']]
|
||||
throughput: 2.0
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 2.0
|
||||
port_pressure: [[2, '3'], [1, '05']]
|
||||
throughput: 2.0
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 2.0
|
||||
port_pressure: [[2, '3'], [1, '05']]
|
||||
throughput: 2.0
|
||||
|
||||
# Store pair FP q
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[4, '3'], [1, '05']]
|
||||
throughput: 4.0
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: true
|
||||
pre_indexed: false
|
||||
latency: 4.0
|
||||
port_pressure: [[4, '3'], [1, '05']]
|
||||
throughput: 4.0
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post_indexed: false
|
||||
pre_indexed: true
|
||||
latency: 4.0
|
||||
port_pressure: [[4, '3'], [1, '05']]
|
||||
throughput: 4.0
|
||||
|
||||
# Fast-forward (measures 4 cycles, but can be 3)
|
||||
# Lower bound is used in order to ensure no over-estimates are possible.
|
||||
# Ports do not match documentation, but "fixing" requires also "fixing" almost
|
||||
# the entire rest of the model.
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: s
|
||||
- class: register
|
||||
prefix: s
|
||||
- class: register
|
||||
prefix: s
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '45']]
|
||||
throughput: 0.5
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '45']]
|
||||
throughput: 0.5
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '5']]
|
||||
throughput: 1.0
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '5']]
|
||||
throughput: 1.0
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: s
|
||||
- class: register
|
||||
prefix: s
|
||||
- class: register
|
||||
prefix: s
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '45']]
|
||||
throughput: 0.5
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '45']]
|
||||
throughput: 0.5
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '5']]
|
||||
throughput: 1.0
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
latency: 3.0
|
||||
port_pressure: [[1, '5']]
|
||||
throughput: 1.0
|
||||
|
||||
# Automatically generated instructions
|
||||
44150
osaca/data/bdw.yml
Normal file
44150
osaca/data/bdw.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,39 +0,0 @@
|
||||
instr,TP,LT,ports
|
||||
jmp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
je-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jne-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jna-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
ja-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jng-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jg-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jpe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jnp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jpo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jcxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jecxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
|
||||
|
||||
|
180
osaca/data/create_db_entry.py
Normal file
180
osaca/data/create_db_entry.py
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
from collections import defaultdict
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
class EntryBuilder:
    """Render instruction-form descriptions as YAML-formatted database entries.

    An entry lists the instruction name, its operands, the latency, the port
    pressure, the throughput derived from the port pressure and the uop count.
    """

    @staticmethod
    def compute_throughput(port_pressure):
        """Return the highest per-port occupancy implied by *port_pressure*.

        :param port_pressure: iterable of ``[uops, ports]`` pairs. ``ports`` is
            iterated element by element, so a string such as ``"45"`` stands for
            the ports ``"4"`` and ``"5"`` while a list holds complete port names.
        :returns: the maximum occupancy as ``float``; ``0.0`` if no port is used.
        """
        # Fractions keep the per-port sums exact until the final conversion.
        port_occupancy = defaultdict(Fraction)
        for uops, ports in port_pressure:
            if not ports:
                # Nothing to distribute the uops over (and avoids dividing by zero).
                continue
            share = Fraction(uops, len(ports))
            for port in ports:
                port_occupancy[port] += share
        # The explicit zero keeps the result defined when no port is occupied.
        return float(max([Fraction(0), *port_occupancy.values()]))

    @staticmethod
    def classify(operands_types):
        """Classify an operand list as load, store and/or vector instruction.

        Only the exact operand string ``"mem"`` is considered: in any position but
        the last it marks a load, in the last position it marks a store. The
        vector flag is set if one operand equals ``mm``, ``xmm``, ``ymm`` or ``zmm``.

        :param operands_types: sequence of operand type strings
        :returns: tuple ``(load, store, vec)`` of booleans
        :raises AssertionError: if the operands describe both a load and a store
        """
        load = "mem" in operands_types[:-1]
        store = "mem" in operands_types[-1:]
        vec = any(reg in operands_types for reg in ("mm", "xmm", "ymm", "zmm"))
        if load and store:
            # Raised explicitly (instead of ``assert``) so the check survives
            # ``python -O`` while callers still see the same exception type.
            raise AssertionError("Can not process a combined load-store instruction.")
        return load, store, vec

    @staticmethod
    def _describe_operand(operand_type):
        """Return the YAML lines describing a single operand type string."""
        if operand_type == "imd":
            return "  - class: immediate\n    imd: int\n"
        if operand_type.startswith("mem"):
            if operand_type == "mem_simple":
                index = "~"
            elif operand_type == "mem_complex":
                index = "gpr"
            else:
                index = '"*"'
            return (
                "  - class: memory\n"
                '    base: "*"\n'
                '    offset: "*"\n'
                "    index: {}\n"
                '    scale: "*"\n'
            ).format(index)
        if "{k}" in operand_type:
            # The "{k}" marker is removed from the register name and reported as a mask.
            return "  - class: register\n    name: {}\n    mask: True\n".format(
                operand_type.replace("{k}", "")
            )
        return "  - class: register\n    name: {}\n".format(operand_type)

    def build_description(
        self, instruction_name, operand_types, port_pressure=None, latency=0, comment=None
    ):
        """Return the text of one database entry.

        :param instruction_name: name written to the ``name`` key
        :param operand_types: operand type strings (``imd``, ``mem*`` or a register
            class, optionally containing ``{k}``)
        :param port_pressure: list of ``[uops, ports]`` pairs; ``None`` means no
            port pressure
        :param latency: value written to the ``latency`` key
        :param comment: optional text appended as a comment to the ``name`` line
        :returns: the entry as a newline-terminated string
        """
        # ``None`` replaces a shared mutable ``[]`` default.
        if port_pressure is None:
            port_pressure = []
        comment = " # " + comment if comment else ""

        # Keys of one entry are indented by two spaces so they line up with "name"
        # behind the leading "- "; operand attributes are indented by four spaces
        # below their "- class:" item. With less indentation the text is not a
        # well-formed YAML block mapping.
        parts = [
            "- name: {}{}\n  operands: {}\n".format(
                instruction_name, comment, "[]" if len(operand_types) == 0 else ""
            )
        ]
        parts.extend(self._describe_operand(ot) for ot in operand_types)
        parts.append(
            (
                "  latency: {latency}\n"
                "  port_pressure: {port_pressure!r}\n"
                "  throughput: {throughput}\n"
                "  uops: {uops}\n"
            ).format(
                latency=latency,
                port_pressure=port_pressure,
                throughput=self.compute_throughput(port_pressure),
                uops=sum(uops for uops, _ in port_pressure),
            )
        )
        return "".join(parts)

    def parse_port_pressure(self, port_pressure_str):
        """Parse a port pressure string into a list of ``[cycles, ports]`` pairs.

        Terms are separated by ``+`` and have the form ``<cycles>*p<ports>``. A port
        group without a comma is kept as one string; a comma-separated group
        becomes a list of its non-empty names.

        Example:
            1*p45+2*p0+2*p10,11 -> [[1, '45'], [2, '0'], [2, ['10', '11']]]

        :param port_pressure_str: string to parse; an empty value yields ``[]``
        :returns: list of ``[int, str | list[str]]`` pairs
        """
        port_pressure = []
        if not port_pressure_str:
            return port_pressure
        for term in port_pressure_str.split("+"):
            cycles, ports = term.split("*p")
            names = ports.split(",")
            if len(names) == 1:
                ports = names[0]
            else:
                ports = [name for name in names if name]
            port_pressure.append([int(cycles), ports])
        return port_pressure

    def process_item(self, instruction_form, resources):
        """Build the entry for an instruction form and its measured resources.

        The instruction form is split at spaces into the name and the operand
        types. If it starts with ``[`` and contains ``]``, everything up to and
        including the first ``]`` is the first element and the text up to the next
        ``]`` (if any) is split at spaces.

        Example:
            ('mov xmm mem', ('1*p45+2*p0', 7)) is passed on to build_description as
            ('mov', ['xmm', 'mem'], [[1, '45'], [2, '0']], 7)

        :param instruction_form: instruction name followed by operand types
        :param resources: pair ``(port_pressure_str, latency)``; the latency is
            converted with ``int``
        :returns: the entry text produced by ``build_description``
        """
        if instruction_form.startswith("[") and "]" in instruction_form:
            pieces = instruction_form.split("]")
            instr_elements = [pieces[0] + "]"] + pieces[1].strip().split(" ")
        else:
            instr_elements = instruction_form.split(" ")
        latency = int(resources[1])
        port_pressure = self.parse_port_pressure(resources[0])
        instruction_name = instr_elements[0]
        operand_types = instr_elements[1:]
        return self.build_description(instruction_name, operand_types, port_pressure, latency)
|
||||
|
||||
|
||||
class ArchEntryBuilder(EntryBuilder):
    """EntryBuilder that adds load/store port pressure and latency to memory forms.

    The resource values are hard-coded in ``build_description``; the set labelled
    "Zen3" is active, the set labelled "Intel ICX" is kept as a commented-out
    alternative.
    """

    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """Return the entry text, extended by load or store resources if needed.

        :param instruction_name: name written to the entry
        :param operand_types: operand type strings, classified with ``classify``
        :param port_pressure: ``[uops, ports]`` pairs of the register-only form;
            ``None`` means no port pressure. The passed list is not modified.
        :param latency: latency of the register-only form
        :returns: the entry text; load and store forms are commented with
            "with load" / "with store"
        :raises AssertionError: from ``classify`` for combined load-store forms
        """
        # Intel ICX
        # LD_pressure = [[1, "23"], [1, ["2D", "3D"]]]
        # LD_pressure_vec = LD_pressure
        # ST_pressure = [[1, "79"], [1, "48"]]
        # ST_pressure_vec = ST_pressure
        # LD_lat = 5
        # ST_lat = 0
        # Zen3
        LD_pressure = [[1, ["11", "12", "13"]]]
        LD_pressure_vec = [[1, ["11", "12"]]]
        ST_pressure = [[1, ["12", "13"]]]
        ST_pressure_vec = [[1, ["4"]], [1, ["13"]]]
        LD_lat = 4
        ST_lat = 0

        # Work on a copy: extending the argument in place would leak the added
        # load pressure into the caller's list (or into a shared default list).
        port_pressure = [] if port_pressure is None else list(port_pressure)

        load, store, vec = self.classify(operand_types)

        if load:
            port_pressure = port_pressure + (LD_pressure_vec if vec else LD_pressure)
            return super().build_description(
                instruction_name, operand_types, port_pressure, latency + LD_lat, "with load"
            )

        if store:
            port_pressure = port_pressure + (ST_pressure_vec if vec else ST_pressure)
            return super().build_description(
                instruction_name, operand_types, port_pressure, latency + ST_lat, "with store"
            )

        # Register only:
        return super().build_description(instruction_name, operand_types, port_pressure, latency)
|
||||
|
||||
|
||||
def get_description(instruction_form, port_pressure, latency, rhs_comment=None):
    """Return the database entry text for one instruction form.

    :param instruction_form: instruction name followed by its operand types,
        separated by spaces
    :param port_pressure: port pressure string as accepted by
        ``ArchEntryBuilder.parse_port_pressure``
    :param latency: latency, given as ``int`` or as a string convertible by ``int``
    :param rhs_comment: optional text appended as ``# <text>`` to every line of
        the entry, aligned in one column
    :returns: the entry text
    """
    entry = ArchEntryBuilder().process_item(instruction_form, (port_pressure, latency))
    if rhs_comment is None:
        return entry

    # The entry ends with a newline, so the split yields a final empty string;
    # like every other line it is padded and commented.
    lines = entry.split("\n")
    max_length = max(len(line) for line in lines)
    # Left-align every line to the longest one so the comments start in one column.
    template = "{:<" + str(max_length) + "} # {}\n"
    return "".join(template.format(line, rhs_comment) for line in lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: print the entry for the given instruction form.
    import sys

    # Three mandatory arguments plus an optional comment.
    if len(sys.argv) not in (4, 5):
        print("Usage: {} <INSTRUCTION> <PORT_PRESSURE> <LATENCY> [COMMENT]".format(sys.argv[0]))
        # NOTE(review): a usage error exits with status 0 - confirm this is intended.
        sys.exit(0)

    try:
        print(get_description(*sys.argv[1:]))
    except KeyError:
        # NOTE(review): nothing visible in this file raises KeyError for an
        # architecture lookup - confirm whether this handler is still needed.
        print("Unknown architecture.")
        sys.exit(1)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user