Compare commits

...

836 Commits

Author SHA1 Message Date
JanLJL
097e5a6a81 add tests for GCC Intel syntax 2025-03-12 13:49:59 +01:00
pleroy
91da9a311a Upper case the argument to the --syntax flag, otherwise 'att' means 'intel' :-/ 2025-03-12 00:35:01 +01:00
pleroy
0c201be10e Revert 62908f3b8f and fix a failure in tests.test_cli.TestCLI.test_without_arch while preserving the possibility to try more archs than the detected one. 2025-03-11 23:34:36 +01:00
JanLJL
2cf2bf5cec Merge branch 'master' into merge-branch 2025-03-07 14:45:44 +01:00
JanLJL
4e3994fec1 added support for <Xd>! registers and [<Xd>]! mem addresses in Arm 2025-03-07 11:49:14 +01:00
JanLJL
ff727223bb add setuptools to Install 2025-03-05 13:19:09 +01:00
JanLJL
306abcf0a6 specify commit for kerncraft 2025-03-05 12:54:55 +01:00
JanLJL
0b1ada14d0 undo unnecessary install 2025-03-05 11:01:47 +01:00
JanLJL
81dfb0e6cb use local osaca version 2025-03-05 11:00:08 +01:00
JanLJL
796256fa13 more black formatting 2025-03-05 10:40:18 +01:00
JanLJL
99d0e0ffb6 install kerncraft from git repo 2025-03-05 10:38:39 +01:00
JanLJL
9c2f559983 black formatting 2025-03-05 10:20:47 +01:00
JanLJL
02716e7b41 flake8 formatting 2025-03-05 10:19:10 +01:00
JanLJL
5cd6b2cf9d renamed .asm files to .s for consistency 2025-03-05 09:36:07 +01:00
JanLJL
be3622ce86 bugfixes 2025-03-04 17:46:37 +01:00
JanLJL
63774e65bc chmod +x 2025-03-04 17:46:23 +01:00
JanLJL
796cfdc3b5 add test case for specific syntax parameter in get_asm_parser() 2025-03-04 17:45:19 +01:00
JanLJL
253b0ee9d5 remove dependency on MachineModel 2025-03-04 17:44:27 +01:00
JanLJL
e37f9f119d add default syntax for get_parser for compatibility with kerncraft 2025-03-04 17:44:02 +01:00
JanLJL
400be352e1 remove dependency on MachineModel 2025-03-04 17:42:52 +01:00
JanLJL
62908f3b8f fix bug when no micro arch was given 2025-03-04 17:42:05 +01:00
JanLJL
34fef3823b get_marker() needed for kerncraft 2025-03-03 18:26:33 +01:00
pleroy
1a7c1588f6 Add support for the Intel syntax supported by MSVC and ICC 2025-02-02 14:02:16 +01:00
JanLJL
785a365c63 bugfix 2025-01-09 17:11:21 +01:00
JanLJL
34321109df more SVE instructions 2025-01-09 16:48:48 +01:00
JanLJL
ea5e56083e added vfmaddsub instructions 2024-12-31 13:46:44 +01:00
JanLJL
94e90e292c added vshufp[sd] instructions 2024-12-31 13:46:30 +01:00
JanLJL
d46ef5db22 added instructions 2024-11-26 16:57:39 +01:00
JanLJL
8482463db3 version bump 2024-10-11 16:34:48 +02:00
JanLJL
9b9d7f8649 changed default ARCHs to SPR/V2 2024-10-11 16:29:13 +02:00
JanLJL
2d1d8bf38f instruction updates 2024-10-11 16:28:49 +02:00
JanLJL
fdb57fc8da DB updates 2024-10-11 15:57:36 +02:00
JanLJL
856ae737be Merge branch 'master' of github.com:RRZE-HPC/osaca 2024-10-10 18:35:57 +02:00
JanLJL
9bc4ca24d8 added scatter instructions 2024-10-10 18:20:38 +02:00
JanLJL
a8cab3170c fix #109 2024-10-01 12:00:47 +02:00
JanLJL
b0c243305c version bump 2024-09-16 21:09:12 +02:00
JanLJL
fc621b2eff instructions updated 2024-09-08 09:35:14 +02:00
JanLJL
b3526e7eba updated instructions 2024-09-05 15:54:11 +02:00
JanLJL
3059952025 formatting 2024-09-05 10:44:23 +02:00
JanLJL
41d62c1100 more instructions 2024-09-05 10:41:16 +02:00
JanLJL
a24710412a bugfix 2024-09-05 08:42:53 +02:00
JanLJL
ce258bb4b8 bugfix 2024-09-05 08:42:11 +02:00
JanLJL
892ff71701 fixed formatting 2024-09-04 13:10:58 +02:00
JanLJL
152dd57aec added min version requirement for kerncraft in GHA 2024-09-04 12:53:39 +02:00
JanLJL
e0f4ae1e55 bugfix 2024-09-04 10:59:27 +02:00
JanLJL
3ce7e2f202 updated for newer uarchs 2024-09-04 09:52:32 +02:00
JanLJL
d4d98bd0de added SVE reg width output in main func 2024-09-03 14:27:28 +02:00
JanLJL
ef00b67f3f introduced data ports for more accurate load/store 2024-09-03 14:23:37 +02:00
JanLJL
aea6212560 fixed pre-/post-indexed keyword 2024-09-03 14:23:14 +02:00
JanLJL
a731954166 fixed YAML output 2024-09-03 14:22:19 +02:00
JanLJL
2fcc62ae1a initial support ZEN 4 2024-08-30 17:41:45 +02:00
JanLJL
6ea61d8893 formatting 2024-08-19 15:52:28 +02:00
JanLJL
2d8bb99d9f formatting 2024-08-19 15:50:37 +02:00
JanLJL
df55c29e2f updated DB 2024-08-19 15:31:55 +02:00
JanLJL
2bdc765df2 fixed bug in read-out of default store TP 2024-08-19 14:37:20 +02:00
Jan
628bdf6518 Merge pull request #108 from MarkusBuettner/fix_x86_memory_segments
Update parsing of x86 memory segments
2024-08-13 18:33:37 +02:00
Markus Büttner
0e69fa1e26 Update parsing of memory segments
This addresses issue discussed in RRZE-HPC/OSACA#107.

Now it can parse instructions of the form

%fs:var@RELOC
%fs:var@RELOC+4
%fs:var@RELOC(%rdi)
2024-07-30 16:02:16 +02:00
Jan
bc8d0c7088 Merge pull request #105 from RRZE-HPC/feat/spr
SPR and Neoverse V2 support
2024-05-03 15:22:38 +02:00
JanLJL
c87524384a formatting 2024-05-03 12:36:30 +02:00
JanLJL
8d0900e46a added missing TP entries and removed duplicates 2024-05-03 12:31:50 +02:00
JanLJL
8fa31a7fca formatting 2024-05-03 12:31:25 +02:00
Jan
5071d63b9a Fixed uarch table layout 2024-05-02 23:06:12 +02:00
JanLJL
2286da45b7 Merge branch 'master' into feat/spr 2024-05-02 21:19:10 +02:00
JanLJL
2ba04e614a DB update 2024-05-02 21:17:57 +02:00
Jan
d9cf46690f Merge pull request #102 from RRZE-HPC/InstrucForm
Object-oriented operands and instruction forms
2024-05-02 21:01:05 +02:00
JanLJL
aca5511d6a Black formatting 2024-05-02 17:04:56 +02:00
JanLJL
c9e38631d1 Flake8 formatting 2024-05-02 17:00:12 +02:00
JanLJL
d623115b1b formattign 2024-05-02 16:30:11 +02:00
JanLJL
5da00d0ae6 moved get_full_instruction_name() from HardwareModel to DBInterface 2024-05-02 16:25:41 +02:00
stefandesouza
78309574ac added prefetch operand 2024-03-18 22:29:39 +01:00
JanLJL
764b22cebe initial support for SPR 2024-03-06 00:52:06 +01:00
stefandesouza
4fd59eb0d0 Black formatting 2024-03-05 12:14:05 +01:00
stefandesouza
18da151bbf Merge remote-tracking branch 'origin/master' into InstrucForm 2024-03-05 12:12:43 +01:00
stefandesouza
d884f74f5e Uncommented tests 2024-03-05 00:19:29 +01:00
stefandesouza
5f9de2c41d Dump now converts classes to dicts 2024-03-05 00:18:45 +01:00
JanLJL
3435641451 initial support Neoverse V2 2024-03-04 20:45:48 +01:00
stefandesouza
38781ecc94 Port pressure returned in tuple with Memory Operand 2024-03-04 20:00:43 +01:00
stefandesouza
46004add41 Immediate operand attribute name changes 2024-02-28 13:01:37 +01:00
stefandesouza
1c0708e750 Added updated files 2024-02-27 14:47:55 +01:00
stefandesouza
d858827a47 Took out port pressure from Memory Operand. Gets() for LD/ST TP now use tupples 2024-02-24 21:03:15 +01:00
stefandesouza
dcfe36b850 Took out name attribute from operand parent class 2024-02-24 15:46:04 +01:00
stefandesouza
7ad3438af5 Removed comments from operands 2024-02-24 14:15:25 +01:00
stefandesouza
61dacff53e Inlined conversion of LD/ST memory operands 2024-02-24 12:45:59 +01:00
stefandesouza
fa95293cb0 Flags into operand class 2024-02-24 12:11:52 +01:00
stefandesouza
c2bd484170 Register attributes name change 2024-02-22 13:53:14 +01:00
stefandesouza
66e51630af Memory attributes name change 2024-02-22 13:51:48 +01:00
stefandesouza
999806ec59 Removed redundant check 2024-02-22 13:51:14 +01:00
stefandesouza
f88fafdecc Uncommented check for unknown TP flags 2024-02-22 13:50:31 +01:00
stefandesouza
d4a6a9b44f Instruction form text change 2024-02-22 13:49:56 +01:00
stefandesouza
04388af5dd Made all attributes lower case 2024-02-22 13:48:56 +01:00
stefandesouza
07f1af966d Reverted comment 2024-02-22 13:47:47 +01:00
stefandesouza
d47b0192cf Merge branch 'InstrucForm' of https://github.com/RRZE-HPC/OSACA into InstrucForm 2024-02-22 13:45:04 +01:00
stefandesouza
6df973d16a Restore deleted files 2024-02-22 13:37:13 +01:00
JanLJL
30d536fd26 changed post/pre-indexed key in M1 file 2024-02-08 10:17:18 +01:00
Jan
a2b40b9d2c Added bge instruction 2024-02-01 10:42:35 +01:00
Jan
abfce92b4b Support more branching commands 2024-02-01 10:39:40 +01:00
stefandesouza
ec798f61b2 More formatting 2024-01-10 13:26:50 +01:00
stefandesouza
1fb015b312 Formatting before PR 2024-01-10 13:05:27 +01:00
stefandesouza
226bc8eee0 Added shift and shift_op to Register Operand 2024-01-04 14:34:36 +01:00
stefandesouza
0b3508abf8 Small cleaup commit 2023-12-16 16:00:37 +01:00
stefandesouza
cb5e0bdc38 Merged master 2023-12-16 12:15:12 +01:00
stefandesouza
4647615c5c Merge remote-tracking branch 'origin/master' into InstrucForm 2023-12-16 12:14:36 +01:00
Jan
157f1165bb Added uarch table 2023-12-13 10:23:06 +01:00
JanLJL
93c7d10bbe version bump 2023-12-12 19:11:48 +01:00
Jan
1d62359cc6 Merge pull request #101 from RRZE-HPC/feat/m1
Feat/m1
2023-12-12 10:57:31 -07:00
JanLJL
f3b50b93f5 added M1 arch 2023-12-12 18:33:24 +01:00
JanLJL
c5ef5f7432 bugfixes for SP reg and ccodes 2023-12-12 18:32:43 +01:00
JanLJL
78387a374d Merge branch 'master' into feat/m1 2023-12-12 15:58:24 +01:00
stefandesouza
339b06bd7f Linters update 2023-12-10 18:25:00 +01:00
stefandesouza
8a6ae8c701 Workflow file includes new kenrcraft branch. Also changed checks for 'bad_operands' since they don't fit class style attributes 2023-12-10 17:26:43 +01:00
stefandesouza
cac4a0ebf2 flake8 standards 2023-12-03 21:04:58 +01:00
stefandesouza
cef7f8098d Black formatting 2023-12-03 17:22:11 +01:00
stefandesouza
93ae586745 Fixed semantic and marker tests. Now only dump needs to be adjusted 2023-12-03 16:49:33 +01:00
stefandesouza
2c32ccf37a pre/post-indexed to pre/post_indexed. Now have use ImmediateOperand type for mem offset. Changed some parser tests also 2023-12-02 16:56:43 +01:00
stefandesouza
26d65750a6 Fixed issue with throughput assignment 2023-10-30 19:32:05 +01:00
stefandesouza
ebb973493b Added condition operand, adjusted tests to parse it & a few changes to get the kernelDG tests working 2023-10-29 16:36:00 +01:00
stefandesouza
14a2aa0b52 Changed style to conform to PEP-8 conventions; Added source and destination attributes to parent Operand class 2023-10-29 13:52:49 +01:00
stefandesouza
4186edbc03 added a couple of attributes 2023-10-23 21:57:01 +02:00
stefandesouza
e0a2ea9eb2 Hidden operands and dependency break in iforms now included 2023-10-23 21:54:58 +02:00
stefandesouza
c171a11101 Updated db_interface files to work with class objects 2023-10-23 18:19:35 +02:00
stefandesouza
33d1eec106 Updated tests to use the now class style iforms in isa_data 2023-10-23 16:25:31 +02:00
stefandesouza
db02359ea2 frontend tests now use new OO style, removed AttrDict usage 2023-10-22 16:43:46 +02:00
stefandesouza
6384ea2e18 Convert isa_data iforms to InstructionForm type 2023-10-17 12:28:49 +02:00
stefandesouza
e95278d2a2 Included 'source' and 'destination' attributes when loading isa data 2023-10-16 15:48:47 +02:00
JanLJL
2331e4dd8f added vbroadcast instr to ICX/ICL/SKX 2023-10-13 14:47:44 +02:00
JanLJL
dc250bcedc initial commit of M1 model (not complete) 2023-09-28 10:04:15 +02:00
stefandesouza
0b2753a78d Throughput assignment adjustments 2023-09-25 23:20:10 +02:00
stefandesouza
db899a2709 Changing operand matching for class operand style 2023-09-25 21:35:17 +02:00
JanLJL
74478034f7 Merge branch 'master' of github.com:RRZE-HPC/osaca 2023-09-13 09:49:28 +02:00
stefandesouza
42f96753c1 Black formatting 2023-09-12 12:45:28 +02:00
stefandesouza
a8e5a6ad46 Converting operand types read in from YAML files 2023-09-12 00:23:59 +02:00
stefandesouza
7f4f87d192 Changes for operand matching, instruction loading 2023-09-11 18:23:57 +02:00
stefandesouza
615ef82f04 Changes to accomodate the new OO style 2023-08-28 15:19:46 +02:00
stefandesouza
36549dd679 Updated list/range register resolver & applied black formatting 2023-08-26 14:51:04 +02:00
stefandesouza
76f3baf74e Removed all AttrDict() usage in parser. process_operand() now turns single registers into operands 2023-08-21 18:53:56 +02:00
stefandesouza
b06e6424f7 Added eq methods, changed AArch parser tests for class usage 2023-08-21 15:36:40 +02:00
stefandesouza
0a32c77751 Added 2 operand types and made changes for attribute usage 2023-08-20 21:01:44 +02:00
stefandesouza
eb09cbde42 Module imports 2023-08-20 13:37:57 +02:00
stefandesouza
ecdfc15ac5 InstrucForm class usage in AArch parser 2023-08-20 13:35:11 +02:00
stefandesouza
317816b9d3 Separate operand files with inheritance, str and repr classes 2023-08-20 12:10:07 +02:00
stefandesouza
4c74bb0d46 Merge remote-tracking branch 'origin/master' into InstrucForm
merge
2023-08-20 11:39:20 +02:00
stefandesouza
537076fa25 Added seperate operand class files 2023-08-20 11:38:56 +02:00
JanLJL
8cc408a307 version bump 2023-08-16 00:58:23 +02:00
Jan
6d275a1207 Update modules used in GH actions 2023-08-15 14:55:10 +02:00
Jan
c2ee276609 Added --yaml-out flag 2023-08-15 14:33:22 +02:00
Jan
03a9caf0eb Merge pull request #96 from stephenswat/feat/yaml_output
Add support for structured YAML output
2023-08-15 14:31:31 +02:00
JanLJL
f856c578bf added tests for dict output of analysis 2023-08-15 14:01:15 +02:00
JanLJL
ab10febe74 enhanced YAML output to include all kernel objects and no ruamel.yaml-specific data types 2023-08-15 14:01:11 +02:00
JanLJL
b50bc9ba1f Merge branch 'master' into pr96 2023-08-15 08:59:22 +02:00
Jan
01cc93f56c Merge pull request #95 from stephenswat/feat/zen3_imul
Add IMUL instruction for Zen 3 architectures
2023-08-15 08:57:59 +02:00
Stephen Nicholas Swatman
e1ce402133 Add support for structured YAML output
This commit adds a new `--yaml-out` flag to OSACA which allows the user
to dump the results of an analysis to a YAML file, allowing them to
analyse the file more easily. I have tried to make the output as
comprehensive as possible.
2023-08-12 19:43:43 +02:00
Stephen Nicholas Swatman
7cd380e7b8 Add IMUL instruction for Zen 3 architectures
This commit adds data on the IMUL (r, r) instruction on the AMD Zen 3
microarchitecture.
2023-08-12 19:40:44 +02:00
stefan.desouza@outlook.com
1eb692c86f Classes for OperandForm and Operand types 2023-08-07 15:01:48 +02:00
stefan.desouza@outlook.com
9a0474bcc1 Added DirectiveClass with comment iterator 2023-08-06 17:42:42 +02:00
stefan.desouza@outlook.com
71e2931bb0 Added InstructionForm class 2023-08-06 17:13:42 +02:00
JanLJL
c599ce4967 fixed wrong LEA DB entry 2023-08-03 11:13:23 +02:00
JanLJL
e476893dec version bump 2023-08-02 11:26:45 +02:00
JanLJL
c090d24edf added new instructions 2023-08-02 11:04:56 +02:00
JanLJL
9f9471ee4e changed TP/LT for reg renaming moves 2023-08-02 11:03:31 +02:00
Jan
870074b7ab Update issue templates 2023-07-18 17:24:10 +02:00
JanLJL
1125e4c5d9 fixed UnboundLocalError if tp assignment loop is not executed 2023-07-17 14:52:22 +02:00
JanLJL
88a1efe633 fixes #93 2023-07-17 14:22:05 +02:00
JanLJL
a0d8895d38 added shift instructions 2023-07-05 16:42:34 +02:00
JanLJL
c6ed492db3 fixed read out of store TP from DB 2023-06-20 21:20:41 +02:00
JanLJL
1ac20073ab added reg specific store TP 2023-06-20 21:17:37 +02:00
JanLJL
0a2d1f866d adjusted scraper due to new felixcloutier table layout 2023-06-20 21:16:40 +02:00
JanLJL
d46574db1f version dump 2023-03-24 17:42:03 +01:00
Jan
8e6289af1a fixed nested list 2023-03-24 17:08:59 +01:00
JanLJL
2a43676097 added p-indexing latency values for Arm architectures 2023-03-24 17:05:45 +01:00
JanLJL
c354306b3b updated requirements 2023-03-24 17:05:29 +01:00
JanLJL
9e3ab49065 updated README with new parameter info 2023-03-24 16:39:58 +01:00
Jan
475823d4dc Merge pull request #92 from dgazzoni/aarch64-conditions-codes
Support for flags and conditional ops on AArch64
2023-03-21 18:08:54 +00:00
JanLJL
54644ffb09 black-compliant formatting 2023-03-14 18:22:27 +01:00
JanLJL
af3b1fe3e8 add missing instruction for test 2023-03-14 17:51:20 +01:00
JanLJL
0b93766bdd Merge branch 'master' into pr-armcc 2023-03-14 17:50:48 +01:00
JanLJL
eab6907c82 added support for optional condition flag dependency analysis 2023-03-14 17:00:49 +01:00
JanLJL
27eb8f62b6 more instructions 2023-03-14 17:00:23 +01:00
JanLJL
d1201ace11 added more dependency analysis for post/pre indexing and condition flags 2023-03-14 17:00:02 +01:00
JanLJL
d6569a0f23 renamed condition code attrib, fixed incorrect src/dst, and added more conditional instructions 2023-03-14 16:57:34 +01:00
JanLJL
7e6eb7ce58 bugfix, resolved #90 2023-03-07 17:05:31 +01:00
JanLJL
10d4c4b87e added instruction 2023-03-07 17:04:32 +01:00
JanLJL
f06f767c34 formatting according to black 2023-03-03 15:24:18 +01:00
JanLJL
9f715c0ba3 added fallback search in arch/ISA model for ARM instructions with shape/cc suffixes 2023-03-03 15:11:40 +01:00
JanLJL
2884d17971 enabled indexing without shape and lane for vector regs 2023-03-03 14:41:48 +01:00
JanLJL
dbfba9ce5b added another instruction 2023-03-03 14:39:28 +01:00
JanLJL
841a4a5724 resolve #81 2023-03-02 15:50:13 +01:00
Décio Luiz Gazzoni Filho
19c47db3ed Support for flags and conditional ops on AArch64 2023-02-19 22:08:42 -03:00
JanLJL
06bc51ba63 version bump 2023-02-15 19:03:11 +01:00
JanLJL
27f408e4a3 new black formatting 2023-02-15 18:54:53 +01:00
JanLJL
13c75a3312 new black formatting 2023-02-15 18:46:20 +01:00
JanLJL
98f28a7c01 exclude .ipynb files 2023-02-15 17:00:05 +01:00
JanLJL
c3001e6ecb exclude .ipynb files 2023-02-15 16:59:19 +01:00
JanLJL
b20f5539bf black formatting 2023-02-15 16:53:26 +01:00
JanLJL
4ca7c26d20 updated versions 2023-02-15 16:49:08 +01:00
JanLJL
4453d3ca78 updated versions 2023-02-15 16:44:44 +01:00
JanLJL
e6d24ea01d added more and refined instruction measurements 2023-02-15 14:24:16 +01:00
JanLJL
bdbbfd446f bugfix in store throughput 2023-01-23 18:42:41 +01:00
JanLJL
4734ed95b2 updated py versions 2022-12-11 17:17:36 +01:00
JanLJL
4c50483a83 updated DB 2022-12-11 17:13:56 +01:00
JanLJL
d61cb287ce version bump 2022-10-11 15:43:46 +02:00
JanLJL
5c21e18e36 more instructions 2022-10-11 15:43:19 +02:00
JanLJL
8807e3eda6 added few instrucitons 2022-10-07 00:51:09 +02:00
JanLJL
7c907e2432 bugfix in store throughput calculation 2022-09-28 14:21:46 +02:00
JanLJL
1ea1e68b4e updated due to master branch sunset 2022-09-28 11:00:39 +02:00
JanLJL
907f64d452 updated due to master branch sunset 2022-09-28 10:59:16 +02:00
JanLJL
24de7a762b version bump 2022-09-28 10:36:15 +02:00
JanLJL
87411ab822 updated CPU archs 2022-09-28 10:33:28 +02:00
JanLJL
2fa25e3099 formatting for flake8 2022-09-28 10:05:18 +02:00
JanLJL
0b440e4da9 updated and bugfixed DB 2022-09-28 10:01:26 +02:00
JanLJL
08e6a4be36 updated DB 2022-09-28 10:01:14 +02:00
JanLJL
7724ce27c7 added Zen3 support 2022-09-27 18:39:14 +02:00
JanLJL
4f8e37d9fd bugfixes and more features 2022-09-27 18:04:59 +02:00
JanLJL
d5f1654aa8 bugfixes 2022-09-27 18:04:33 +02:00
JanLJL
81f40604cb version bump 2022-09-08 10:17:41 +02:00
JanLJL
df747b8c48 more instruction forms 2022-09-07 12:48:39 +02:00
JanLJL
4e25a29a8a removed invalid char 2022-09-07 10:48:45 +02:00
JanLJL
016061f72c more instruction forms 2022-09-07 10:33:28 +02:00
JanLJL
ddff8c5012 added option of explicitly mentioning k regs in DB (not simply gpr) 2022-09-07 10:33:16 +02:00
JanLJL
2306cb58d0 added more instructions for ICX 2022-09-01 15:49:28 +02:00
JanLJL
660a9d0f41 Merge branch 'master' of github.com:RRZE-HPC/osaca 2022-08-31 14:20:07 +02:00
JanLJL
3b453de617 added AND instr 2022-08-31 14:19:23 +02:00
JanLJL
b93d911bb7 fix bug in port util 2022-08-31 14:17:52 +02:00
JanLJL
21cfb8d011 version bump 2022-08-29 12:03:47 +02:00
JanLJL
32d60e7966 added py310 in actions 2022-08-29 11:45:41 +02:00
JanLJL
ba60703fb2 added py310 in actions 2022-08-29 11:45:13 +02:00
JanLJL
76542782c8 formatting 2022-08-29 11:30:46 +02:00
JanLJL
671f7f5591 added ICX architecture 2022-08-29 11:14:56 +02:00
JanLJL
f96f5d7ad1 black formatting 2022-06-22 17:12:53 +02:00
JanLJL
d81c53ef91 fixed #88 2022-06-22 17:09:24 +02:00
JanLJL
a018f80597 version bump 2022-04-08 13:51:08 +02:00
JanLJL
2bc6ba999f a few more instructions 2022-04-08 12:02:05 +02:00
Jan
53cbf39ff9 Merge pull request #87 from RRZE-HPC/patch-linter
Patch linter
2022-04-07 13:23:49 +02:00
JanLJL
3c5c516a6d removed solo black run 2022-04-07 12:18:57 +02:00
JanLJL
93c0753db3 formatting 2022-04-07 12:17:08 +02:00
Jan
23ffd06e34 Update lint.yml 2022-04-07 12:12:33 +02:00
JanLJL
ca0540563d Merge branch 'master' of github.com:RRZE-HPC/OSACA 2022-04-07 10:39:16 +02:00
JanLJL
467f212fa3 updates 2022-04-07 10:39:12 +02:00
JanLJL
0de00e512b removed temp file 2022-04-06 16:28:02 +02:00
Jan
3d26d6b82a Merge pull request #84 from qcjiang/feature/tsv110
Feature/tsv110
2022-04-06 16:25:39 +02:00
JanLJL
75bc03bc76 bugfixes and additions 2022-03-28 10:06:51 +02:00
Qingcai Jiang
fa06b9ccac fix a bug about orr in tsv110 2022-03-20 14:53:34 +08:00
JanLJL
9c966c2359 small bugfixes 2022-03-17 16:38:28 +01:00
Qingcai Jiang
13ec7dc20e Merge branch 'feature/tsv110' of github.com:qcjiang/OSACA into feature/tsv110 2022-02-27 17:19:28 +08:00
Qingcai Jiang
b2a326070f adjust sshll instruction 2022-02-27 17:19:15 +08:00
Qingcai Jiang
2eb6023b7a Merge branch 'pr84' into feature/tsv110 2022-02-18 16:04:55 +08:00
Qingcai Jiang
c2787babee Merge branch 'master' into feature/tsv110 2022-02-18 16:03:25 +08:00
Jan
0119f97942 fixed typo 2022-02-14 10:42:01 +01:00
Jan
6514257767 Merge branch 'master' into feature/tsv110 2022-01-27 10:52:39 +01:00
JanLJL
e1a5272fdf formatting 2022-01-27 10:12:00 +01:00
Jan
5748b2987b Merge pull request #85 from qcjiang/bug_fix/when_mov_is_the_last_instr
a) fix a bug when 'mov' is the last instruction and no markers are given
b) fix bug when kernel consists of only unknown load instructions
2022-01-27 10:09:28 +01:00
JanLJL
a447e289ff adjusted DB 2022-01-26 14:25:01 +01:00
JanLJL
d2a4749c39 added lane comparison for AArch64 reg operands 2022-01-26 14:24:48 +01:00
Qingcai Jiang
c917a83974 modify some instructions for tsv110 2022-01-06 16:25:08 +08:00
Qingcai Jiang
5ebd8a019e add some instructions for tsv110 2022-01-04 20:59:35 +08:00
Qingcai Jiang
fe42870cc2 add some instructions for tsv110 2022-01-04 18:45:32 +08:00
Qingcai Jiang
e70229aa32 Merge branch 'bug_fix/when_mov_is_the_last_instr' into feature/tsv110 2021-12-30 21:33:42 +08:00
Qingcai Jiang
71b9a17ab8 fix a bug when longest_path is not integer, try 'ldpw3, w1, [x0, #0x48]' in AArch64 2021-12-30 21:32:29 +08:00
Qingcai Jiang
b484179e02 fix some instr for tsv110 2021-12-30 21:24:41 +08:00
Qingcai Jiang
203ea2dfb0 XMerge branch 'bug_fix/when_mov_is_the_last_instr' into feature/tsv110 2021-12-30 20:32:34 +08:00
Qingcai Jiang
0e984f4ec7 fix a bug when 'mov' is the last instruction 2021-12-30 20:30:43 +08:00
Qingcai Jiang
c1fa5e3bce add some instructions for tsv110, now support most of the instructions 2021-12-19 18:13:32 +08:00
QCJiang
0ab6efa9cb Merge branch 'RRZE-HPC:master' into feature/tsv110 2021-12-18 17:52:54 +08:00
Qingcai Jiang
feda03408f add some instructions for tsv110 2021-12-18 17:51:41 +08:00
Qingcai Jiang
a738d82533 add some instructions for tsv110 2021-12-18 15:44:07 +08:00
Qingcai Jiang
4e10491fcb add some instructions for tsv110 2021-12-15 21:51:59 +08:00
Qingcai Jiang
a87c077654 Merge branch 'bug_fix/negative_hex_address' into feature/tsv110 2021-12-15 20:04:59 +08:00
Qingcai Jiang
ca3ca56a01 add some instructions in tsv110.yml 2021-12-07 18:27:42 +08:00
Qingcai Jiang
2c530654dd double check with every data in instructions 2021-12-07 16:58:30 +08:00
Qingcai Jiang
ce83727eaf formatted, this commit just put the same instructions together in tsv110.yaml, didn't change any numbers 2021-12-07 16:33:22 +08:00
Qingcai Jiang
62746dfc9c fix latency in str/ldr instructions 2021-12-07 16:17:00 +08:00
Jan
ebadaba3ca Merge pull request #82 from qcjiang/bug_fix/negative_hex_address
fix a bug when the hex_number of address is negative
2021-12-03 14:39:04 +01:00
Jan
2be8606e9a black-conform formatting 2021-12-03 14:38:52 +01:00
Qingcai Jiang
d170ba72dd fix a bug when the hex_number of address is negative 2021-12-03 15:13:54 +08:00
Qingcai Jiang
c35c16e007 fix typos 2021-12-02 22:55:39 +08:00
Qingcai Jiang
d3f081f282 add latency and TP information through ibench 2021-12-01 11:42:36 +08:00
JanLJL
f7579e83a9 added branch instructions and data for ADD 2021-11-29 18:48:13 +01:00
JanLJL
ea0576e8ce changed data for register renaming 2021-11-29 18:39:37 +01:00
JanLJL
37cc10edde unfified STP and LDP instructions 2021-11-29 18:34:39 +01:00
JanLJL
939abe2518 unified LOAD instructions 2021-11-29 17:48:16 +01:00
JanLJL
e120d9229b unified STORE instructions 2021-11-29 17:32:51 +01:00
JanLJL
12095979db adjusted non-instruction_form fields 2021-11-29 15:17:38 +01:00
Qingcai Jiang
ca5e9c3cae add some instructions with ibench 2021-11-17 17:49:05 +08:00
Qingcai Jiang
7194e79beb simple implement for TSV110 2021-11-06 16:04:16 +08:00
JanLJL
c97f93c39b version bump 2021-11-04 14:56:23 +01:00
JanLJL
968c71b7b6 black formatting 2021-11-04 12:11:15 +01:00
JanLJL
df26edd075 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2021-11-04 12:09:57 +01:00
JanLJL
a767b7f290 Closes #78, closes #79; added unary/binary logical operators 2021-11-04 12:09:44 +01:00
JanLJL
ba45038ad7 add latency of last instruction in CP 2021-11-04 11:58:40 +01:00
JanLJL
72e85075c2 better output formatting 2021-11-04 11:55:48 +01:00
Jan
40839384ec Merge pull request #60 from RRZE-HPC/a72
Add support for ARM Cortex-A72
2021-10-14 18:10:36 +02:00
JanLJL
ab615547e5 added Cortex A72 in README 2021-10-14 17:10:08 +02:00
JanLJL
9c16f8bc56 formatted 2021-10-14 10:59:55 +02:00
JanLJL
be891d45d4 formatted 2021-10-14 10:53:34 +02:00
JanLJL
5735291d27 Merge branch 'master' into a72 2021-10-14 10:37:05 +02:00
JanLJL
ab368cded1 unified format 2021-10-14 09:23:35 +02:00
JanLJL
6e99954f0b version bump 2021-10-07 17:10:17 +02:00
JanLJL
5205cb5cc6 fixed formatting with correct line length 2021-10-04 15:00:17 +02:00
JanLJL
e6ce870ca0 black formatting 2021-10-04 14:33:28 +02:00
JanLJL
566fbc6bc4 black conformity 2021-09-30 15:53:56 +02:00
JanLJL
b70cff21ad added instructions for BHIVE 2021-09-29 17:26:44 +02:00
JanLJL
d181184788 enhanced parser 2021-09-29 17:26:27 +02:00
Jan
fcc3475417 added lint configs 2021-08-27 08:14:50 +02:00
JanLJL
d418c16f4a applied flake8 and black rules 2021-08-26 16:58:19 +02:00
JanLJL
34523e1b23 fixed wrong uops info import with masking of some gather/scatter 2021-08-26 11:05:33 +02:00
JanLJL
457ccdcf77 version bump 2021-07-21 02:41:05 +02:00
JanLJL
ff61c65d58 added more load instrs 2021-07-21 02:34:31 +02:00
JanLJL
615c809fe3 updated a few DB entries 2021-06-02 16:37:18 +02:00
JanLJL
bce837dec9 version bump 2021-06-01 00:13:38 +02:00
JanLJL
090c24ade1 fixed parsing of reg ranges and lists 2021-06-01 00:10:05 +02:00
JanLJL
03a2a1da33 version bump 2021-05-10 12:56:35 +02:00
JanLJL
d59b100fa8 changed immediate type from str to int 2021-05-10 01:12:30 +02:00
JanLJL
5c741a8a2d version bump 2021-05-05 11:16:43 +02:00
JanLJL
2f4849f44e added tests for timeout in LCD analyis 2021-05-02 22:48:22 +02:00
JanLJL
f13a97e5b5 fixed bug in case of no uarch in CLI 2021-05-02 22:39:07 +02:00
JanLJL
66282b0eef fix #73 2021-05-02 22:22:30 +02:00
Julian Hammer
9ec7c161ab added missing testfile for sve instructions 2021-05-02 21:44:17 +02:00
Julian Hammer
8d8eaa8e4f addd LD2 and ST2 instructions to a64fx 2021-04-23 13:33:32 +02:00
Julian Hammer
88d5094bf1 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2021-04-23 13:18:23 +02:00
Julian Hammer
1f32252f91 improved register range and list support on AArch64 2021-04-23 13:12:18 +02:00
JanLJL
1de644cd62 fixed incompatibilty to py3.6 2021-04-20 13:59:56 +02:00
JanLJL
3d1c6aae8d set min requirement to py3.6 2021-04-20 13:59:32 +02:00
JanLJL
dafec70e6e added wheel to pypi publishing 2021-04-19 11:33:29 +02:00
JanLJL
6d85fbe9e4 fixed duplicate hyperlink tags 2021-04-19 10:58:11 +02:00
JanLJL
3f31235f8a added no timeout option 2021-04-19 10:57:51 +02:00
JanLJL
cfc061e5e3 version bump 2021-04-19 10:14:26 +02:00
JanLJL
5eb3e07ad6 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2021-04-19 00:34:32 +02:00
JanLJL
a82a0e24a3 bugfixed CLX as uarch flag 2021-04-19 00:34:21 +02:00
Jan
6db08c7e8e added lcd-timeout flag, citations and updated credits 2021-04-19 00:27:24 +02:00
JanLJL
e6a54ee131 added CLX as synonym for CSX uarch 2021-04-19 00:05:53 +02:00
JanLJL
152360bad2 enhanced LCD analysis by making it parallel and added timeout flag 2021-04-19 00:04:03 +02:00
JanLJL
607d459569 keep dependency paths as generators instead of lists 2021-04-17 12:46:44 +02:00
JanLJL
b033b3b7aa allow different base with prefix for offset values 2021-04-17 11:06:39 +02:00
Julian Hammer
0c295dc847 version bump 2021-04-15 14:47:09 +02:00
Julian Hammer
5588e41492 readme added to validation folder 2021-04-15 14:45:23 +02:00
Julian
08440ed5e1 Validation (#71)
Validating of OSACA predictions for IVB, SKX, ZEN1, ZEN2, A64FX and TX2 with different kernels.

build_and_run.py contains the configuration used at RRZE's testcluster and UR's qpace4, Analysis.ipynb contains the analysis script and results. Raw data from measurements (122MB) will be attached to next OSACA release.

For now, find the raw data here: https://hawo.net/~sijuhamm/d/UPIhBOtz/validation-data.tar.gz

The analysis report can be viewed at https://nbviewer.jupyter.org/github/RRZE-HPC/OSACA/blob/validation/validation/Analysis.ipynb

Quite a few changes on OSACA included:

Feature: register change tracking via semantic understanding of operations
Feature: recording LCD latency along path and exposing this to frontend
Feature: support for memory reference aliases
Feature: store throughput scaling (similar to load throughput scaling)
Fix: model importer works with latest uops.info export
Fix: immediate type tracking on ARM now preserves type in internal representaion
Removed unused KerncraftAPI
2021-04-15 14:42:37 +02:00
Julian Hammer
25a0e0607d added missing instructions to all DBs 2021-04-05 16:47:52 +02:00
JanLJL
b0e35316f0 changed consideration of masking for database back to NO 2021-03-25 11:50:17 +01:00
JanLJL
94313ec772 added new instructions and bugfixed the wrong import from uops.info for masks for A FEW instructions 2021-03-25 11:47:15 +01:00
Julian Hammer
63563ecabc flake8 to ignore some errors and small style improvements 2021-03-11 12:52:34 +01:00
Julian Hammer
b7625a4a25 making flake8 happy 2021-03-11 12:29:14 +01:00
Julian Hammer
7da2f5bd7b fixed output redirection 2021-03-11 12:17:46 +01:00
Julian Hammer
6158a83b4f added blame ignore file 2021-03-11 12:03:40 +01:00
Julian Hammer
6204c90934 migrate code style to Black 2021-03-11 12:02:45 +01:00
Julian Hammer
1ebe5ecfbd sanity check validity of operand entries 2021-03-11 11:38:25 +01:00
Julian Hammer
0c5ac26f3f fixed typos and added missing default load/store info 2021-03-11 10:10:09 +01:00
JanLJL
9a13e5cbc5 guarantee 0 latency for None values in DB 2021-03-11 01:55:57 +01:00
Julian Hammer
dcf3e38612 fix #66 typo in icl.yml
cmp failed with immediates
2021-03-10 11:57:21 +01:00
Jan
09a14465c1 Merge pull request #65 from RRZE-HPC/throw_parsing_error
passing parsing errors to the outside
2021-03-08 09:38:24 +01:00
Julian Hammer
d7a687909e passing parsing errors to the outside 2021-03-05 18:07:36 +01:00
JanLJL
f8d53a69d7 changed test after adjustment in parser 2021-02-25 08:12:10 +01:00
JanLJL
74a479fb95 fixed AArch64 parser for register shifts and new instructions for A64FX 2021-02-25 07:43:42 +01:00
Julian Hammer
4fdf312622 add "sub gpr, gpr" to skx 2021-02-15 11:41:21 +01:00
Julian Hammer
803da767f2 add "vmovapd zmm, zmm" to skx 2021-02-15 11:41:20 +01:00
JanLJL
0e69db9de9 new instructions 2021-01-31 18:11:06 +01:00
JanLJL
9f87606ce8 minor model fixes 2021-01-26 12:56:19 +01:00
JanLJL
768a90de10 cover kernel with only unknown instructions 2021-01-07 12:40:16 +01:00
Jan
8c9557760e Merge pull request #62 from jdomke/attfix
att parser: workaround for crash with "jg,pt" mnemonic
For now we will ignore the branch taken/not-taken indication and will only keep the condition in the mnemonic.
2021-01-07 10:23:39 +01:00
Jan
4dbcfbda5d suppress branch indication in mnemonic for now 2021-01-07 10:20:57 +01:00
JanLJL
ed13cde61b fixed DV port bug 2021-01-07 09:59:18 +01:00
jdomke
4f8ed13309 att parser: support for more mnemonic
found some 'jg,pt' in icc/mkl generated binaries which crashed the
parser, here an example:
 dd8ccd:   3e 7f 90   jg,pt  dd8c60 <mkl_blas_avx2_dtrsm_kernel...
2021-01-05 15:26:22 +09:00
JanLJL
3c7971b347 fixes #61 2020-12-28 10:33:54 +01:00
Jan
d89a742718 Changed badge target to GH Actions 2020-12-18 14:13:26 +01:00
Julian Hammer
05fdbcf060 updated travis badge to gh action 2020-12-18 14:13:26 +01:00
Julian Hammer
0f86d2d1b2 disabled py35 with tox 2020-12-18 14:13:26 +01:00
Julian Hammer
dff86d456e configured flake8 max-line-length 2020-12-18 14:13:26 +01:00
Julian Hammer
77e7c3a520 thrid wheel was to much 2020-12-18 14:13:26 +01:00
Julian Hammer
ba957877e3 merged test and publish again 2020-12-18 14:13:26 +01:00
Julian Hammer
1d52362306 added missing runs-on 2020-12-18 14:13:26 +01:00
Julian Hammer
dd37a21c56 installing libgraphviz-dev 2020-12-18 14:13:26 +01:00
Julian Hammer
0bdc180a52 installing graphviz 2020-12-18 14:13:26 +01:00
Julian Hammer
f6a02a8f3e enabling gh_actions 2020-12-18 14:13:26 +01:00
JanLJL
60f792c4b2 new instructions 2020-12-17 12:38:58 +01:00
Git out :V
12044e3ac4 Initial support for the Cortex-A72 (Raspberry Pi 4) 2020-12-16 18:49:16 +01:00
Git out :V
8454edef73 Data for creating A72 model
Add PMEvo mapping from
https://github.com/cdl-saarland/pmevo-artifact/blob/master/vm_setup/data/A72/mapping_pmevo.json
together with a template file to allow generating an OSACA model for the
A72.
2020-12-16 18:48:55 +01:00
Git out :V
9165306808 PMEvo port mapping to OSACA model converter script
Tool for semi-automatically creating an OSACA model using a PMEvo port
mapping, optionally using asmbench to measure latency and throughput,
which otherwise are not available in the port mapping.

This is only designed to handle AArch64 architectures, in particular the
Cortex-A72, used on the Raspberry Pi 4. Usefulness for other models may
be limited.
2020-12-16 18:47:49 +01:00
JanLJL
449932d75b version bump 2020-12-11 01:00:30 +01:00
Jan
c68ad48e6b Update README.rst 2020-12-11 00:58:34 +01:00
JanLJL
8e3d613843 new instructions 2020-12-09 11:52:10 +01:00
JanLJL
2093610bbf add support for :: delimiter in x86 labels and identifiers 2020-12-07 02:45:49 +01:00
JanLJL
f9f1120da6 added support for absolute address operands without brackets 2020-12-07 01:57:06 +01:00
JanLJL
e87ab5d6ca new instruction 2020-12-07 01:18:32 +01:00
JanLJL
82b35e7649 new instruction 2020-12-07 01:18:32 +01:00
JanLJL
23623ca18a enhancements for lookup and parsing AArch64 instrs 2020-12-07 01:18:32 +01:00
JanLJL
b9e434d124 new instructions 2020-12-07 01:18:32 +01:00
JanLJL
0e47034c8b Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2020-12-06 18:06:36 +01:00
JanLJL
81ce395115 added the possibility of a 5th operand 2020-12-06 18:05:59 +01:00
JanLJL
f41854a0a6 better port order and a few new instructions 2020-12-06 18:05:36 +01:00
Julian
818b516289 Update README.rst 2020-11-23 11:27:35 +01:00
JanLJL
d7e5e12961 version bump 2020-11-23 10:43:00 +01:00
JanLJL
6bc6349c25 fetch version from __init__ file and write uarch in upper case 2020-11-21 21:33:33 +01:00
JanLJL
f69b5f88f0 removed false entries 2020-11-21 21:02:44 +01:00
JanLJL
596a323dfb bugfixes 2020-11-21 21:00:58 +01:00
JanLJL
08b4586b71 replaced b4799d1d with smarter solution 2020-11-20 15:44:49 +01:00
JanLJL
ffb263e20f more instructions 2020-11-20 15:37:32 +01:00
JanLJL
b4799d1d45 bugfix for immediate shifts with hex-base 2020-11-20 15:22:05 +01:00
Julian Hammer
4ff8fdc4ab version bump 2020-11-11 15:14:27 +01:00
JanLJL
c204096d74 fixed typo 2020-11-11 14:11:00 +01:00
JanLJL
dea217c12c fixed test after changing TP value of instruction 2020-11-11 14:04:07 +01:00
JanLJL
92c162daa2 new instructions 2020-11-11 13:54:23 +01:00
JanLJL
87ea8f0f0a new instructions 2020-11-11 12:27:49 +01:00
Julian Hammer
cb04efc384 fixed typo 2020-11-10 13:33:24 +01:00
JanLJL
14c0ea6180 bugfixes 2020-11-09 23:29:42 +01:00
Julian Hammer
314ff4cf9d improved performance of arch_semantics and reg dependency matching 2020-11-09 19:27:47 +01:00
Julian Hammer
f64253b2b9 added dict for instruction lookup 2020-11-09 17:00:46 +01:00
Julian Hammer
979d08358e singelton for isa parsers 2020-11-09 12:36:14 +01:00
Julian Hammer
a2dd6f752d added comment 2020-11-09 12:35:13 +01:00
Julian Hammer
2fb36406a7 performance improvement of throughput summation 2020-11-09 12:01:00 +01:00
Julian Hammer
94086033a8 added __main__.py 2020-11-09 08:27:31 +01:00
JanLJL
75edfc808a version bump 2020-11-06 20:40:13 +01:00
JanLJL
c8c077a834 enhanced length warning 2020-11-06 15:49:13 +01:00
JanLJL
26ee005adc added missing test file 2020-11-06 15:07:57 +01:00
JanLJL
207c53aaad minor bugfix in HW model and added user warnings for more insight 2020-11-06 15:06:36 +01:00
JanLJL
fafd7bc526 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2020-11-06 12:57:46 +01:00
JanLJL
b986d7eba0 added --lines option 2020-11-06 12:57:41 +01:00
Julian Hammer
6b0adb5d68 improved cache handing (always hashing original file) 2020-11-06 12:27:34 +01:00
JanLJL
f9f382a948 bugfixes 2020-11-06 12:03:54 +01:00
Julian Hammer
c6b58c63ab Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-11-03 16:28:28 +01:00
Julian Hammer
78530bfdb0 fail-safed _build_cache.py 2020-11-03 16:28:07 +01:00
JanLJL
5aa0899961 added bdist 2020-11-03 16:10:46 +01:00
JanLJL
7f0abd7d10 version bump 2020-11-02 15:48:19 +01:00
JanLJL
9ba9bab107 try different ISA as fallback when parsing without --arch flag, use SKX as x86 default and enhanced ISA detection heuristic 2020-11-02 15:33:50 +01:00
Julian Hammer
983e66938c version bump 2020-10-29 13:15:23 +01:00
JanLJL
1c889fa785 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2020-10-29 13:00:09 +01:00
JanLJL
022598d94f autodetect ISA and default uarch for ISA 2020-10-29 13:00:02 +01:00
Julian
1f5c9d1c61 using travis-ci.com badge 2020-10-29 12:45:39 +01:00
JanLJL
30e0ad038d ignore pickles in data/ and support py3.9 2020-10-29 11:06:20 +01:00
Julian Hammer
decec86e56 fixed py3.5 compatability 2020-10-29 10:59:00 +01:00
JanLJL
9af689b28c fixed bug in tests and removed unused imports 2020-10-28 19:29:48 +01:00
Julian Hammer
3aea3f2b49 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-10-28 17:16:43 +01:00
Julian Hammer
a6cb09cf1f added cache files to package and building during setup 2020-10-28 17:16:03 +01:00
Julian Hammer
9d2ea8603f new caching structure with support for distribution 2020-10-28 16:29:55 +01:00
JanLJL
a7918db145 enhanced hanlding for immediates with shifting 2020-10-21 12:14:21 +02:00
Julian Hammer
b5b1a1f2b2 version bump 2020-10-20 14:36:43 +02:00
Julian
dd59af16b2 Merge pull request #51 from RRZE-HPC/A64FX
A64FX support and several Arm bugfixes and enhancements including better TP scheduling
2020-10-16 10:44:47 +02:00
JanLJL
d9325724e2 removed duplicate cmp entry 2020-10-16 10:11:51 +02:00
JanLJL
7e7269c2bc refactored operand checking in post-processing 2020-10-16 10:05:08 +02:00
JanLJL
c64a24ae1b no \t replacement before any other point than user output 2020-10-16 09:44:18 +02:00
JanLJL
e8b78e4cc6 Merge branch 'master' into A64FX 2020-10-15 22:44:12 +02:00
JanLJL
cd5a706f56 adjusted tests for AArch64 2020-10-15 17:56:08 +02:00
Jan
13426358d0 Merge pull request #50 from RRZE-HPC/fix/increment_handling
Fixing Increment Handling
2020-10-15 17:00:11 +02:00
Julian Hammer
c80088b628 Merge branch 'master' into fix/increment_handling 2020-10-15 16:36:29 +02:00
Julian Hammer
748474cd81 added more cmp versions 2020-10-15 16:23:14 +02:00
Julian Hammer
2fec0bf810 Merge branch 'master' into fix/increment_handling 2020-10-15 13:55:34 +02:00
Julian Hammer
711a41d18e extended and cleaned up marker tests 2020-10-15 13:54:18 +02:00
Julian Hammer
cf4a9cddcb Merge branch 'master' into fix/increment_handling 2020-10-15 13:17:02 +02:00
Julian Hammer
5a5a1e74f5 added CMP to aarch64 to exclude first op from destinations 2020-10-15 13:15:54 +02:00
Julian Hammer
4865e7ea72 fixed ignoring of last line without end marker 2020-10-15 11:59:51 +02:00
Julian Hammer
d03398ddf9 treating post- and pre-incremeted memory references no longer as src_dst
the incremented register is now considered src_dst instead
2020-10-13 19:25:29 +02:00
Julian Hammer
edb8df3205 considering split AVX loads on SNB and IVB 2020-10-13 11:25:13 +02:00
Julian Hammer
489050723c removed a nother set of no-maker tests 2020-10-13 09:03:13 +02:00
Julian Hammer
0cc0d35ce9 removed maker missing tests 2020-10-12 19:34:04 +02:00
Julian Hammer
7f65bdb022 version bump 2020-10-12 15:39:49 +02:00
Julian Hammer
04360cc897 fixed label identifiers by splitting 2020-10-12 15:39:32 +02:00
Julian Hammer
5e7a12f9bb paranthesis now suppored in identifier strings 2020-10-12 15:05:52 +02:00
Julian Hammer
1def12ee79 if not markes were found, use whole code 2020-10-12 15:04:55 +02:00
Julian Hammer
7269156854 added --out argument 2020-10-12 15:04:18 +02:00
Julian Hammer
d6529ced73 fixed push and added pop 2020-10-12 15:03:03 +02:00
Julian Hammer
eac728dc9f added tx2 support for ldp d1, d2, [x3] 2020-10-07 13:57:57 +02:00
JanLJL
451ba62959 added vector mov 2020-09-23 10:07:43 +02:00
JanLJL
57cf1bfe6f Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-09-17 22:28:56 +02:00
JanLJL
44b921aa73 added BS4 dependency 2020-09-17 22:27:37 +02:00
JanLJL
accb52ce53 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-09-17 22:15:20 +02:00
JanLJL
9e78f85475 added instructions 2020-09-17 22:14:14 +02:00
JanLJL
64da89ec3d enhancecd ARM identifier to support immediate offsets 2020-09-17 22:12:12 +02:00
JanLJL
adeae88665 instr update 2020-09-17 21:21:15 +02:00
JanLJL
1698ed1776 gather enhancement 2020-09-03 13:48:00 +02:00
JanLJL
2ef6051e64 added gather load instruction 2020-09-03 09:30:19 +02:00
Julian Hammer
3308f5d68f version bump 2020-08-05 10:59:10 +02:00
Julian Hammer
bd61b94669 ignoring b.none branched in basic block detection 2020-08-03 19:23:33 +02:00
JanLJL
0db8b6bcbf fixed first character match for symbolic identifiers 2020-08-03 18:30:29 +02:00
Jan
40755b2080 Merge pull request #49 from RRZE-HPC/coherent_label_parsing
Coherent label parsing
2020-08-03 18:25:20 +02:00
JanLJL
269148c2a1 save b/f in numeric identifier as suffix tag 2020-08-03 18:08:29 +02:00
JanLJL
12a8506530 removed unnecessary code 2020-08-03 17:14:58 +02:00
JanLJL
e715badcf9 detects numeric label as label 2020-08-03 16:59:48 +02:00
Julian Hammer
d6b4355a77 labels may now start with numbers 2020-08-03 15:53:29 +02:00
JanLJL
5361b63b52 version bump 2020-08-03 09:38:50 +02:00
JanLJL
cc39342047 minor enhancement for mask parsing 2020-08-03 09:07:45 +02:00
JanLJL
addcdeda85 added sve instructions 2020-08-03 08:55:37 +02:00
JanLJL
23d36a651b enhancements for SVE support 2020-08-03 08:54:59 +02:00
JanLJL
b052ab4151 bugfix in OoO scheduling 2020-07-28 14:52:30 +02:00
JanLJL
673da99fba minor enhancements for scheduling 2020-07-23 15:55:56 +02:00
JanLJL
6c72281d65 prepared for aarch64 8.2 support 2020-07-23 15:54:54 +02:00
JanLJL
5520362e65 adjustments and bugfixes 2020-07-13 18:53:19 +02:00
JanLJL
93060eee43 Merge branch 'master' into A64FX 2020-07-13 14:41:49 +02:00
JanLJL
0e77b7bc9a enhanced TP scheduling 2020-07-06 18:49:46 +02:00
JanLJL
ce8c3ff9ab bugfixes for A64FX 2020-07-06 18:48:54 +02:00
Jan
acbde7a19c Merge pull request #48 from RRZE-HPC/n1
initial implementation of Neoverse N1 support
2020-07-02 09:32:54 +02:00
Cloud User
34e978d2ae initial implementation of Neoverse N1 support 2020-06-30 20:28:57 +00:00
JanLJL
6294e2e9da initial commit for trying to support a64fx 2020-06-26 05:20:40 +02:00
JanLJL
6801229275 PEP8 adjustments 2020-06-25 21:56:18 +02:00
JanLJL
d3d1a89600 two new instrs 2020-06-25 21:55:10 +02:00
JanLJL
93c1951097 prettified aarch64 ISA DB 2020-06-25 21:54:52 +02:00
JanLJL
7211dd0799 improvements for uops.info importer script 2020-06-25 21:53:41 +02:00
JanLJL
5258d65c8e few more instructions 2020-06-24 17:41:30 +02:00
JanLJL
379fe80169 added initial support for Intel Ice Lake (ICL) 2020-06-22 22:15:14 +02:00
JanLJL
94d7d35c0b more instructions 2020-05-04 18:50:58 +02:00
JanLJL
1009c60d2d fixed wrong output format for 3-digit TP numbers 2020-04-08 21:28:50 +02:00
JanLJL
229b316b6d added some instructions 2020-04-08 15:54:31 +02:00
JanLJL
c0753be899 added python 3.7/3.8 to tests 2020-04-02 09:20:08 +02:00
JanLJL
eaa56792ab added bs4 dependency for Travis 2020-04-02 09:08:08 +02:00
JanLJL
3425fa3024 added tests 2020-04-02 08:57:26 +02:00
JanLJL
38924b6ec1 more instructions 2020-03-30 18:27:33 +02:00
JanLJL
d6ae457de4 removed duplicates in CSX DB 2020-03-30 18:18:35 +02:00
JanLJL
a5c2ab1a4a bugfix for online check of operands 2020-03-26 11:46:46 +01:00
JanLJL
e4393189dc minor update 2020-03-26 11:06:11 +01:00
JanLJL
3016fc7c46 added more tests 2020-03-26 10:19:14 +01:00
JanLJL
82f47d217c Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-03-26 10:03:23 +01:00
JanLJL
1754df42d2 enhanced x86 parser for directives 2020-03-26 10:02:39 +01:00
Julian Hammer
ac1295aac2 flag string in output now in line with required flags 2020-03-24 16:02:40 +01:00
Julian Hammer
9624e6c109 closing cache file after dump 2020-03-24 15:20:49 +01:00
Julian Hammer
2d16037c44 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-03-21 17:18:37 +01:00
Julian Hammer
c5801cfe2f closing cache file 2020-03-21 17:18:04 +01:00
Julian Hammer
3e960dd4ac closing cache file 2020-03-20 15:02:30 +01:00
JanLJL
680774267d fixed wrong import of mm registers 2020-03-17 12:56:12 +01:00
JanLJL
1aa710f195 enhanced MachineModel to support mask/zeroing differentiation for instruction forms 2020-03-17 12:55:37 +01:00
JanLJL
71206897fd version bump 2020-03-16 22:02:50 +01:00
JanLJL
af247e64b6 added test to check all micro-archs 2020-03-16 21:31:14 +01:00
JanLJL
2973f543b7 fixed duplicates in SNB machine model 2020-03-16 17:28:08 +01:00
JanLJL
0b7f1ed6e7 implemented online check for src/dst of operands based on felixcloutier 2020-03-12 15:08:14 +01:00
JanLJL
17e7f0e0d8 more instruction forms and added wildcard support for registers in ISA DB 2020-03-12 15:07:51 +01:00
JanLJL
c30ad4fb33 bugfixes 2020-03-12 15:06:02 +01:00
JanLJL
666512d54d added reg-only fallback for mem-instructions not found in ISA DB 2020-03-10 17:15:57 +01:00
JanLJL
381e9e9f76 imported reg-only instructions with port util from uops.info for ZEN2 2020-03-10 14:32:31 +01:00
JanLJL
8f63621d6d with working rst file for PypPI 2020-03-10 10:10:29 +01:00
JanLJL
e41d05868a removed raw directive (now for real) 2020-03-09 17:56:18 +01:00
JanLJL
8013a40e52 removed raw directive for PyPI 2020-03-09 17:33:39 +01:00
JanLJL
3db330de66 version bump 2020-03-09 16:52:07 +01:00
JanLJL
4e73e24b99 added documentation 2020-03-09 16:35:06 +01:00
JanLJL
dcd5b8fd61 more documentation 2020-03-05 18:39:38 +01:00
JanLJL
3fb053fa79 updated docs 2020-03-05 17:13:23 +01:00
JanLJL
cfd16aa079 added doc badges 2020-03-05 16:56:27 +01:00
JanLJL
1bf9a858ad Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-03-05 16:44:29 +01:00
JanLJL
5fc660484c added Sphinx documentation for RTD 2020-03-05 16:44:18 +01:00
JanLJL
c194f57f09 renamed doc to docs and more documentation 2020-03-05 15:28:30 +01:00
Julian Hammer
40a35ce067 outlined osaca logo font 2020-03-03 17:30:00 +01:00
JanLJL
4e58552c03 initial upload of zen2 data 2020-03-03 14:15:34 +01:00
JanLJL
280f9c5790 added thesis in docs and updated logo 2020-03-03 12:52:59 +01:00
JanLJL
d861d66206 minor fixes 2020-02-27 18:32:48 +01:00
JanLJL
3348afe219 minor fixes 2020-02-27 18:11:32 +01:00
Jan
f8ae6599c5 Refactoring of README and more description 2020-02-27 18:10:09 +01:00
JanLJL
ffb016af45 reduced testing runtime 2020-02-27 16:23:56 +01:00
JanLJL
51586cdaa1 bugfix for kerncraft dependency 2020-02-27 16:16:07 +01:00
JanLJL
c9000f74bc enabled kerncraft marker insertion for aarch64 and more tests 2020-02-27 16:00:23 +01:00
JanLJL
b06570ed45 added missing example coverage 2020-02-27 13:56:12 +01:00
JanLJL
b4682d16fb added the ability of different output stream for running osaca and more tests 2020-02-27 13:30:58 +01:00
JanLJL
6c08a98418 bugfix for running graph export in test env 2020-02-26 20:02:29 +01:00
JanLJL
2d30d190f4 running examples for tests 2020-02-26 18:40:08 +01:00
JanLJL
8cce680bd7 more tests 2020-02-26 17:32:13 +01:00
JanLJL
9a60aa2c28 minor fixes and removed unnecessary load_tps 2020-02-26 15:40:52 +01:00
JanLJL
03b4cd1686 added test for optimal throughput assignment and invalid asmbench import files 2020-02-20 16:34:11 +01:00
JanLJL
5bdc61aa09 bugfix 2020-02-20 12:25:50 +01:00
JanLJL
04db2bfa79 added tests for asmbench import 2020-02-20 12:07:20 +01:00
JanLJL
5a0365ab35 more tests 2020-02-20 09:04:39 +01:00
JanLJL
4cdee8b621 updated codecov settings 2020-02-12 19:19:35 +01:00
Jan
248829141f Create codecov.yml 2020-02-12 19:16:27 +01:00
Julian Hammer
131646b01a added instructions (with 2D and 3D ports) from uops.info 2020-02-05 10:07:44 +01:00
Julian Hammer
3cf40d9cd0 adding 2D and 3D ports on combined load instructions 2020-02-05 10:07:23 +01:00
Julian Hammer
0adde7b9fc added ice lake abbreviation 2020-02-05 10:05:57 +01:00
Jan
9888ef2da4 Added explanation for kernels 2020-02-03 13:39:12 +01:00
JanLJL
cadedeba7b added example kernels 2020-02-03 13:19:18 +01:00
JanLJL
f5489621fa small bugfix in tx2 and version update in meta data 2020-02-01 15:11:47 +01:00
JanLJL
77aa7f8fe0 version bump 2020-01-31 19:17:44 +01:00
Julian Hammer
760e3a9846 small change to .travis.yml 2020-01-31 17:16:42 +01:00
Julian Hammer
d0436838de updated travis file 2020-01-31 17:15:42 +01:00
Julian Hammer
c204b19caa Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-01-31 15:58:14 +01:00
Julian Hammer
731f1f9636 compatability with kerncraft v0.8.3.dev1 2020-01-31 15:57:46 +01:00
JanLJL
edae1720dc substitute all non-parseable chars 2020-01-30 23:13:00 +01:00
JanLJL
79afcba61d removed possible unparseable characters from DBs 2020-01-30 22:33:07 +01:00
JanLJL
559c95a34a bugfix for missing warning 2020-01-29 23:13:59 +01:00
JanLJL
9c7907ee21 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-29 13:04:11 +01:00
JanLJL
3243455ec5 bugfixes and new instructions 2020-01-29 13:04:03 +01:00
JanLJL
5574a93a5e made detection of flag dependencies as opt_in for now 2020-01-29 13:03:43 +01:00
Julian Hammer
24583de74e version bump 2020-01-28 17:29:16 +01:00
Julian Hammer
530ad8484e frontend returns strings; added helper function to calc. unmatched ratio 2020-01-28 17:24:00 +01:00
JanLJL
421cf55af7 minor enhancements and bugfixes 2020-01-27 16:37:28 +01:00
JanLJL
2fc1f3a186 added new instructions and fixed false positive assignment of stores by dynamic TP/LT combination for aarch64 2020-01-22 21:40:11 +01:00
JanLJL
092403c529 version bump 2020-01-22 15:22:50 +01:00
JanLJL
2d82c32f02 bugfix in requirements 2020-01-22 15:19:10 +01:00
JanLJL
53135a03da removed dependencies not needed for basic analysis from install requiries list 2020-01-22 15:13:15 +01:00
JanLJL
02233f627e added file caching for DBs 2020-01-22 15:07:46 +01:00
JanLJL
662ad829ec added comment line marker support and adjusted tests 2020-01-22 15:06:56 +01:00
JanLJL
cb34733abe added zero masking support 2020-01-22 15:05:25 +01:00
JanLJL
aa92234e5d undid DB change in commits daa566329c and 4eea686e8b due to wrong data 2020-01-22 14:35:49 +01:00
JanLJL
1fd2453a50 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-17 16:56:01 +01:00
Julian Hammer
4eea686e8b added movabs 2020-01-17 16:47:55 +01:00
Julian Hammer
daa566329c some more instructions (esp. AT&T naming) 2020-01-17 16:30:00 +01:00
JanLJL
b202bdfdb0 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-17 15:19:12 +01:00
JanLJL
534eda8015 added ldur and stur 2020-01-17 15:16:00 +01:00
Julian Hammer
b2bb2cd003 small changes 2020-01-17 15:04:08 +01:00
Julian Hammer
97dbefdb6f reimported models with new importer 2020-01-17 15:03:31 +01:00
Julian Hammer
789406c863 updated movs 2020-01-17 12:39:19 +01:00
Julian Hammer
5341a2e94d many instructions 2020-01-17 12:31:59 +01:00
JanLJL
c2d8742ac0 allows aliasing in uarch DB via list-name 2020-01-17 12:21:46 +01:00
JanLJL
5b1c984552 adjusted test due to hidden operand dependencies 2020-01-17 08:13:15 +01:00
JanLJL
6d6d3b7ccb minor functionality enhancements 2020-01-17 06:55:05 +01:00
JanLJL
3656b222ca minor change for suspicious instructions 2020-01-17 06:54:31 +01:00
Julian Hammer
60b6b603b7 added a few instructions and hidden_operands 2020-01-16 16:49:02 +01:00
Julian Hammer
70c66dbd0f fixed typo in yml string template 2020-01-16 15:31:35 +01:00
Julian Hammer
d85daa9ecc added generated mov information to all intel uarchs 2020-01-16 15:19:46 +01:00
Julian Hammer
3f55ae2368 also added pmovs and updated some port numbers 2020-01-16 15:10:41 +01:00
Julian Hammer
7e4fcf5399 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-01-16 13:28:37 +01:00
Julian Hammer
571d090344 added mov generation script 2020-01-16 13:28:16 +01:00
Jan
55652f84e0 fixed broken build pipeline badge 2020-01-16 07:52:35 +01:00
Julian Hammer
6e25da6c08 considering port7 with simple v?mov[dq] 2020-01-15 14:33:45 +01:00
Julian Hammer
0269fc7085 traveled through the hell of v?mov[dq] 2020-01-15 14:03:32 +01:00
JanLJL
76469f7898 supports hidden operands now (for flags or special instructions) 2020-01-14 20:54:00 +01:00
JanLJL
3ff8a695b6 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-14 18:25:08 +01:00
JanLJL
cb100d118f small bugfix for mm registers 2020-01-14 18:24:00 +01:00
Julian Hammer
0c22634601 added v?movhp[sd] and many cmp instructions 2020-01-14 15:52:28 +01:00
Julian Hammer
54ae9f4d26 removed port 7 store agu from ivb and snb 2020-01-14 13:28:54 +01:00
Julian Hammer
e8fab533db late conflict merger 2020-01-14 13:20:44 +01:00
Julian Hammer
0e05bd66d8 added conditional moves 2020-01-14 13:11:48 +01:00
Julian Hammer
e635b2b015 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-01-14 13:10:52 +01:00
Julian Hammer
383b720cc5 a few more instructions for csx 2020-01-14 10:51:20 +01:00
JanLJL
b6572720af enhanced for dynamic ST throughput combination 2020-01-14 10:49:47 +01:00
JanLJL
354ab8e148 aligned expected LT with new store latency 2020-01-14 09:31:27 +01:00
JanLJL
4f4a53c3be Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-14 09:29:44 +01:00
JanLJL
2a50207045 added ST throughput values 2020-01-14 09:29:40 +01:00
Julian Hammer
03f544638e merged conflict 2020-01-14 09:19:32 +01:00
JanLJL
bfe45f09bc fixed invalid wildcards 2020-01-14 09:04:09 +01:00
Julian Hammer
8e30cd583a added lea instructions 2020-01-13 13:27:28 +01:00
Jan
e86803df02 bugfix 2020-01-13 11:20:32 +01:00
Julian Hammer
184751cf9e Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-01-10 17:16:07 +01:00
Julian Hammer
d99522583e lots of new instructions :) 2020-01-10 17:15:55 +01:00
JanLJL
cafe4c5bf8 adjusted for mem wildcards in AArch64 ISA DB 2020-01-10 14:38:17 +01:00
JanLJL
623c4ea113 added wildcard mode for mem addressing in ISA DB 2020-01-10 12:55:44 +01:00
JanLJL
3ca2586bac added --ignore-unknown flag and major updates in x86 parser 2020-01-09 17:57:08 +01:00
JanLJL
36d6a82da5 added more instruction forms 2020-01-09 15:18:38 +01:00
JanLJL
b1444cf352 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-01-09 13:21:45 +01:00
JanLJL
4d6d8d9379 check for non-GAS-suffix mnemonics for instruction forms with MEM ops 2020-01-09 13:21:11 +01:00
Julian Hammer
1687ba8be9 version bump 2020-01-08 11:54:30 +01:00
JanLJL
59402a0837 new entry 2020-01-07 19:49:53 +01:00
JanLJL
f5b6611474 tiny bugfix for src_dst operands 2020-01-07 19:42:13 +01:00
JanLJL
262fa4b288 tiny update in port model 2019-12-20 16:23:18 +01:00
JanLJL
0fdbb7f52c bugfix 2019-12-19 18:54:47 +01:00
JanLJL
bad230fa7b enhanced dynamic combine of LD and arithmetic instr 2019-12-19 18:50:48 +01:00
Julian Hammer
dc02192d04 fixed matching of section type descriptors 2019-12-19 12:02:14 +01:00
JanLJL
c23e52cdf6 Merge branch 'master' of github.com:RRZE-HPC/osaca 2019-12-18 16:59:10 +01:00
JanLJL
b2b4aba0f3 added default load tp in new HW model 2019-12-18 16:58:34 +01:00
JanLJL
bbb004a2aa added default load TP and relocation in identifier 2019-12-18 16:56:20 +01:00
Julian Hammer
4628d52210 broadened scope of assembler identifiers 2019-12-17 14:53:59 +01:00
Julian Hammer
99781b4171 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2019-12-17 12:21:10 +01:00
Julian Hammer
04a1433f02 fixed argument check for architecture 2019-12-17 12:06:31 +01:00
JanLJL
d88617109f changed from dict DB back to list DB for now 2019-12-16 18:25:27 +01:00
Julian Hammer
cbed2c46f4 allowing for comment in marker; dev version bump 2019-12-13 17:38:35 +01:00
JanLJL
9d069c39d9 performance enhancement by removing unnecessary DB parsings 2019-12-02 15:39:59 +01:00
JanLJL
c9d3a90cd0 bugfix 2019-12-02 15:38:00 +01:00
JanLJL
9ea2c5f46d version bump 2019-11-18 23:00:07 +01:00
Julian Hammer
f18a48653f FIX #46 untangled semantic and non-semantic operand info 2019-11-14 16:43:33 +01:00
Julian Hammer
ff68f03aed removed automatic copying to ~/.osaca 2019-11-13 19:23:04 +01:00
Julian Hammer
a16fee9fb1 fixed version strings 2019-11-13 19:21:36 +01:00
Julian Hammer
2fcfc01542 updated to travis-ci.org 2019-11-13 16:43:04 +01:00
Julian Hammer
fe9cd6c0c9 export for and unittests for more semantics functions 2019-11-13 14:16:27 +01:00
Julian Hammer
e7838cac54 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2019-11-13 12:52:34 +01:00
Julian Hammer
4dc4323e2e fixed semantics testcase to join lists 2019-11-13 12:39:16 +01:00
Julian Hammer
6f5b8adadd fixed loop and basic block finder 2019-11-13 10:36:45 +01:00
JanLJL
744e1d83cc more docu 2019-11-08 12:01:09 +01:00
JanLJL
47e39f1f77 added documentation 2019-11-08 09:55:20 +01:00
JanLJL
6d814d416b added docstrings 2019-11-06 17:37:35 +01:00
JanLJL
f8ed85b7c9 removed port checking for old structure and minor bugfix 2019-11-06 12:50:00 +01:00
JanLJL
db6e40ee88 removed "uopsinfo" as possible import flag since model_importer creates full DB file 2019-11-06 12:35:50 +01:00
JanLJL
8359aa4807 added new instructions/uarchs 2019-11-06 12:30:20 +01:00
JanLJL
697c5b5f4b minor updates 2019-11-06 10:54:31 +01:00
JanLJL
f3f91536b5 added missing data ports in DB and nicer CP view for frontend 2019-10-31 19:40:20 +01:00
JanLJL
687693d2a5 improved port model scheme 2019-10-31 18:47:50 +01:00
JanLJL
eb55693871 added IVB DB 2019-10-31 18:19:23 +01:00
JanLJL
abf4fc391f Merge branch 'master' of github.com:RRZE-HPC/osaca 2019-10-30 18:16:26 +01:00
JanLJL
4da262a902 better GAS suffix handling and BDW DB 2019-10-30 18:16:14 +01:00
Julian Hammer
a91413c270 added list processing function 2019-10-30 09:31:01 +01:00
Jan
6c56a77967 Update README.rst 2019-10-29 17:26:36 +01:00
JanLJL
224e24d5e9 updates 2019-10-29 17:23:33 +01:00
JanLJL
0b78d290ec enhanced by optimal throughput analysis 2019-10-29 16:52:34 +01:00
JanLJL
a839af76c5 fixed bug in load port utilization 2019-10-29 16:51:58 +01:00
JanLJL
15da6044dd bugfix in imports 2019-10-29 09:15:54 +01:00
JanLJL
0f5d3a0370 separated SemanticsAppender into ISA and Arch semantics 2019-10-29 09:09:52 +01:00
Julian Hammer
1c8067545d collapsed x86 and arm functions into generic function 2019-10-25 16:09:02 +02:00
Julian Hammer
484d6da85e made tests a module 2019-10-25 14:49:21 +02:00
Julian Hammer
0ecc656055 added marker definition to marker_utils 2019-10-25 13:16:24 +02:00
Jan
af0c8fc953 Merge pull request #45 from RRZE-HPC/v3
ibench & asmbench interfaces
2019-10-24 14:45:12 +02:00
JanLJL
5fe983f4ef added asmbench import 2019-10-24 14:42:21 +02:00
JanLJL
8b4acf0508 fixed last problems with ibench import 2019-10-24 12:38:26 +02:00
Julian Hammer
0c63d4f1cd added functions for extraction of basic blocks and loop bodies 2019-10-23 14:45:12 +02:00
Julian Hammer
b1e4cb90a7 better formatting of load_throughput dump 2019-10-22 14:43:43 +02:00
JanLJL
159a1fa343 added combined view for TP/CP/LDC 2019-10-16 19:00:24 +02:00
JanLJL
db862441b0 Resolved merge conflicts 2019-10-16 10:59:03 +02:00
Jan
de5479a06c Merge pull request #43 from RRZE-HPC/non_average_port_model
Non average port model
2019-10-16 10:39:02 +02:00
JanLJL
d92523e133 changed DBs to new port_pressure structure 2019-10-16 10:06:47 +02:00
Julian Hammer
cb7cec20a8 working importer, better dumper 2019-10-15 12:22:49 +02:00
Julian Hammer
1c673382b4 work in progress 2019-10-14 17:08:40 +02:00
Julian Hammer
85dd53dc4e versin bump 0.3.1.dev1 2019-10-11 17:50:05 +02:00
Julian Hammer
792bbb1166 removed some unnecessary file checks and fixed up test cases 2019-10-11 16:13:58 +02:00
Julian Hammer
f08fbf79ba renamed semanticsAppender module, merged find_file functions 2019-10-11 15:34:24 +02:00
Julian Hammer
e7ea5451f1 fixed KerncraftAPI 2019-10-11 13:35:04 +02:00
Julian Hammer
0ed113116c returning output as string for kerncraft 2019-10-11 13:28:20 +02:00
Julian Hammer
65100bd9da made KerncraftAPI stateful 2019-10-11 13:21:33 +02:00
Julian Hammer
2995287377 directives may also have register parameters 2019-10-11 13:09:12 +02:00
Julian Hammer
209ce2a5ef better error message on parsing errors 2019-10-11 12:36:13 +02:00
Jan
3a6fa0475f bugfix 2019-10-11 09:50:14 +02:00
Jan
e2abac3e0c fixed wrond LCD test 2019-10-11 09:47:06 +02:00
JanLJL
cd81271b54 fixed testing 2019-10-10 17:59:11 +02:00
JanLJL
361a4fd8c2 moved db_interface and integrated ibench in it 2019-10-10 17:00:27 +02:00
JanLJL
c8c8c13ed1 more instructions 2019-10-10 16:36:07 +02:00
Jan
a3ba078e76 Update README.rst 2019-10-10 09:47:03 +02:00
JanLJL
438a4b3d6b small updates 2019-10-10 09:37:52 +02:00
Julian Hammer
8c2dfb27e5 enabled pypi tag deployment from travis 2019-10-09 13:36:03 +02:00
Julian
14abed8f85 Merge pull request #39 from aaronjohnson/patch-1
Update README.rst
2019-10-07 09:21:26 +02:00
JanLJL
39cfd4dcda ibench benchmark file creation 2019-10-04 12:21:19 +02:00
Jan
97025b259a Add files via upload 2019-10-04 10:55:24 +02:00
JanLJL
8f743f2b88 fixed rendering issues in README for PyPi 2019-10-04 02:15:26 +02:00
Jan
cd5f131f35 Fixed external link 2019-10-04 01:47:22 +02:00
Jan
f896440bbe Update README.rst 2019-10-04 01:42:29 +02:00
Aaron Johnson
f6d12cae2a Update README.rst
minor spelling fix for thorughput -> throughput
2019-10-03 15:57:28 -07:00
Jan Laukemann
3e0c57f1c7 fixes #36 database copy bug 2019-10-04 00:40:23 +02:00
JanLJL
41e3ee57e6 register helper functions added 2019-10-03 23:10:50 +02:00
JanLJL
586fee5306 version bump and dependency in setup 2019-09-27 18:25:25 +02:00
JanLJL
efb9ba166d enhanced directive parser for ARM 2019-09-27 17:47:07 +02:00
JanLJL
22bfcd8020 added visual graph export, YMM LD support for ZEN and support for non-dyn loads in DB 2019-09-27 17:15:04 +02:00
JanLJL
625d814dce new dynamic tp and lt values for LD instructions 2019-09-26 21:39:56 +02:00
JanLJL
19dbd90849 runnable pmbs version 2019-09-25 13:01:30 +02:00
JanLJL
042c034838 implemented new CP calculation for x86 2019-09-24 19:00:12 +02:00
JanLJL
7bc3f016cd DB updates 2019-09-18 00:10:40 +02:00
JanLJL
7c9ef5f589 fix in kernel marker 2019-09-18 00:10:25 +02:00
JanLJL
f278180402 nicer perspective of loop-carried deps 2019-09-18 00:09:26 +02:00
JanLJL
fb834e5533 changed zen port model from combined LD/ST to separate ones 2019-09-06 16:12:37 +02:00
JanLJL
36a662267d one more test 2019-08-30 16:40:40 +02:00
JanLJL
71dc7f5431 more tests 2019-08-30 16:36:50 +02:00
JanLJL
ab93311ceb added missing kerncraft test file to all_tests 2019-08-30 16:20:22 +02:00
JanLJL
533f78f9cf added tests for Kerncraft API 2019-08-30 16:19:18 +02:00
JanLJL
39004fce1a added base parser to all_tests 2019-08-30 14:22:53 +02:00
JanLJL
e737abd58f bugifx 2019-08-30 13:53:45 +02:00
JanLJL
5bb6b94803 bugifx 2019-08-30 13:50:18 +02:00
JanLJL
1948c738d1 more tests 2019-08-30 13:09:02 +02:00
JanLJL
0173481bec more tests for frontend 2019-08-30 12:28:04 +02:00
JanLJL
eeb55e8cf7 added tests for base parser 2019-08-30 12:10:15 +02:00
JanLJL
edd772380e added tests for hidden load 2019-08-30 10:56:18 +02:00
JanLJL
ae5845b944 added tests for loop-carried deps and changed data structure of them to dict 2019-08-30 10:11:51 +02:00
JanLJL
9188e8e31e added float immediate 2019-08-29 19:00:25 +02:00
JanLJL
fc06b968d8 more instructions 2019-08-29 18:54:28 +02:00
JanLJL
ff672fb5ec more instructions 2019-08-29 17:52:27 +02:00
JanLJL
f2eff01529 more tests and bugfixes 2019-08-29 16:36:14 +02:00
JanLJL
7dc14fbf39 added CLI 2019-08-29 14:23:37 +02:00
JanLJL
80e741d411 enhanced frontend and added hidable load port 2019-08-29 14:03:16 +02:00
JanLJL
7855166624 fixed loop-carried dependency check and minor fixes in DBs 2019-08-28 13:07:24 +02:00
JanLJL
df8a81bf4d initial version of loop-carried dependencies checker 2019-08-23 19:01:20 +02:00
JanLJL
de2ba87d6b more interfaces 2019-08-20 18:50:57 +02:00
JanLJL
e468db4a0d refactoring from AArch64 to aarch64 2019-08-13 18:26:48 +02:00
JanLJL
866397d6ae finished DB sanity check and cleaned DBs 2019-08-13 18:13:41 +02:00
JanLJL
f99265070f copy data during set up 2019-08-09 12:00:17 +02:00
JanLJL
01e87b7727 more tests for codecov 2019-08-09 11:48:49 +02:00
JanLJL
b09129eeb7 more tests for Arm 2019-08-09 11:01:08 +02:00
Jan
de9350a158 Update README.rst 2019-08-09 10:09:56 +02:00
JanLJL
9da84cdfb2 build badge fix 2019-08-09 10:00:24 +02:00
JanLJL
7183b767f2 added codecov badge 2019-08-09 09:52:09 +02:00
JanLJL
050a51fc28 bugfixed codecov 2019-08-09 09:40:53 +02:00
JanLJL
2493913fd5 integrated codecov 2019-08-09 09:34:53 +02:00
JanLJL
6ab336f0ab new entries and initial upload of entry creation interface 2019-08-06 18:28:35 +02:00
JanLJL
8c2f744acf removed old examples 2019-08-06 18:27:53 +02:00
JanLJL
184ed73190 renaming CSL to CSX 2019-07-26 08:31:06 +02:00
JanLJL
148977d417 changed tests for different ARM reg dependencies 2019-07-26 07:41:35 +02:00
JanLJL
ec666b7c79 bugfixes, nicer frontend and first AMD Zen data added 2019-07-25 23:17:10 +02:00
JanLJL
903738161e bugfix in reg dependencies 2019-07-25 10:29:25 +02:00
JanLJL
b031b887a6 bugfix 2019-07-24 11:53:02 +02:00
JanLJL
832fa4e241 moved frontend tests to test_frontend.py 2019-07-24 11:06:28 +02:00
JanLJL
abf8b674be more data 2019-07-24 10:58:13 +02:00
JanLJL
cb75bf52ab first basic analysis version 2019-07-24 10:57:19 +02:00
JanLJL
e69baaba41 bugfix 2019-07-10 18:31:25 +02:00
JanLJL
a866500610 added DiGraph creation and more tests 2019-07-10 18:25:32 +02:00
JanLJL
311535476a bugfix for test 2019-07-08 16:25:07 +02:00
JanLJL
75393f106c finished refactoring for semanticsAppender, including first tests 2019-07-08 15:47:08 +02:00
JanLJL
03dff0013f initial upload - not functional yet 2019-07-05 17:11:18 +02:00
JanLJL
593dd63897 refactoring as preparations for explicit semanticsAppender 2019-07-05 15:34:00 +02:00
JanLJL
e5fdb7a9ac removed unused DBs 2019-07-05 15:30:30 +02:00
JanLJL
2e23820a55 changed reg scale value to int and added prefix x for sp 2019-07-03 12:55:42 +02:00
JanLJL
2fffb07feb updated dependencies 2019-06-24 17:32:38 +02:00
JanLJL
e923c67bdb bugfixed x86 parser and tests for dep finder 2019-06-24 17:28:45 +02:00
JanLJL
75a405e33e added dependency check for registers and tests 2019-06-12 18:57:53 +02:00
JanLJL
8b377f4db1 AttrDict bugfix 2019-06-05 15:00:55 +02:00
JanLJL
119dc5baa9 refactoring to modulize marker_utils 2019-06-05 13:51:41 +02:00
JanLJL
b81a4c68d3 more tests 2019-06-04 14:15:56 +02:00
JanLJL
ee15842c97 added tests 2019-06-04 13:13:34 +02:00
JanLJL
d553998b90 added AttrDict to parser and refactoring 2019-06-04 12:55:32 +02:00
JanLJL
6c212d130c added tests for analyzer 2019-06-04 10:07:44 +02:00
JanLJL
0944633958 added marker detection 2019-06-03 20:51:46 +02:00
JanLJL
b469f5a0c4 added tests for pre/post-indexed addr and prefetching 2019-05-28 18:22:28 +02:00
Julian Hornich
646490ac2a Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2019-05-28 12:47:06 +02:00
Julian Hornich
8e13432318 Added AMD Zen instructions 2019-05-28 12:46:49 +02:00
JanLJL
f7ce5ac63c version bump 2019-05-28 08:59:18 +02:00
JanLJL
01a6b15bb1 bugfix for testing 2019-05-28 08:59:04 +02:00
JanLJL
0925af21a0 finished ARM parser and added tests 2019-05-24 15:10:02 +02:00
JanLJL
b683bf7ce3 added -P flag in outputs 2019-05-22 09:36:20 +02:00
JanLJL
64a7cb8196 bugfix for importing ibench values 2019-05-16 18:45:51 +02:00
JanLJL
d14ccee0b4 version bump 2019-05-16 16:42:57 +02:00
JanLJL
171b57b381 supports directives in the kernel 2019-05-16 16:34:53 +02:00
JanLJL
d6042b4006 parser now understands and ignores asm directives 2019-05-16 15:51:28 +02:00
JanLJL
f9e6583959 more tests for parser 2019-05-03 14:45:21 +02:00
JanLJL
c1cf539c45 enabled instruction forms without any operands 2019-05-03 10:44:54 +02:00
JanLJL
8bd7be32e2 changed comma handling for operands 2019-05-02 19:04:01 +02:00
JanLJL
1f52157e9c fixed tests 2019-05-02 18:52:16 +02:00
JanLJL
daa874b396 bugfixes 2019-04-30 18:57:18 +02:00
JanLJL
a84150f71b fixed wrong import 2019-04-30 18:49:38 +02:00
JanLJL
c70e57ab1a fixed for tests 2019-04-30 18:46:45 +02:00
JanLJL
f189d6aca5 fixed wrong testname 2019-04-30 18:40:38 +02:00
JanLJL
38bbf2712b added tests and functionalities for x86_att parser 2019-04-30 18:37:07 +02:00
JanLJL
bc9b380429 removed broken landscape badge 2019-04-30 14:09:33 +02:00
JanLJL
2d32b3a92a initial parser structure added 2019-04-30 14:06:24 +02:00
JanLJL
9e3aaa7336 update 2019-04-30 10:28:56 +02:00
JanLJL
bfd966e824 removed 3.7 builds since travis does not support it yet 2019-04-30 10:16:56 +02:00
JanLJL
2fa92bc0b0 prepared repo for version 3 2019-04-30 09:59:30 +02:00
JanLJL
07cbd63f47 Closes #28 2019-04-30 09:44:22 +02:00
Jan
bd4a5622b2 Merge pull request #27 from RRZE-HPC/imported_intel_models
Imported intel models
2019-04-30 08:28:06 +02:00
Julian Hammer
01b23e1b47 Merge branch 'master' into imported_intel_models 2019-01-30 17:36:12 +01:00
Julian Hammer
aea6f8f043 added warning if data dirs differ. fixing #21 2019-01-30 17:35:27 +01:00
Julian Hammer
7d3857b023 (non realease) version bump to match up with kerncraft 2019-01-30 15:54:23 +01:00
Julian Hammer
179c78c9ec Merge branch 'imported_intel_models' of github.com:RRZE-HPC/OSACA into imported_intel_models 2019-01-29 13:53:06 +01:00
Julian Hammer
c0413de556 api to get ratio of unmatched instructions 2019-01-29 13:52:12 +01:00
Julian Hammer
3b027e2453 added CFL and KBL 2019-01-29 13:52:12 +01:00
Julian Hammer
5cd80f4e82 updated importer and evaluation functions 2019-01-29 13:52:12 +01:00
Julian Hammer
29e4974662 pep8 2019-01-29 13:52:12 +01:00
Julian Hammer
7185174885 making progress on covering ambigious instruction forms 2019-01-29 13:52:12 +01:00
Julian Hammer
d3d46bfff5 Masking instruction duplication mentioned in #24 2019-01-29 13:52:12 +01:00
Julian Hammer
fbc3437cce added missing test file 2019-01-29 13:52:12 +01:00
Julian Hammer
db1cdf5474 [WIP] added test case and modified port counts to match imported model 2019-01-29 13:52:12 +01:00
Julian Hammer
a460ca8d55 adde model importer and imported models from uops.info 2019-01-29 13:52:11 +01:00
Julian Hammer
595cfa02a0 fixed marker insertion 2019-01-29 13:29:03 +01:00
Julian Hammer
216fbfa7bc ignoring .loc 2019-01-29 13:28:52 +01:00
Julian Hammer
bac3104c0a api to get ratio of unmatched instructions 2019-01-18 14:19:39 +01:00
Julian Hammer
64de49d497 added CFL and KBL 2019-01-16 14:53:50 +01:00
Julian Hammer
4be8bcafc4 updated importer and evaluation functions 2019-01-16 14:53:11 +01:00
Julian Hammer
4eaa1eeb7d pep8 2019-01-15 13:48:09 +01:00
Julian Hammer
4038f07002 making progress on covering ambigious instruction forms 2019-01-11 18:26:24 +01:00
Julian Hammer
895a262d5d Masking instruction duplication mentioned in #24 2019-01-11 16:48:07 +01:00
Julian Hammer
88856eb573 added missing test file 2019-01-11 16:16:11 +01:00
Julian Hammer
880dc332c8 [WIP] added test case and modified port counts to match imported model 2019-01-11 14:36:48 +01:00
Julian Hammer
6650145543 adde model importer and imported models from uops.info 2019-01-11 14:36:27 +01:00
jlaukemann
961364cbda fixed buggy instruction form entries 2019-01-10 16:35:50 +01:00
Julian
62ab93a7cd Merge pull request #18 from RRZE-HPC/api_cleanup
API cleanup
2019-01-10 13:42:32 +01:00
219 changed files with 346522 additions and 4022 deletions

2
.git-blame-ignore-revs Normal file
View File

@@ -0,0 +1,2 @@
# Migrate code style to Black
6204c90934c0e62aed98862ae77368b20a64cbfb

33
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@@ -0,0 +1,33 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
|||
|---|---|
| OSACA version | v x.y.z |
| Used where | \[CLI / Compiler Explorer\]
Steps to reproduce the behavior:
- OSACA command
- input code snippet or Compiler Explorer short link
**OSACA output**
Please supply the output of the command within a code block
```
```
**Expected behavior**
A clear and concise description of what you expected to happen.
**Additional context**
Add any other context about the problem here.

View File

@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[REQUEST]"
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

View File

@@ -0,0 +1,23 @@
---
name: Hardware support request
about: Request support for a new hardware architecture
title: "[HW REQUEST]"
labels: new architecture
assignees: ''
---
**Why do you need support for this specific architecture?**
Please write a short note why you need this specific architecture support.
**Which architecture model, family and further information?**
Write a short note about the specific micro-architecture.
**Is the documentation of the port model publicly available?**
Please refer to already existing documentation about the port model.
**Is any documentation of the performance data of the instruction forms (throughput, latencies, port assignment, ...) publicly available?**
Please refer to already existing documentation/data bases containing information about the performance of instruction forms on this micro-arch.
**Are there already any usable tools (commercial or open-source)?**
Please refer to already existing tools with support for this hardware micro-architecture.

30
.github/workflows/lint.yml vendored Normal file
View File

@@ -0,0 +1,30 @@
name: Lint
on: push
jobs:
run-linters:
name: Run linters
runs-on: ubuntu-latest
steps:
- name: Check out Git repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"
- name: Install Python dependencies
run: python -m pip install black flake8
- name: Run linters
uses: wearerequired/lint-action@v2
with:
github_token: ${{ secrets.github_token }}
# Enable linters
black: true
black_args: "-l 99 --diff --color --extend-exclude .ipynb"
flake8: true
flake8_args: "--max-line-length=99 --extend-ignore=E203,E501"

42
.github/workflows/test-n-publish.yml vendored Normal file
View File

@@ -0,0 +1,42 @@
name: test-n-publish
on: [push, pull_request]
jobs:
test-n-publish:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
name: Set up Python ${{ matrix.python-version }}
with:
python-version: ${{ matrix.python-version }}
- name: Install
run: |
python -m pip install wheel
python -m pip install --upgrade pip
python -m pip install setuptools
python -m pip install codecov requests
python -m pip install bs4
sudo apt-get -y install graphviz libgraphviz-dev pkg-config
python -m pip install pygraphviz
#python -m pip install "kerncraft>=0.8.16"
python -m pip install git+https://github.com/RRZE-HPC/kerncraft.git@7caff4e2ecdbef595013041ba0131e37ed33c72c
python -m pip install -e .
- name: Test
run: |
coverage run -p tests/all_tests.py
- uses: codecov/codecov-action@v3
- name: Build package
run: |
python setup.py build sdist bdist_wheel
- name: Publish to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
skip_existing: true
user: __token__
password: ${{ secrets.pypi_password }}

6
.gitignore vendored
View File

@@ -1,5 +1,5 @@
# OSACA specific files and folders
osaca/taxCalc/
*.*.pickle
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -109,3 +109,7 @@ venv.bak/
# mypy
.mypy_cache/
# Visual Studio
.vs
x64/

View File

@@ -1,8 +1,35 @@
sudo: false
os: linux
language: python
python:
- "3.5"
- "3.6"
- "3.7"
install: pip install tox-travis
script: tox
- "3.8"
- "3.9"
before_install:
# - pip install tox-travis
- pip install codecov
- pip install bs4
- pip install pygraphviz
- pip install kerncraft
install:
- pip install -e .
cache: pip
script:
# - tox
- coverage run -p tests/all_tests.py
after_success:
- coverage combine
- codecov
deploy:
provider: pypi
username: "__token__"
password:
secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
distributions: "sdist bdist_wheel"
skip_existing: true
cleanup: false
on:
repo: RRZE-HPC/OSACA
branch: master
tags: true

View File

@@ -1,7 +1,9 @@
include README.rst
include LICENSE
include tox.ini
recursive-include osaca/data/ *.csv
recursive-include osaca/data/ *.yml
recursive-include osaca/data/ *.pickle
include osaca/data/_build_cache.py
include examples/*
recursive-include tests *.py *.out
recursive-include tests/testfiles/ *

View File

@@ -1,4 +1,4 @@
.. image:: doc/osaca-logo.png
.. image:: docs/img/osaca-logo.png
:alt: OSACA logo
:width: 80%
@@ -6,25 +6,38 @@ OSACA
=====
Open Source Architecture Code Analyzer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--------------------------------------
This tool allows automatic instruction fetching of assembly code,
auto-generating of testcases for assembly instructions creating latency
and throughput benchmarks on a specific instruction form and throughput
analysis and throughput prediction for a innermost loop kernel.
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?token=393L6z2HEXNiGLtZ43s6&branch=master
:target: https://travis-ci.com/RRZE-HPC/OSACA
.. image:: https://github.com/RRZE-HPC/OSACA/workflows/test-n-publish/badge.svg?branch=master&event=push
:target: https://github.com/RRZE-HPC/OSACA/actions
:alt: Build Status
.. image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
:target: https://landscape.io/github/RRZE-HPC/OSACA/master
:alt: Code Health
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
:alt: Code Coverage
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
.. image:: https://img.shields.io/badge/read-the_docs-blue
:target: https://osaca.readthedocs.io/
:alt: Docs
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/ambv/black
:alt: Code Style
Getting started
===============
OSACA is a Python module with a command line interface.
OSACA is also integrated into the `Compiler Explorer at godbolt.org <https://godbolt.org>`_, which allows using OSACA from a browser without any installation. To analyze an assembly snippet, go to https://godbolt.org, change language to "Analysis", insert an AArch64 or AT&T(!) x86 assembly code and make sure OSACA is selected in the corresponding analysis panel, e.g., https://godbolt.org/z/shK4f8. When analyzing a high-level language code, use the "Add tool..." menu in the compiler output panel to add OSACA analysis, e.g. https://godbolt.org/z/hbMoPn. To change the micro architecture model, add ``--arch`` and µarch shortname (e.g., ``SKX`` for Skylake, ``ZEN2``, ``N1`` for ARM Neoverse) to the "Compiler options..." (when using "Analysis" mode) or "Arguments" (when analyzing compiler output of a high-level code).
Installation
~~~~~~~~~~~~
------------
On most systems with python pip and setuptools installed, just run:
.. code:: bash
@@ -42,20 +55,28 @@ To build OSACA from source, clone this repository using ``git clone https://gith
After installation, OSACA can be started with the command ``osaca`` in the CLI.
Dependencies:
~~~~~~~~~~~~~~~
Additional requirements are:
-------------
Necessary requirements are:
- `Python3 <https://www.python.org/>`_
- `pandas <http://pandas.pydata.org/>`_
- `NumPy <http://www.numpy.org/>`_
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`_ for marker insertion
- `ibench <https://github.com/hofm/ibench>`_ for throughput/latency measurements
- `Graphviz <https://www.graphviz.org/>`_ for dependency graph creation (minimal dependency is ``libgraphviz-dev`` on Ubuntu)
- Python packages:
- `networkx <https://networkx.org/>`_
- `pyparsing <https://github.com/pyparsing/pyparsing>`_
- `ruamel.yaml <https://pypi.org/project/ruamel.yaml/>`_
Optional requirements are:
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
- `BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ for scraping instruction form information for the x86 ISA (experimental)
Design
======
A schematic design of OSACA's workflow is shown below:
.. image:: doc/osaca-workflow.png
.. image:: docs/img/osaca-workflow.png
:alt: OSACA workflow
:width: 80%
@@ -66,216 +87,369 @@ The usage of OSACA can be listed as:
.. code:: bash
osaca [-h] [-V] [--arch ARCH] [--tp-list] [-i | --iaca | -m] FILEPATH
osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES]
[--ignore-unknown] [--lcd-timeout SECONDS]
[--db-check] [--import MICROBENCH] [--insert-marker]
[--export-graph GRAPHNAME] [--consider-flag-deps]
[--out OUT] [--yaml-out YAML_OUT] [--verbose]
FILEPATH
- ``-h`` or ``--help`` prints out the help message.
- ``-V`` or ``--version`` shows the programs version number.
- ``ARCH`` needs to be replaced with the wished architecture abbreviation. This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``). Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW`` and ``SKL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN`` for AMD Zen (17h family) architecture .
- While in the throughput analysis mode, one can add ``--tp-list`` for printing the additional throughput list of the kernel or ``--iaca`` for letting OSACA to know it has to search for IACA binary markers.
- ``-i`` or ``--include-ibench`` starts the integration of ibench output into the CSV data file determined by ``ARCH``.
- With the flag ``-m`` or ``--insert-marker`` OSACA calls the Kerncraft module for the interactively insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ marker in suggested assembly blocks.
- ``FILEPATH`` describes the filepath to the file to work with and is always necessary
-h, --help
prints out the help message.
-V, --version
shows the program's version number.
--arch ARCH
needs to be replaced with the target architecture abbreviation.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ICL`` (Client), ``ICX`` (Server), ``SPR`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN[1-4]`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72, ``TSV110`` for the HiSilicon TaiShan v110, ``A64FX`` for Fujitsu's HPC ARM architecture, ``M1`` for the Apple M1-Firestorm performance core, and ``V2`` for the Neoverse V2 (used in NVIDIA's Grace CPU) are available.
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
--fixed
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
Otherwise, OSACA will print out the optimal port utilization for the kernel.
--lines
Define lines that should be included in the analysis. This option overwrites any range defined by markers in the assembly. Add either single lines or ranges defined
by "-" or ":", each entry separated by commas, e.g.: ``--lines 1,2,8-18,20:24``
--db-check
Run a sanity check on the database specified by "--arch".
The output depends on the verbosity level.
Keep in mind you have to provide an existing (dummy) filename anyway.
--import MICROBENCH
Import a given microbenchmark output file into the corresponding architecture instruction database.
Define the type of microbenchmark either as "ibench" or "asmbench".
--insert-marker
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
--export-graph EXPORT_PATH
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
--ignore-unknown
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
--lcd-timeout SECONDS
Set timeout in seconds for LCD analysis. After timeout, OSACA will continue its analysis with the dependency paths found up to this point.
Defaults to `10`.
-f, --consider-flag-deps
Consider flag dependencies for the critical path and loop-carried dependency analysis. By default, those dependencies are ignored.
-v, --verbose
Increases verbosity level
-o OUT, --out OUT
Write analysis to this file (default to stdout)
--yaml-out YAML_OUT
Write analysis as YAML representation to this file
The **FILEPATH** describes the filepath to the file to work with and is always necessary, use "-" to read from stdin.
Supported microarchitectures
-----------------------------
**x86 CPUs**
+----------+-----------------+------------+
| Designer | Model/microarch | OSACA flag |
+==========+=================+============+
| Intel | Sandy Bridge | ``SNB`` |
+----------+-----------------+------------+
| Intel | Ivy Bridge | ``IVB`` |
+----------+-----------------+------------+
| Intel | Haswell | ``HSW`` |
+----------+-----------------+------------+
| Intel | Broadwell | ``BDW`` |
+----------+-----------------+------------+
| Intel | Skylake-X | ``SKX`` |
+----------+-----------------+------------+
| Intel | Cascadelake-X | ``CSX`` |
+----------+-----------------+------------+
| Intel | Icelake client | ``ICL`` |
+----------+-----------------+------------+
| Intel | Icelake server | ``ICX`` |
+----------+-----------------+------------+
| Intel | Sapphire Rapids | ``SPR`` |
+----------+-----------------+------------+
| AMD | Naples / Zen 1 | ``ZEN1`` |
+----------+-----------------+------------+
| AMD | Rome / Zen 2 | ``ZEN2`` |
+----------+-----------------+------------+
| AMD | Milan / Zen 3 | ``ZEN3`` |
+----------+-----------------+------------+
| AMD | Genoa / Zen 4 | ``ZEN4`` |
+----------+-----------------+------------+
**ARM AArch64 CPUs**
+-----------+-------------------+-------------+
| Designer | Model/microarch | OSACA flag |
+===========+===================+=============+
| ARM | Cortex-A72 | ``A72`` |
+-----------+-------------------+-------------+
| ARM | Neoverse N1 | ``N1`` |
+-----------+-------------------+-------------+
| ARM | Neoverse V2 | ``V2`` |
+-----------+-------------------+-------------+
| Marvell | ThunderX2 | ``TX2`` |
+-----------+-------------------+-------------+
| Fujitsu | FX700/A64FX | ``A64FX`` |
+-----------+-------------------+-------------+
| HiSilicon | TaiShan v110 | ``TSV110`` |
+-----------+-------------------+-------------+
| Apple | M1-Firestorm | ``M1`` |
+-----------+-------------------+-------------+
| NVIDIA | Neoverse V2/Grace | ``V2`` |
+-----------+-------------------+-------------+
----
Hereinafter OSACA's scope of function will be described.
Throughput analysis
~~~~~~~~~~~~~~~~~~~
As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKL`` or ``ZEN``.
Throughput & Latency analysis
-----------------------------
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
For extracting the right kernel, one has to mark it beforehand. For this there are two different approaches:
.. code-block:: bash
| **High level code**
osaca --arch ARCH [--fixed] [--ignore-unknown]
[--export-graph EXPORT_PATH]
file
The OSACA marker is ``//STARTLOOP`` and must be put in one line in front of the loop head, and the loop code must be indented consistently. This means the marker and the head must have the same indentation level while the whole loop body needs to be more indented than the code before and after. For instance, this is a valid OSACA marker:
The ``file`` parameter specifies the target assembly file and is always mandatory.
.. code-block:: c
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
int i = 0;
//STARTLOOP
while(i < N){
// do something...
i++;
}
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
This behavior can be enforced by using the ``--fixed`` flag.
| **Assembly code**
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
This is done so that the user does not overlook the unrecognized instructions and assume an incorrect runtime prediction.
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
Another way for marking a kernel is to insert the IACA byte markers in the assembly file in before and after the loop.
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
The tool highlights all LCDs and the CP.
The graph generation is done by running OSACA with the ``--export-graph EXPORT_PATH`` flag.
OSACA stores the DOT file either at the filepath specified by ``EXPORT_PATH`` or uses the default filename "osaca_dg.dot" in the current working directory.
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
graph.pdf`` to generate a PDF document.
Marker insertion
----------------
For extracting the right kernel, one can mark it beforehand.
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
If OSACA cannot find any markers in the given input file, all lines will be evaluated.
Marking a kernel means to insert the byte markers in the assembly file before and after the loop.
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
Start and end marker can be seen in the example below:
IACA requires byte markers since it operates on opcode-level.
To provide a trade-off between reusability for such tools and convenient usability, OSACA supports both byte markers and comment line markers.
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
.. code-block:: gas
x86 markers
^^^^^^^^^^^
**Byte markers**
movl $111,%ebx ;IACA START MARKER
.byte 100,103,144 ;IACA START MARKER
; LABEL
; do something
; ...
; conditional jump to LABEL
movl $222,%ebx ;IACA END MARKER
.byte 100,103,144 ;IACA END MARKER
.. code-block:: asm
The optional flag ``--iaca`` defines if OSACA needs to search for the IACA byte markers or the OSACA marker in the chosen file.
movl $111,%ebx #IACA/OSACA START MARKER
.byte 100,103,144 #IACA/OSACA START MARKER
.loop:
# loop body
jb .loop
movl $222,%ebx #IACA/OSACA END MARKER
.byte 100,103,144 #IACA/OSACA END MARKER
With an additional, optional ``--tp-list``, OSACA adds a simple list of all kernel instruction forms together with their reciprocal throughput to the output. This is helpful in case of no further information about the port binding of the single instruction forms.
**Comment line markers**
Include new measurements into the data file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it
takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value.
The handling of ibench is shortly described in the example section below.
.. code-block:: asm
Insert IACA markers
~~~~~~~~~~~~~~~~~~~
Using the ``-m`` or ``--insert-marker`` flags for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
# OSACA-BEGIN
.loop:
# loop body
jb .loop
# OSACA-END
Example
=======
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel IVB core hereafter:
AArch64 markers
^^^^^^^^^^^^^^^
**Byte markers**
::
mov x1, #111 // OSACA START
.byte 213,3,32,31 // OSACA START
.loop:
// loop body
b.ne .loop
mov x1, #222 // OSACA END
.byte 213,3,32,31 // OSACA END
**Comment line markers**
::
// OSACA-BEGIN
.loop:
// loop body
b.ne .loop
// OSACA-END
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
Benchmark import
----------------
OSACA supports the automatic integration of new instruction forms by parsing the output of the micro-
benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
.. code-block:: bash
osaca --arch ARCH --import MICROBENCH file
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added and file must be the path to the generated output file of the benchmark.
The format of this file has to match either the basic command line output of ibench, e.g.,
::
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
or the command line output of asmbench including the name of the instruction form in a separate line at the
beginning, e.g.:
::
[INSTRUCTION FORM]
Latency: 4.00 cycle
Throughput: 0.50 cycle
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
* The second part of the name are the operands. Each operand must be separated from another operand by the character "``_``".
* For each **x86** operand, one of the following symbols must be used:
* "``r``" for general purpose registers (rax, edi, r9, ...)
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
* "``i``" for immediates
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
* For each **AArch64** operand, one of the following symbols must be used:
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
* "``i``" for immediates
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
* If there is only one operand, it is considered as source operand
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
destination operand, all others are considered as source operands.
Database check
--------------
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
OSACA provides a database sanity check using the --db-check flag. It can be executed via:
.. code-block:: bash
osaca --arch ARCH --db-check [-v] file
``ARCH`` defines the abbreviation of the target architecture of the database to check.
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
When called, OSACA prints a summary of database information containing the amount of missing throughput values, latency values or μ-ops assignments for an instruction form.
Furthermore, it shows the amount of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
Examples
========
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:
.. code-block:: c
double a[N], double b[N];
double s;
//STARTLOOP
// loop
for(int i = 0; i < N; ++i)
a[i] = s * b[i];
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``. The result is
written in vector ``a``.
After including the OSACA marker ``//STARTLOOP`` and compiling the source, one can
start the analysis typing
.. code:: bash
osaca --arch IVB PATH/TO/FILE
in the command line. Optionally, one can create the assembly code out of the file, identify and mark the kernel of interest and run OSACA with the additional ``--iaca`` flag.
The output is:
.. code-block::
Throughput Analysis Report
--------------------------
X - No information for this instruction in database
* - Instruction micro-ops not bound to a port
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 2.33 | 1.33 | 5.0 | 5.0 | 2.0 | 1.33 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
| | | | | | | jmp 10b <scale+0x10b>
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
| | | | | | | X addl $0x1,-0x24(%rbp)
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
| | | | | | | jl e4 <scale+0xe4>
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
Total number of estimated throughput: 5.0
It shows the whole kernel together with the average port pressure of each instruction form and the overall port binding.
In the fifth to last line containing ``addl $0x1, -0x24(%rbp)`` one can see an ``X`` in front of the instruction form and no port occupation.
This means either there are no measured values for this instruction form or no port binding is provided in the
data file.
In the first case, OSACA automatically creates two benchmark assembly files (``add-mem_imd.S`` for latency and ``add-mem_imd-TP.S`` for throughput) in the benchmark folder, if they do not already exist there.
One can now run ibench to get the throughput value for addl with the given file. Mind that the assembly
file, which is used for ibench, is implemented in Intel syntax. So for a valid run instruction ``addl`` must be
changed to ``add`` manually.
For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node,
so there is no other workload falsifying the results. For the correct function of ibench the benchmark files
from OSACA need to be placed in a subdirectory of src in root so ibench can create a folder with the
subdirectory's name and the shared objects. For running the tests the frequencies of all cores must be set to a
constant value and this has to be given as an argument together with the directory of the shared objects to
ibench, e.g.:
.. code:: bash
./ibench ./AVX 2.2
for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz.
We get an output like:
.. code:: bash
Using frequency 2.20GHz.
add-mem_imd-TP: 1.023 (clock cycles) [DEBUG - result: 1.000000]
add-mem_imd: 6.050 (clock cycles) [DEBUG - result: 1.000000]
The debug output as resulting value of register ``xmm0`` is additional validation information depending on
the executed instruction form meant for the user and is not considered by OSACA.
The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just
``-i`` and the specified micro architecture:
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
The result is written in vector ``a``.
After including the OSACA byte marker into the assembly, one can start the analysis typing
.. code-block:: bash
osaca --arch IVB -i PATH/TO/IBENCH-OUTPUTFILE
osaca --arch CSX PATH/TO/FILE
For now no automatic allocation of ports for an instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand.
We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
in the command line.
.. code:: bash
The output is:
addl-mem_imd,1.0,6.0,"(0.33,0.33,1.00,1.00,1.00,0.33)"
Another throughput analysis with OSACA now returns all information for the kernel:
::
.. code-block::
Open Source Architecture Code Analyzer (OSACA) - v0.3
Analyzed file: scale.s.csx.O3.s
Architecture: csx
Timestamp: 2019-10-03 23:36:21
Throughput Analysis Report
--------------------------
X - No information for this instruction in database
* - Instruction micro-ops not bound to a port
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 2.67 | 1.67 | 6.0 | 6.0 | 3.0 | 1.67 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
| | | | | | | jmp 10b <scale+0x10b>
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
| 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 0.33 | addl $0x1,-0x24(%rbp)
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
| | | | | | | jl e4 <scale+0xe4>
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
Total number of estimated throughput: 6.0
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
-----------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
170 | | | | | | | | || | | .L22:
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
175 | | | | | | | | || | | * jne .L22
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
173 | 1.0 | addq $32, %rax | [173]
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
Furthermore, in the two columns on the right, the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel are shown.
In the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right.
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
Citations
=========
If you use OSACA for scientific work you can cite us as (for the Bibtex, see the `Wiki <https://github.com/RRZE-HPC/OSACA/wiki#acknowledgement>`_):
* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print PMBS18 <https://arxiv.org/abs/1809.00912>`_)
* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print PMBS19 <https://arxiv.org/abs/1910.00214>`_)
Credits
=======
Implementation: Jan Laukemann
Implementation: Jan Laukemann, Julian Hammer
License
=======
`AGPL-3.0 </LICENSE>`_
`AGPL-3.0 </LICENSE>`__

3
codecov.yml Normal file
View File

@@ -0,0 +1,3 @@
ignore:
- "tests" # ignore test folder and all its contents
- "**/__init__.py" # ignore init files

View File

@@ -1,134 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="459.03406mm"
height="217.28152mm"
viewBox="0 0 1626.4986 769.8951"
id="svg2"
version="1.1"
inkscape:version="0.92.1 r15371"
sodipodi:docname="OSACA-Logo_05.svg"
inkscape:export-filename="/home/cip/2014/ol68umur/Desktop/logo/OSACA-Logo_03b.png"
inkscape:export-xdpi="1104"
inkscape:export-ydpi="1104">
<defs
id="defs4" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0"
inkscape:pageshadow="2"
inkscape:zoom="0.5"
inkscape:cx="858.02629"
inkscape:cy="511.52256"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="false"
fit-margin-top="0.5"
fit-margin-left="0.5"
fit-margin-right="0.5"
fit-margin-bottom="0.5"
inkscape:window-width="1920"
inkscape:window-height="1081"
inkscape:window-x="0"
inkscape:window-y="49"
inkscape:window-maximized="1" />
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Ebene 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(263.39161,902.34721)">
<g
id="g4583">
<text
transform="scale(1.0341487,0.96697893)"
id="text4147"
y="-333.24573"
x="542.02954"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
xml:space="preserve"><tspan
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk'"
y="-333.24573"
x="542.02954"
id="tspan4149"
sodipodi:role="line">ACA</tspan></text>
<text
transform="scale(1.0341487,0.96697893)"
id="text4147-3"
y="-417.88809"
x="-266.53079"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
xml:space="preserve"><tspan
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk';stroke-width:1px"
y="-417.88809"
x="-266.53079"
id="tspan4149-6"
sodipodi:role="line">OS</tspan></text>
<g
id="g4571">
<rect
style="fill:#4dd9ff;fill-opacity:1;stroke:none;stroke-width:4.99469042;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
id="rect4162-6-3"
width="501.93356"
height="46.874996"
x="-900.57556"
y="-486.72452"
transform="rotate(90)" />
<rect
style="fill:#7fff00;fill-opacity:1;stroke:none;stroke-width:7.58152723;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
id="rect4162-5"
width="676.86932"
height="46.874996"
x="134.22374"
y="-425.20099"
transform="matrix(0,-1,-1,0,0,0)" />
<rect
style="fill:#f2ff19;fill-opacity:1;stroke:none;stroke-width:5.0525918;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
id="rect4162-7-6"
width="456.4649"
height="46.874996"
x="-682.95709"
y="-363.67743"
transform="rotate(90)" />
<rect
style="fill:#8071ff;fill-opacity:1;stroke:none;stroke-width:5.53460026;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
id="rect4162-3-2"
width="360.71481"
height="46.874996"
x="322.24228"
y="-548.24811"
transform="matrix(0,-1,-1,0,0,0)" />
<rect
style="fill:#ff2a2a;fill-opacity:1;stroke:none;stroke-width:5.43282366;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
id="rect4162-67-9"
width="347.57031"
height="46.874996"
x="398.60016"
y="-302.15387"
transform="matrix(0,-1,-1,0,0,0)" />
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 231 KiB

66
docs/conf.py Normal file
View File

@@ -0,0 +1,66 @@
# Sphinx configuration for the OSACA documentation build.
#
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Put the current working directory first on sys.path so that the helper
# module ``version_from_src`` imported below can be found.
# NOTE(review): presumably Sphinx runs with the docs directory as cwd -- confirm.
sys.path.insert(0, os.path.abspath("."))
# Must come after the sys.path tweak above, hence the E402 suppression.
from version_from_src import get_version # noqa: E402
# -- Project information -----------------------------------------------------
project = "OSACA"
copyright = "2020, Jan Laukemann"
author = "Jan Laukemann"
html_logo = "img/osaca-logo.png"
# The full version, including alpha/beta/rc tags
# Both ``version`` and ``release`` are filled from the same get_version() helper.
version = get_version()
release = get_version()
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.todo",
"sphinx.ext.viewcode",
]
add_module_names = False
# Documentation sources are reStructuredText files; the root document is "index".
source_suffix = ".rst"
master_doc = "index"
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# e.g., 'alabaster', 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# Left empty: no custom static files are configured.
html_static_path = []
htmlhelp_basename = "osaca_doc"
# Sidebar templates applied to every page (the "**" pattern matches all documents).
html_sidebars = {"**": ["globaltoc.html", "relations.html", "sourcelink.html", "searchbox.html"]}
# Keep autodoc members in the order they appear in the source file.
autodoc_member_order = "bysource"

BIN
docs/img/osaca-logo.pdf Normal file

Binary file not shown.

View File

Before

Width:  |  Height:  |  Size: 45 KiB

After

Width:  |  Height:  |  Size: 45 KiB

BIN
docs/img/osaca-workflow.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 206 KiB

14
docs/index.rst Normal file
View File

@@ -0,0 +1,14 @@
OSACA -- Open Source Architecture Code Analyzer
=================================================
.. toctree::
:maxdepth: 2
:caption: Contents:
sphinx/home.rst
sphinx/api.rst
.. image:: /img/osaca-logo.png
:alt: OSACA logo
:width: 80%

7
docs/sphinx/api.rst Normal file
View File

@@ -0,0 +1,7 @@
API Reference
=============
.. toctree::
:maxdepth: 4
osaca

364
docs/sphinx/home.rst Normal file
View File

@@ -0,0 +1,364 @@
.. image:: /img/osaca-logo.png
:alt: OSACA logo
:width: 80%
OSACA
=====
Open Source Architecture Code Analyzer
--------------------------------------
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
.. image:: https://travis-ci.org/RRZE-HPC/OSACA.svg?branch=master
:target: https://travis-ci.org/RRZE-HPC/OSACA
:alt: Build Status
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
:alt: Code Coverage
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
.. image:: https://img.shields.io/badge/read-the_docs-blue
:target: https://osaca.readthedocs.io/
:alt: Docs
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/ambv/black
:alt: Code Style
Getting started
===============
Installation
------------
On most systems with python pip and setuptools installed, just run:
.. code:: bash
pip install --user osaca
for the latest release.
To build OSACA from source, clone this repository using ``git clone https://github.com/RRZE-HPC/OSACA`` and run in the root directory:
.. code:: bash
python ./setup.py install
After installation, OSACA can be started with the command ``osaca`` in the CLI.
Dependencies:
-------------
Additional requirements are:
- `Python3 <https://www.python.org/>`__
- `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
Design
======
A schematic design of OSACA's workflow is shown below:
.. image:: /img/osaca-workflow.png
:alt: OSACA workflow
:width: 80%
Usage
=====
The usage of OSACA can be listed as:
.. code:: bash
osaca [-h] [-V] [--arch ARCH] [--fixed] [--db-check]
[--import MICROBENCH] [--insert-marker]
[--export-graph GRAPHNAME] [--ignore-unknown] [--verbose]
FILEPATH
-h, --help
prints out the help message.
-V, --version
shows the program's version number.
--arch ARCH
needs to be replaced with the target architecture abbreviation.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2 architecture is available.
--fixed
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
Otherwise, OSACA will print out the optimal port utilization for the kernel.
--db-check
Run a sanity check on the database specified by "--arch".
The output depends on the verbosity level.
Keep in mind you have to provide an existing (dummy) filename anyway.
--import MICROBENCH
Import a given microbenchmark output file into the corresponding architecture instruction database.
Define the type of microbenchmark either as "ibench" or "asmbench".
--insert-marker
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
--export-graph EXPORT_PATH
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
--ignore-unknown
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
-v, --verbose
Increases verbosity level
The **FILEPATH** describes the filepath to the file to work with and is always necessary
______________________
Hereinafter OSACA's scope of function will be described.
Throughput & Latency analysis
-----------------------------
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
.. code-block:: bash
osaca --arch ARCH [--fixed] [--ignore-unknown]
[--export-graph EXPORT_PATH]
file
The ``file`` parameter specifies the target assembly file and is always mandatory.
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
This behavior can be enforced by using the ``--fixed`` flag.
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
This is done so the user does not miss out on this unrecognized instruction and might assume an incorrect runtime prediction.
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
The tool highlights all LCDs and the CP.
The graph generation is done by running OSACA with the ``--export-graph EXPORT_GRAPH`` flag.
OSACA stores the DOT file either at the filepath specified by ``EXPORT_GRAPH`` or uses the default filename "osaca_dg.dot" in the current working directory.
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
graph.pdf`` to generate a PDF document.
Marker insertion
----------------
For extracting the right kernel, one has to mark it beforehand.
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
Marking a kernel means to insert the byte markers in the assembly file before and after the loop.
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
IACA requires byte markers since it operates on opcode-level.
To provide a trade-off between reusability for such tools and convenient usability, OSACA supports both byte markers and comment line markers.
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
x86 markers
^^^^^^^^^^^
**Byte markers**
.. code-block:: asm
movl $111,%ebx #IACA/OSACA START MARKER
.byte 100,103,144 #IACA/OSACA START MARKER
.loop:
# loop body
jb .loop
movl $222,%ebx #IACA/OSACA END MARKER
.byte 100,103,144 #IACA/OSACA END MARKER
**Comment line markers**
.. code-block:: asm
# OSACA-BEGIN
.loop:
# loop body
jb .loop
# OSACA-END
AArch64 markers
^^^^^^^^^^^^^^^
**Byte markers**
::
mov x1, #111 // OSACA START
.byte 213,3,32,31 // OSACA START
.loop:
// loop body
b.ne .loop
mov x1, #222 // OSACA END
.byte 213,3,32,31 // OSACA END
**Comment line markers**
::
// OSACA-BEGIN
.loop:
// loop body
b.ne .loop
// OSACA-END
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
Benchmark import
----------------
OSACA supports the automatic integration of new instruction forms by parsing the output of the micro-
benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
.. code-block:: bash
osaca --arch ARCH --import MICROBENCH file
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added, and ``file`` must be the path to the generated output file of the benchmark.
The format of this file has to match either the basic command line output of ibench, e.g.,
::
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
or the command line output of asmbench including the name of the instruction form in a separate line at the
beginning, e.g.:
::
[INSTRUCTION FORM]
Latency: 4.00 cycle
Throughput: 0.50 cycle
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
* The second part of the name consists of the operands. Each operand must be separated from another operand by the character "``_``".
* For each **x86** operand, one of the following symbols must be used:
* "``r``" for general purpose registers (rax, edi, r9, ...)
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
* "``i``" for immediates
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
* For each **AArch64** operand, one of the following symbols must be used:
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
* "``i``" for immediates
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
* If there is only one operand, it is considered as source operand
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
destination operand, all others are considered as source operands.
Database check
--------------
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
OSACA provides a database sanity check using the ``--db-check`` flag. It can be executed via:
.. code-block:: bash
osaca --arch ARCH --db-check [-v] file
``ARCH`` defines the abbreviation of the target architecture of the database to check.
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
When called, OSACA prints a summary of database information containing the amount of missing throughput values, latency values or μ-ops assignments for an instruction form.
Furthermore, it shows the amount of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
Examples
========
To clarify the functionality of OSACA, a sample kernel is analyzed for an Intel CSX core hereafter:
.. code-block:: c
double a[N], b[N];
double s;
// loop
for(int i = 0; i < N; ++i)
a[i] = s * b[i];
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
The result is written in vector ``a``.
After including the OSACA byte marker into the assembly, one can start the analysis by typing
.. code-block:: bash
osaca --arch CSX PATH/TO/FILE
in the command line.
The output is:
::
Open Source Architecture Code Analyzer (OSACA) - v0.3
Analyzed file: scale.s.csx.O3.s
Architecture: csx
Timestamp: 2019-10-03 23:36:21
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
-----------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
170 | | | | | | | | || | | .L22:
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
175 | | | | | | | | || | | * jne .L22
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
173 | 1.0 | addq $32, %rax | [173]
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
Furthermore, the two columns on the right show the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel.
At the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right.
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
Credits
=======
Implementation: Jan Laukemann
License
=======
`AGPL-3.0 </LICENSE>`__

20
docs/sphinx/osaca.api.rst Normal file
View File

@@ -0,0 +1,20 @@
osaca.api package
=================
Provides interfaces to other tools.
osaca.api.kerncraft\_interface module
-------------------------------------
.. automodule:: osaca.api.kerncraft_interface
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: osaca.api
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,44 @@
osaca.parser package
====================
Parser module for parsing the assembly code.
osaca.parser.attr\_dict module
------------------------------
.. automodule:: osaca.parser.attr_dict
:members:
:undoc-members:
:show-inheritance:
osaca.parser.base\_parser module
--------------------------------
.. automodule:: osaca.parser.base_parser
:members:
:undoc-members:
:show-inheritance:
osaca.parser.parser\_AArch64v81 module
--------------------------------------
.. automodule:: osaca.parser.parser_AArch64v81
:members:
:undoc-members:
:show-inheritance:
osaca.parser.parser\_x86att module
----------------------------------
.. automodule:: osaca.parser.parser_x86att
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: osaca.parser
:members:
:undoc-members:
:show-inheritance:

46
docs/sphinx/osaca.rst Normal file
View File

@@ -0,0 +1,46 @@
osaca package
=============
Subpackages
-----------
.. toctree::
osaca.api
osaca.parser
osaca.semantics
Submodules
----------
osaca.db\_interface module
--------------------------
.. automodule:: osaca.db_interface
:members:
:undoc-members:
:show-inheritance:
osaca.frontend module
---------------------
.. automodule:: osaca.frontend
:members:
:undoc-members:
:show-inheritance:
osaca.osaca module
------------------
.. automodule:: osaca.osaca
:members:
:undoc-members:
:show-inheritance:
osaca.utils module
------------------
.. automodule:: osaca.utils
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,52 @@
osaca.semantics package
=======================
Semantic part of OSACA.
osaca.semantics.arch\_semantics module
--------------------------------------
.. automodule:: osaca.semantics.arch_semantics
:members:
:undoc-members:
:show-inheritance:
osaca.semantics.hw\_model module
--------------------------------
.. automodule:: osaca.semantics.hw_model
:members:
:undoc-members:
:show-inheritance:
osaca.semantics.isa\_semantics module
-------------------------------------
.. automodule:: osaca.semantics.isa_semantics
:members:
:undoc-members:
:show-inheritance:
osaca.semantics.kernel\_dg module
---------------------------------
.. automodule:: osaca.semantics.kernel_dg
:members:
:undoc-members:
:show-inheritance:
osaca.semantics.marker\_utils module
------------------------------------
.. automodule:: osaca.semantics.marker_utils
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: osaca.semantics
:members:
:undoc-members:
:show-inheritance:

32
docs/version_from_src.py Normal file
View File

@@ -0,0 +1,32 @@
import io
import os
import re
# Stolen from pip
def __read(*names, **kwargs):
    """
    Return the text content of a file located relative to this module.

    :param names: path components appended to the directory of this file
    :param kwargs: optional ``encoding`` keyword; ``"utf8"`` is used when absent
    :returns: str -- the complete file content.
    """
    target = os.path.join(os.path.dirname(__file__), *names)
    codec = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=codec) as handle:
        content = handle.read()
    return content
# Stolen from pip
def __find_version(*file_paths):
    """
    Extract the ``__version__`` string from the file at the given path.

    The path components are handed to ``__read`` and the file text is
    scanned for a line starting with ``__version__ = "<value>"`` (single or
    double quotes).

    :param file_paths: path components forwarded to ``__read``
    :returns: str -- the text between the quotes of the first match.
    :raises RuntimeError: if no such assignment is found
    """
    contents = __read(*file_paths)
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    # Guard clause: a missing match is the only failure mode handled here.
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)
def get_version():
    """
    Return the OSACA version declared in the package ``__init__`` file.

    The file is looked up at ``../osaca/__init__.py``, resolved by the
    helper relative to the directory of this module.

    :returns: str -- the version string.
    """
    init_file = "../osaca/__init__.py"
    return __find_version(init_file)

View File

@@ -1,286 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.5.239 Build 20170817";
# mark_description "-fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o 2d.S";
.file "2d-5pt.c"
.text
..TXTST0:
# -- Begin jacobi2D5pt
.text
# mark_begin;
.align 16,0x90
.globl jacobi2D5pt
# --- jacobi2D5pt(int, int)
jacobi2D5pt:
# parameter 1: %edi
# parameter 2: %esi
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_jacobi2D5pt.1:
..L2:
#2.31
pushq %rbx #2.31
.cfi_def_cfa_offset 16
movq %rsp, %rbx #2.31
.cfi_def_cfa 3, 16
.cfi_offset 3, -16
andq $-32, %rsp #2.31
pushq %rbp #2.31
pushq %rbp #2.31
movq 8(%rbx), %rbp #2.31
movq %rbp, 8(%rsp) #2.31
movq %rsp, %rbp #2.31
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
pushq %r13 #2.31
pushq %r14 #2.31
pushq %r15 #2.31
subq $88, %rsp #2.31
movslq %esi, %rsi #2.31
movslq %edi, %rcx #2.31
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
movq %rsi, %r13 #4.17
imulq %rcx, %r13 #4.17
shlq $3, %r13 #4.12
movq %r13, %rax #4.12
addq $31, %rax #4.12
andq $-32, %rax #4.12
subq %rax, %rsp #4.12
movq %rsp, %rax #4.12
# LOE rax rcx rsi r12 r13 edi
..B1.29: # Preds ..B1.1
# Execution count [1.00e+00]
movq %rax, %r14 #4.12
# LOE rcx rsi r12 r13 r14 edi
..B1.2: # Preds ..B1.29
# Execution count [1.00e+00]
movq %r13, %rax #5.12
addq $31, %rax #5.12
andq $-32, %rax #5.12
subq %rax, %rsp #5.12
movq %rsp, %rax #5.12
# LOE rax rcx rsi r12 r13 r14 edi
..B1.30: # Preds ..B1.2
# Execution count [1.00e+00]
movq %rax, %r15 #5.12
# LOE rcx rsi r12 r13 r14 r15 edi
..B1.3: # Preds ..B1.30
# Execution count [1.00e+00]
xorl %r10d, %r10d #9.5
lea (%r15,%rcx,8), %r11 #13.13
vxorpd %xmm1, %xmm1, %xmm1 #6.5
lea (%r14,%rcx,8), %rdx #13.37
cmpq $2, %rsi #9.18
jle ..B1.21 # Prob 9% #9.18
# LOE rdx rcx rsi r10 r11 r12 r13 r14 r15 edi xmm1
..B1.4: # Preds ..B1.3
# Execution count [9.00e-01]
addl $-2, %edi #12.9
movq %rcx, %r9 #13.61
movl %edi, %eax #12.9
addq $-2, %rsi #9.18
andl $-16, %eax #12.9
xorl %r8d, %r8d #9.5
shlq $4, %r9 #13.61
movslq %eax, %rax #12.9
addq %r14, %r9 #13.61
movslq %edi, %rdi #12.9
vxorps %ymm0, %ymm0, %ymm0 #6.5
movq %rax, -80(%rbp) #12.9[spill]
movq %rdi, -88(%rbp) #12.9[spill]
movl %eax, -72(%rbp) #9.5[spill]
movq %rsi, -48(%rbp) #9.5[spill]
movq %rdx, -64(%rbp) #9.5[spill]
movq %r15, -96(%rbp) #9.5[spill]
movq %r14, -56(%rbp) #9.5[spill]
movq %r13, -104(%rbp) #9.5[spill]
movq %r12, -112(%rbp) #9.5[spill]
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.5: # Preds ..B1.19 ..B1.4
# Execution count [5.00e+00]
cmpq $2, %rcx #12.22
jle ..B1.19 # Prob 50% #12.22
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.6: # Preds ..B1.5
# Execution count [4.50e+00]
cmpl $16, %edi #12.9
jl ..B1.26 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.7: # Preds ..B1.6
# Execution count [4.50e+00]
movl -72(%rbp), %r14d #12.9[spill]
xorl %edx, %edx #12.9
movq -80(%rbp), %r12 #13.13[spill]
lea (%r11,%r8), %rax #13.13
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
..B1.8: # Preds ..B1.8 ..B1.7
# Execution count [2.50e+01]
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
vmovupd %ymm0, 40(%rax,%rdx,8) #13.13
vmovupd %ymm0, 72(%rax,%rdx,8) #13.13
vmovupd %ymm0, 104(%rax,%rdx,8) #13.13
addq $16, %rdx #12.9
cmpq %r12, %rdx #12.9
jb ..B1.8 # Prob 82% #12.9
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
..B1.10: # Preds ..B1.8 ..B1.26
# Execution count [5.00e+00]
lea 1(%r14), %eax #12.9
cmpl %edi, %eax #12.9
ja ..B1.19 # Prob 50% #12.9
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
movslq %r14d, %r14 #12.9
movq -88(%rbp), %r13 #12.9[spill]
subq %r14, %r13 #12.9
cmpq $4, %r13 #12.9
jl ..B1.25 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 edi xmm1 ymm0
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
movl %r13d, %r15d #12.9
lea (%r11,%r8), %rax #13.13
andl $-4, %r15d #12.9
xorl %edx, %edx #12.9
movslq %r15d, %r15 #12.9
lea (%rax,%r14,8), %rax #13.13
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.13: # Preds ..B1.13 ..B1.12
# Execution count [2.50e+01]
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
addq $4, %rdx #12.9
cmpq %r15, %rdx #12.9
jb ..B1.13 # Prob 82% #12.9
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.15: # Preds ..B1.13 ..B1.25
# Execution count [5.00e+00]
cmpq %r13, %r15 #12.9
jae ..B1.19 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.16: # Preds ..B1.15
# Execution count [4.50e+00]
movq -56(%rbp), %rax #13.49[spill]
lea (%r11,%r8), %r12 #13.13
movq -64(%rbp), %rsi #13.25[spill]
lea (%r9,%r8), %rdx #13.61
lea (%r12,%r14,8), %r12 #13.13
addq %r8, %rax #13.49
addq %r8, %rsi #13.25
lea (%rdx,%r14,8), %rdx #13.61
lea (%rax,%r14,8), %rax #13.49
lea (%rsi,%r14,8), %r14 #13.25
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.17: # Preds ..B1.17 ..B1.16
# Execution count [2.50e+01]
vmovsd (%r14,%r15,8), %xmm2 #13.25
vaddsd 16(%r14,%r15,8), %xmm2, %xmm3 #13.37
vaddsd 8(%rax,%r15,8), %xmm3, %xmm4 #13.49
vaddsd 8(%rdx,%r15,8), %xmm4, %xmm5 #13.61
vmulsd %xmm5, %xmm1, %xmm6 #13.74
vmovsd %xmm6, 8(%r12,%r15,8) #13.13
incq %r15 #12.9
cmpq %r13, %r15 #12.9
jb ..B1.17 # Prob 82% #12.9
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
..B1.19: # Preds ..B1.17 ..B1.5 ..B1.10 ..B1.15
# Execution count [5.00e+00]
incq %r10 #9.5
lea (%r8,%rcx,8), %r8 #9.5
cmpq -48(%rbp), %r10 #9.5[spill]
jb ..B1.5 # Prob 82% #9.5
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.20: # Preds ..B1.19
# Execution count [9.00e-01]
movq -64(%rbp), %rdx #[spill]
movq -96(%rbp), %r15 #[spill]
movq -56(%rbp), %r14 #[spill]
movq -104(%rbp), %r13 #[spill]
movq -112(%rbp), %r12 #[spill]
.cfi_restore 12
# LOE rdx r11 r12 r13 r14 r15
..B1.21: # Preds ..B1.3 ..B1.20
# Execution count [1.00e+00]
addq $8, %rdx #16.5
addq $8, %r11 #16.5
movq %rdx, %rdi #16.5
movq %r11, %rsi #16.5
vzeroupper #16.5
..___tag_value_jacobi2D5pt.12:
# dummy(double *, double *)
call dummy #16.5
..___tag_value_jacobi2D5pt.13:
# LOE r12 r13 r14 r15
..B1.22: # Preds ..B1.21
# Execution count [1.00e+00]
movq %r15, %rdx #16.5
movq %r13, %rax #16.5
addq $31, %rax #16.5
andq $-32, %rax #16.5
addq %rax, %rsp #16.5
# LOE r12 r13 r14
..B1.23: # Preds ..B1.22
# Execution count [1.00e+00]
movq %r14, %rdx #16.5
movq %r13, %rax #16.5
addq $31, %rax #16.5
andq $-32, %rax #16.5
addq %rax, %rsp #16.5
# LOE r12
..B1.24: # Preds ..B1.23
# Execution count [1.00e+00]
lea -24(%rbp), %rsp #17.1
.cfi_restore 15
popq %r15 #17.1
.cfi_restore 14
popq %r14 #17.1
.cfi_restore 13
popq %r13 #17.1
popq %rbp #17.1
.cfi_restore 6
movq %rbx, %rsp #17.1
popq %rbx #17.1
.cfi_def_cfa 7, 8
.cfi_restore 3
ret #17.1
.cfi_def_cfa 3, 16
.cfi_offset 3, -16
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
# LOE
..B1.25: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r15d, %r15d #12.9
jmp ..B1.15 # Prob 100% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.26: # Preds ..B1.6
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #12.9
jmp ..B1.10 # Prob 100% #12.9
.align 16,0x90
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
.cfi_endproc
# mark_end;
.type jacobi2D5pt,@function
.size jacobi2D5pt,.-jacobi2D5pt
.data
# -- End jacobi2D5pt
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,16 +0,0 @@
/*
 * 5-point Jacobi stencil sweep over the interior of an M x N grid.
 *
 * For every interior point (j in 1..M-2, i in 1..N-2) the left, right,
 * upper and lower neighbours of a[j][i] are summed, scaled by s and stored
 * in b[j][i]. a, b and s are locals that are never initialised in this
 * function, so only the shape of the computation is meaningful, not the
 * values it produces.
 *
 * NOTE(review): presumably a benchmark/analysis kernel stub; dummy() looks
 * like it exists to keep the compiler from discarding the loop -- confirm.
 */
void jacobi2D5pt(int N, int M){
/* Local prototype of an externally defined function. */
void dummy(double*, double*);
/* Variable-length arrays sized by the arguments: M rows of N doubles each. */
double a[M][N];
double b[M][N];
double s;
for(int j=1; j<M-1; ++j){
#pragma vector aligned
/* NOTE(review): the STARTLOOP comment below looks like a marker consumed by
   an external tool -- confirm before editing or removing it. */
//STARTLOOP
for(int i=1; i<N-1; ++i){
b[j][i] = ( a[j][i-1] + a[j][i+1] + a[j-1][i] + a[j+1][i]) * s;
}
}
/* Hand the first interior element of both grids to the external function. */
dummy(&a[1][1], &b[1][1]);
}

114
examples/README.md Normal file
View File

@@ -0,0 +1,114 @@
# Examples
We collected sample kernels for the user to run examples with OSACA.
The assembly files contain only the extracted and already marked kernel for code compiled for Intel Cascade Lake&nbsp;(CSX), AMD Zen and Marvell ThunderX2&nbsp;(TX2), but can be run on any system supporting the ISA and supported by OSACA.
The used compilers were Intel Parallel Studio&nbsp;19.0up05 and GNU&nbsp;9.1.0 in case of the x86 systems and ARM HPC Compiler for Linux version&nbsp;19.2 and GNU&nbsp;8.2.0 for the ARM-based TX2.
To analyze the kernels with OSACA, run
```
osaca --arch ARCH FILE
```
While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (*.csx.*.s) use the byte markers to be able to be analyzed by IACA as well.
For this use
```
gcc -c FILE.s
iaca -arch SKX FILE.o
```
------------
The kernels currently contained in the examples are shown briefly in the following.
### Copy (`copy/`)
```c
double * restrict a, * restrict b;
for(long i=0; i < size; ++i){
a[i] = b[i];
}
```
### Vector add (`add/`)
```c
double * restrict a, * restrict b, * restrict c;
for(long i=0; i < size; ++i){
a[i] = b[i] + c[i];
}
```
### Vector update (`update/`)
```c
double * restrict a;
for(long i=0; i < size; ++i){
a[i] = scale * a[i];
}
```
### Sum reduction (`sum_reduction/`)
```c
double * restrict a;
for(long i=0; i < size; ++i){
scale = scale + a[i];
}
```
For this kernel we noticed an overlap of the loop bodies when using gcc with `-Ofast` flag (see this [blog post](https://blogs.fau.de/hager/archives/7658) for more information).
We therefore additionally compiled all gcc versions with the `-O3` flag.
These versions are named accordingly.
### DAXPY (`daxpy/`)
```c
double * restrict a, * restrict b;
for(long i=0; i < size; ++i){
a[i] = a[i] + scale * b[i];
}
```
### STREAM triad (`triad/`)
```c
double * restrict a, * restrict b, * restrict c;
for(long i=0; i < size; ++i){
a[i] = b[i] + scale * c[i];
}
```
### Schönauer triad (`striad/`)
```c
double * restrict a, * restrict b, * restrict c, * restrict d;
for(long i=0; i < size; ++i){
a[i] = b[i] + c[i] * d[i];
}
```
### Gauss-Seidel method (`gs/`)
```c
double ** restrict a;
for(long k=1; k < size_k-1; ++k){
for(long i=1; i < size_i-1; ++i){
a[k][i] = scale * (
a[k][i-1] + a[k+1][i]
+ a[k][i+1] + a[k-1][i]
);
}
}
```
### Jacobi 2D (`j2d/`)
```c
double ** restrict a, ** restrict b;
for(long k=1; k < size_k-1; ++k){
for(long i=1; i < size_i-1; ++i){
a[k][i] = 0.25 * (
b[k][i-1] + b[k+1][i]
+ b[k][i+1] + b[k-1][i]
);
}
}
```
For this kernel we noticed a discrepancy between measurements and predictions, especially when using AVX-512 instructions.
We therefore compiled the x86 kernels additionally with AVX/SSE instructions and marked those kernels accordingly.

View File

@@ -0,0 +1,36 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmovupd (%r14,%rax), %ymm3
vmovupd 32(%r14,%rax), %ymm4
vmovupd 64(%r14,%rax), %ymm6
vmovupd 96(%r14,%rax), %ymm9
vmovupd 128(%r14,%rax), %ymm11
vmovupd 160(%r14,%rax), %ymm13
vmovupd 192(%r14,%rax), %ymm15
vmovupd 224(%r14,%rax), %ymm0
vaddpd 0(%r13,%rax), %ymm3, %ymm7
vaddpd 32(%r13,%rax), %ymm4, %ymm5
vaddpd 64(%r13,%rax), %ymm6, %ymm8
vaddpd 96(%r13,%rax), %ymm9, %ymm10
vaddpd 128(%r13,%rax), %ymm11, %ymm12
vaddpd 160(%r13,%rax), %ymm13, %ymm14
vaddpd 192(%r13,%rax), %ymm15, %ymm1
vaddpd 224(%r13,%rax), %ymm0, %ymm2
vmovupd %ymm7, (%r12,%rax)
vmovupd %ymm5, 32(%r12,%rax)
vmovupd %ymm8, 64(%r12,%rax)
vmovupd %ymm10, 96(%r12,%rax)
vmovupd %ymm12, 128(%r12,%rax)
vmovupd %ymm14, 160(%r12,%rax)
vmovupd %ymm1, 192(%r12,%rax)
vmovupd %ymm2, 224(%r12,%rax)
addq $256, %rax
cmpq %rax, %rcx
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,19 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.40: # Preds ..B1.40 ..B1.39
# Execution count [2.22e+03]
vmovups (%rcx,%rax,8), %zmm1 #78.5
vmovups 64(%rcx,%rax,8), %zmm3 #78.5
vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
vmovupd %zmm2, (%r14,%rax,8) #78.5
vmovupd %zmm4, 64(%r14,%rax,8) #78.5
addq $16, %rax #78.5
cmpq %r12, %rax #78.5
jb ..B1.40 # Prob 82% #78.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,91 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x9, #-256]
ldp q4, q5, [x9, #-224]
ldp q2, q3, [x10, #-256]
ldp q6, q7, [x10, #-224]
fadd v2.2d, v2.2d, v0.2d
fadd v3.2d, v3.2d, v1.2d
stp q2, q3, [x11, #-256]
fadd v0.2d, v6.2d, v4.2d
fadd v1.2d, v7.2d, v5.2d
stp q0, q1, [x11, #-224]
ldp q4, q5, [x9, #-192]
ldp q16, q17, [x9, #-160]
ldp q6, q7, [x10, #-192]
ldp q18, q19, [x10, #-160]
fadd v6.2d, v6.2d, v4.2d
fadd v7.2d, v7.2d, v5.2d
stp q6, q7, [x11, #-192]
fadd v4.2d, v18.2d, v16.2d
fadd v5.2d, v19.2d, v17.2d
stp q4, q5, [x11, #-160]
ldp q16, q17, [x9, #-128]
ldp q19, q20, [x9, #-96]
ldp q18, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
fadd v16.2d, v18.2d, v16.2d
fadd v18.2d, v21.2d, v17.2d
stp q16, q18, [x11, #-128]
fadd v17.2d, v22.2d, v19.2d
fadd v19.2d, v23.2d, v20.2d
stp q17, q19, [x11, #-96]
ldp q20, q21, [x9, #-64]
ldp q24, q25, [x10, #-64]
ldp q22, q23, [x9, #-32]
ldp q26, q27, [x10, #-32]
fadd v20.2d, v24.2d, v20.2d
fadd v21.2d, v25.2d, v21.2d
stp q20, q21, [x11, #-64]
ldp q24, q25, [x9]
ldp q28, q29, [x10]
fadd v22.2d, v26.2d, v22.2d
fadd v23.2d, v27.2d, v23.2d
stp q22, q23, [x11, #-32]
ldp q26, q27, [x9, #32]
ldp q30, q31, [x10, #32]
fadd v24.2d, v28.2d, v24.2d
fadd v25.2d, v29.2d, v25.2d
stp q24, q25, [x11]
ldp q28, q29, [x9, #64]
ldp q8, q10, [x10, #64]
fadd v26.2d, v30.2d, v26.2d
fadd v27.2d, v31.2d, v27.2d
stp q26, q27, [x11, #32]
ldp q30, q31, [x9, #96]
ldp q11, q12, [x10, #96]
fadd v28.2d, v8.2d, v28.2d
fadd v29.2d, v10.2d, v29.2d
stp q28, q29, [x11, #64]
ldp q8, q10, [x9, #128]
ldp q13, q14, [x10, #128]
ldp q3, q0, [x9, #192]
ldp q1, q6, [x10, #192]
fadd v30.2d, v11.2d, v30.2d
fadd v31.2d, v12.2d, v31.2d
stp q30, q31, [x11, #96]
ldp q11, q12, [x9, #160]
fadd v8.2d, v13.2d, v8.2d
fadd v10.2d, v14.2d, v10.2d
stp q8, q10, [x11, #128]
ldp q13, q14, [x10, #160]
fadd v1.2d, v1.2d, v3.2d
ldp q3, q4, [x9, #224]
fadd v0.2d, v6.2d, v0.2d
stp q1, q0, [x11, #192]
ldp q5, q6, [x10, #224]
fadd v11.2d, v13.2d, v11.2d
fadd v2.2d, v14.2d, v12.2d
stp q11, q2, [x11, #160]
fadd v3.2d, v5.2d, v3.2d
fadd v4.2d, v6.2d, v4.2d
stp q3, q4, [x11, #224]
add x8, x8, #64 // =64
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x12, x12, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,45 @@
// OSACA-BEGIN
.L17:
add x0, x10, 16
ldr q29, [x21, x10]
ldr q30, [x20, x10]
add x7, x10, 32
ldr q31, [x21, x0]
ldr q2, [x20, x0]
add x6, x10, 48
add x5, x10, 64
ldr q5, [x21, x7]
ldr q1, [x20, x7]
add x4, x10, 80
add x11, x10, 96
ldr q4, [x21, x6]
ldr q0, [x20, x6]
add x2, x10, 112
fadd v7.2d, v29.2d, v30.2d
ldr q3, [x21, x5]
ldr q9, [x20, x5]
fadd v6.2d, v31.2d, v2.2d
ldr q19, [x21, x4]
ldr q18, [x20, x4]
fadd v20.2d, v5.2d, v1.2d
ldr q21, [x21, x11]
ldr q17, [x20, x11]
fadd v22.2d, v4.2d, v0.2d
ldr q23, [x21, x2]
ldr q16, [x20, x2]
fadd v24.2d, v3.2d, v9.2d
fadd v25.2d, v19.2d, v18.2d
fadd v26.2d, v21.2d, v17.2d
str q7, [x19, x10]
add x10, x10, 128
fadd v27.2d, v23.2d, v16.2d
str q6, [x19, x0]
str q20, [x19, x7]
str q22, [x19, x6]
str q24, [x19, x5]
str q25, [x19, x4]
str q26, [x19, x11]
str q27, [x19, x2]
cmp x24, x10
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# OSACA-BEGIN
.L19:
vmovups 0(%r13,%rax), %xmm0
vmovups 16(%r13,%rax), %xmm3
vmovups 32(%r13,%rax), %xmm4
vmovups 48(%r13,%rax), %xmm6
vmovups 64(%r13,%rax), %xmm9
vmovups 80(%r13,%rax), %xmm11
vmovups 96(%r13,%rax), %xmm13
vmovups 112(%r13,%rax), %xmm15
vaddpd (%r12,%rax), %xmm0, %xmm7
vaddpd 16(%r12,%rax), %xmm3, %xmm2
vaddpd 32(%r12,%rax), %xmm4, %xmm5
vaddpd 48(%r12,%rax), %xmm6, %xmm8
vaddpd 64(%r12,%rax), %xmm9, %xmm10
vaddpd 80(%r12,%rax), %xmm11, %xmm12
vaddpd 96(%r12,%rax), %xmm13, %xmm14
vaddpd 112(%r12,%rax), %xmm15, %xmm1
vmovups %xmm7, 0(%rbp,%rax)
vmovups %xmm2, 16(%rbp,%rax)
vmovups %xmm5, 32(%rbp,%rax)
vmovups %xmm8, 48(%rbp,%rax)
vmovups %xmm10, 64(%rbp,%rax)
vmovups %xmm12, 80(%rbp,%rax)
vmovups %xmm14, 96(%rbp,%rax)
vmovups %xmm1, 112(%rbp,%rax)
subq $-128, %rax
cmpq %rbx, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,28 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmovupd (%r12,%rcx), %ymm10
vmovupd 32(%r12,%rcx), %ymm11
vmovupd 64(%r12,%rcx), %ymm12
vmovupd 96(%r12,%rcx), %ymm13
vmovupd 128(%r12,%rcx), %ymm14
vmovupd 160(%r12,%rcx), %ymm15
vmovupd 192(%r12,%rcx), %ymm0
vmovupd 224(%r12,%rcx), %ymm1
vmovupd %ymm10, 0(%r13,%rcx)
vmovupd %ymm11, 32(%r13,%rcx)
vmovupd %ymm12, 64(%r13,%rcx)
vmovupd %ymm13, 96(%r13,%rcx)
vmovupd %ymm14, 128(%r13,%rcx)
vmovupd %ymm15, 160(%r13,%rcx)
vmovupd %ymm0, 192(%r13,%rcx)
vmovupd %ymm1, 224(%r13,%rcx)
addq $256, %rcx
cmpq %rcx, %r10
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,15 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.39: # Preds ..B1.39 ..B1.38
# Execution count [2.22e+03]
vmovups (%r14,%rax,8), %zmm1 #79.5
vmovupd %zmm1, (%r13,%rax,8) #79.5
addq $8, %rax #79.5
cmpq %r12, %rax #79.5
jb ..B1.39 # Prob 82% #79.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,42 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x9, #-256]
ldp q2, q3, [x9, #-224]
stp q0, q1, [x10, #-256]
stp q2, q3, [x10, #-224]
add x8, x8, #64 // =64
ldp q0, q1, [x9]
ldp q2, q3, [x9, #32]
stp q0, q1, [x10]
stp q2, q3, [x10, #32]
ldp q0, q1, [x9, #-192]
ldp q2, q3, [x9, #-160]
stp q0, q1, [x10, #-192]
stp q2, q3, [x10, #-160]
ldp q0, q1, [x9, #64]
ldp q2, q3, [x9, #96]
stp q0, q1, [x10, #64]
stp q2, q3, [x10, #96]
ldp q0, q1, [x9, #-128]
ldp q2, q3, [x9, #-96]
stp q0, q1, [x10, #-128]
stp q2, q3, [x10, #-96]
ldp q0, q1, [x9, #128]
ldp q2, q3, [x9, #160]
stp q0, q1, [x10, #128]
stp q2, q3, [x10, #160]
ldp q0, q1, [x9, #-64]
ldp q2, q3, [x9, #-32]
stp q0, q1, [x10, #-64]
stp q2, q3, [x10, #-32]
ldp q0, q1, [x9, #192]
ldp q2, q3, [x9, #224]
add x9, x9, #512 // =512
stp q0, q1, [x10, #192]
stp q2, q3, [x10, #224]
add x10, x10, #512 // =512
adds x11, x11, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,29 @@
// OSACA-BEGIN
.L17:
add x16, x15, 16
ldr q9, [x19, x15]
add x30, x15, 32
add x17, x15, 48
ldr q16, [x19, x16]
ldr q18, [x19, x30]
add x18, x15, 64
add x1, x15, 80
ldr q17, [x19, x17]
ldr q19, [x19, x18]
add x3, x15, 96
add x2, x15, 112
ldr q20, [x19, x1]
ldr q21, [x19, x3]
str q9, [x20, x15]
ldr q22, [x19, x2]
add x15, x15, 128
str q16, [x20, x16]
str q18, [x20, x30]
str q17, [x20, x17]
str q19, [x20, x18]
str q20, [x20, x1]
str q21, [x20, x3]
str q22, [x20, x2]
cmp x23, x15
bne .L17
// OSACA-END

View File

@@ -0,0 +1,22 @@
# OSACA-BEGIN
.L19:
vmovups 0(%rbp,%r10), %xmm9
vmovups 16(%rbp,%r10), %xmm10
vmovups 32(%rbp,%r10), %xmm11
vmovups 48(%rbp,%r10), %xmm12
vmovups 64(%rbp,%r10), %xmm13
vmovups 80(%rbp,%r10), %xmm14
vmovups 96(%rbp,%r10), %xmm15
vmovups 112(%rbp,%r10), %xmm0
vmovups %xmm9, (%r12,%r10)
vmovups %xmm10, 16(%r12,%r10)
vmovups %xmm11, 32(%r12,%r10)
vmovups %xmm12, 48(%r12,%r10)
vmovups %xmm13, 64(%r12,%r10)
vmovups %xmm14, 80(%r12,%r10)
vmovups %xmm15, 96(%r12,%r10)
vmovups %xmm0, 112(%r12,%r10)
subq $-128, %r10
cmpq %r10, %r15
jne .L19
# OSACA-END

View File

@@ -1,13 +0,0 @@
void daxpy(int N){
void dummy(double*, double*);
double a[N], b[N];
double s;
//STARTLOOP
for(int i=0; i<N; ++i)
a[i] = a[i] + s * b[i];
dummy(&a[1], &b[1]);
}

View File

@@ -0,0 +1,36 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmovupd 0(%r13,%rsi), %ymm14
vmovupd 32(%r13,%rsi), %ymm15
vmovupd 64(%r13,%rsi), %ymm1
vmovupd 96(%r13,%rsi), %ymm0
vmovupd 128(%r13,%rsi), %ymm3
vmovupd 160(%r13,%rsi), %ymm4
vmovupd 192(%r13,%rsi), %ymm5
vmovupd 224(%r13,%rsi), %ymm7
vfmadd213pd (%r12,%rsi), %ymm6, %ymm14
vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15
vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1
vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0
vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3
vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4
vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5
vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7
vmovupd %ymm14, (%r12,%rsi)
vmovupd %ymm15, 32(%r12,%rsi)
vmovupd %ymm1, 64(%r12,%rsi)
vmovupd %ymm0, 96(%r12,%rsi)
vmovupd %ymm3, 128(%r12,%rsi)
vmovupd %ymm4, 160(%r12,%rsi)
vmovupd %ymm5, 192(%r12,%rsi)
vmovupd %ymm7, 224(%r12,%rsi)
addq $256, %rsi
cmpq %rsi, %r10
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,16 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.39: # Preds ..B1.39 ..B1.38
# Execution count [2.22e+03]
vmovups (%r13,%rax,8), %zmm1 #77.5
vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5
vmovupd %zmm1, (%r14,%rax,8) #77.5
addq $8, %rax #77.5
cmpq %rbx, %rax #77.5
jb ..B1.39 # Prob 82% #77.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,90 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q1, q2, [x9, #-256]
ldp q3, q0, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
fmla v1.2d, v4.2d, v31.2d
fmla v2.2d, v5.2d, v31.2d
stp q1, q2, [x9, #-256]
fmla v3.2d, v6.2d, v31.2d
fmla v0.2d, v7.2d, v31.2d
stp q3, q0, [x9, #-224]
ldp q5, q6, [x9, #-192]
ldp q7, q4, [x9, #-160]
ldp q16, q17, [x10, #-192]
ldp q18, q19, [x10, #-160]
fmla v5.2d, v16.2d, v31.2d
fmla v6.2d, v17.2d, v31.2d
stp q5, q6, [x9, #-192]
fmla v7.2d, v18.2d, v31.2d
fmla v4.2d, v19.2d, v31.2d
stp q7, q4, [x9, #-160]
ldp q19, q18, [x9, #-128]
ldp q16, q17, [x9, #-96]
ldp q20, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
fmla v18.2d, v21.2d, v31.2d
fmla v16.2d, v22.2d, v31.2d
ldp q21, q22, [x9, #-64]
ldp q24, q25, [x10, #-64]
fmla v19.2d, v20.2d, v31.2d
stp q19, q18, [x9, #-128]
fmla v17.2d, v23.2d, v31.2d
stp q16, q17, [x9, #-96]
ldp q23, q20, [x9, #-32]
ldp q26, q27, [x10, #-32]
fmla v21.2d, v24.2d, v31.2d
fmla v22.2d, v25.2d, v31.2d
stp q21, q22, [x9, #-64]
ldp q24, q25, [x9]
ldp q28, q29, [x10]
fmla v23.2d, v26.2d, v31.2d
fmla v20.2d, v27.2d, v31.2d
stp q23, q20, [x9, #-32]
ldp q26, q27, [x9, #32]
fmla v24.2d, v28.2d, v31.2d
fmla v25.2d, v29.2d, v31.2d
stp q24, q25, [x9]
ldp q28, q29, [x10, #32]
fmla v26.2d, v28.2d, v31.2d
fmla v27.2d, v29.2d, v31.2d
stp q26, q27, [x9, #32]
ldp q24, q25, [x9, #64]
ldp q28, q29, [x10, #64]
ldp q26, q27, [x9, #96]
fmla v24.2d, v28.2d, v31.2d
fmla v25.2d, v29.2d, v31.2d
stp q24, q25, [x9, #64]
ldp q28, q29, [x10, #96]
fmla v26.2d, v28.2d, v31.2d
fmla v27.2d, v29.2d, v31.2d
stp q26, q27, [x9, #96]
ldp q24, q25, [x9, #128]
ldp q26, q27, [x10, #128]
fmla v24.2d, v26.2d, v31.2d
fmla v25.2d, v27.2d, v31.2d
stp q24, q25, [x9, #128]
ldp q26, q27, [x9, #160]
ldp q1, q2, [x10, #160]
fmla v26.2d, v1.2d, v31.2d
fmla v27.2d, v2.2d, v31.2d
stp q26, q27, [x9, #160]
ldp q0, q1, [x9, #192]
ldp q2, q3, [x10, #192]
fmla v0.2d, v2.2d, v31.2d
fmla v1.2d, v3.2d, v31.2d
stp q0, q1, [x9, #192]
ldp q2, q3, [x9, #224]
ldp q4, q5, [x10, #224]
fmla v2.2d, v4.2d, v31.2d
fmla v3.2d, v5.2d, v31.2d
stp q2, q3, [x9, #224]
add x8, x8, #64 // =64
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x11, x11, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,41 @@
// OSACA-BEGIN
.L17:
mov x5, x3
ldr q23, [x10]
ldr q24, [x5], 16
mov x6, x10
ldr q25, [x3, 16]
ldr q26, [x3, 48]
add x10, x10, 128
add x3, x3, 128
ldr q27, [x3, -64]
ldr q28, [x3, -48]
ldr q29, [x3, -32]
ldr q30, [x3, -16]
fmla v23.2d, v3.2d, v24.2d
ldr q31, [x5, 16]
str q23, [x6], 16
ldr q0, [x10, -112]
fmla v0.2d, v3.2d, v25.2d
str q0, [x10, -112]
ldr q2, [x6, 16]
fmla v2.2d, v3.2d, v31.2d
str q2, [x6, 16]
ldr q5, [x10, -80]
ldr q4, [x10, -64]
ldr q6, [x10, -48]
ldr q1, [x10, -32]
ldr q7, [x10, -16]
fmla v5.2d, v3.2d, v26.2d
fmla v4.2d, v3.2d, v27.2d
fmla v6.2d, v3.2d, v28.2d
fmla v1.2d, v3.2d, v29.2d
fmla v7.2d, v3.2d, v30.2d
str q5, [x10, -80]
str q4, [x10, -64]
str q6, [x10, -48]
str q1, [x10, -32]
str q7, [x10, -16]
cmp x23, x10
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# OSACA-BEGIN
.L19:
vmovups (%r12,%rax), %xmm12
vmovups 16(%r12,%rax), %xmm13
vmovups 32(%r12,%rax), %xmm14
vmovups 48(%r12,%rax), %xmm15
vmovups 64(%r12,%rax), %xmm1
vmovups 80(%r12,%rax), %xmm0
vmovups 96(%r12,%rax), %xmm4
vmovups 112(%r12,%rax), %xmm5
vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12
vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13
vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14
vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15
vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1
vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0
vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4
vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5
vmovups %xmm12, 0(%rbp,%rax)
vmovups %xmm13, 16(%rbp,%rax)
vmovups %xmm14, 32(%rbp,%rax)
vmovups %xmm15, 48(%rbp,%rax)
vmovups %xmm1, 64(%rbp,%rax)
vmovups %xmm0, 80(%rbp,%rax)
vmovups %xmm4, 96(%rbp,%rax)
vmovups %xmm5, 112(%rbp,%rax)
subq $-128, %rax
cmpq %r15, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,67 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L31:
vmovsd (%rax,%rsi,8), %xmm7
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
vaddsd 8(%rax), %xmm7, %xmm10
leaq 8(%rax), %rdx
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, (%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
vaddsd 16(%rax), %xmm14, %xmm15
leaq 16(%rax), %rdx
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm3
vmovsd %xmm3, 8(%rax)
vmovsd (%rdx,%rsi,8), %xmm2
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
vaddsd 24(%rax), %xmm2, %xmm4
leaq 24(%rax), %rdx
vaddsd %xmm5, %xmm4, %xmm6
vmulsd %xmm9, %xmm6, %xmm8
vmovsd %xmm8, 16(%rax)
vmovsd (%rdx,%rsi,8), %xmm7
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
vaddsd 32(%rax), %xmm7, %xmm10
leaq 32(%rax), %rdx
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, 24(%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
vaddsd 40(%rax), %xmm14, %xmm15
leaq 40(%rax), %rdx
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm3
vmovsd %xmm3, 32(%rax)
vmovsd (%rdx,%rsi,8), %xmm2
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
vaddsd 48(%rax), %xmm2, %xmm4
leaq 48(%rax), %rdx
vaddsd %xmm5, %xmm4, %xmm6
vmulsd %xmm9, %xmm6, %xmm8
vmovsd %xmm8, 40(%rax)
vmovsd (%rdx,%rsi,8), %xmm7
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
vaddsd 56(%rax), %xmm7, %xmm10
leaq 56(%rax), %rdx
addq $64, %rax
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, -16(%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
vaddsd (%rax), %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm8
vmovsd %xmm8, -8(%rax)
cmpq %r8, %rax
jne .L31
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,36 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.58: # Preds ..B1.58 ..B1.57
# Execution count [9.36e+01]
vmovsd 8(%r11,%r10), %xmm2 #55.35
incq %r15 #54.9
vaddsd 16(%r11,%r12), %xmm2, %xmm3 #55.12
vaddsd 8(%r11,%rbx), %xmm3, %xmm4 #55.12
vaddsd %xmm1, %xmm4, %xmm1 #55.12
vmulsd %xmm1, %xmm0, %xmm5 #55.12
vmovsd %xmm5, 8(%r11,%r12) #55.12
vaddsd 16(%r11,%r10), %xmm5, %xmm6 #55.48
vaddsd 24(%r11,%r12), %xmm6, %xmm7 #55.63
vaddsd 16(%r11,%rbx), %xmm7, %xmm8 #55.79
vmulsd %xmm8, %xmm0, %xmm9 #55.12
vmovsd %xmm9, 16(%r11,%r12) #55.12
vaddsd 24(%r11,%r10), %xmm9, %xmm10 #55.48
vaddsd 32(%r11,%r12), %xmm10, %xmm11 #55.63
vaddsd 24(%r11,%rbx), %xmm11, %xmm12 #55.79
vmulsd %xmm12, %xmm0, %xmm13 #55.12
vmovsd %xmm13, 24(%r11,%r12) #55.12
vaddsd 32(%r11,%r10), %xmm13, %xmm14 #55.48
vaddsd 40(%r11,%r12), %xmm14, %xmm15 #55.63
vaddsd 32(%r11,%rbx), %xmm15, %xmm16 #55.79
vmulsd %xmm16, %xmm0, %xmm1 #55.12
vmovsd %xmm1, 32(%r11,%r12) #55.12
addq $32, %r11 #54.9
cmpq %r14, %r15 #54.9
jb ..B1.58 # Prob 28% #54.9
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,19 @@
// OSACA-BEGIN
.LBB0_62: // %L.LB1_398.1
// Parent Loop BB0_50 Depth=1
// Parent Loop BB0_55 Depth=2
// Parent Loop BB0_59 Depth=3
// => This Inner Loop Header: Depth=4
ldr d1, [x7], #8
fadd d0, d1, d0
ldr d2, [x22]
ldr d3, [x23], #8
fadd d2, d2, d3
fadd d0, d0, d2
sub w26, w26, #1 // =1
fmul d0, d0, d9
stur d0, [x22, #-8]
add x22, x22, #8 // =8
cmp w26, #2 // =2
b.gt .LBB0_62
// OSACA-END

View File

@@ -0,0 +1,41 @@
// OSACA-BEGIN
.L20:
ldr d31, [x15, x18, lsl 3]
ldr d0, [x15, 8]
mov x14, x15
add x16, x15, 24
ldr d2, [x15, x30, lsl 3]
add x15, x15, 32
fadd d1, d31, d0
fadd d3, d1, d30
fadd d4, d3, d2
fmul d5, d4, d9
str d5, [x14], 8
ldr d6, [x14, x18, lsl 3]
ldr d16, [x14, 8]
add x13, x14, 8
ldr d7, [x14, x30, lsl 3]
fadd d17, d6, d16
fadd d18, d17, d5
fadd d19, d18, d7
fmul d20, d19, d9
str d20, [x15, -24]
ldr d21, [x13, x18, lsl 3]
ldr d23, [x14, 16]
ldr d22, [x13, x30, lsl 3]
fadd d24, d21, d23
fadd d25, d24, d20
fadd d26, d25, d22
fmul d27, d26, d9
str d27, [x14, 8]
ldr d30, [x15]
ldr d28, [x16, x18, lsl 3]
ldr d29, [x16, x30, lsl 3]
fadd d31, d28, d30
fadd d2, d31, d27
fadd d0, d2, d29
fmul d30, d0, d9
str d30, [x15, -8]
cmp x7, x15
bne .L20
// OSACA-END

View File

@@ -0,0 +1,61 @@
# OSACA-BEGIN
.L32:
vmovsd (%rax,%rsi,8), %xmm7
leaq 8(%rax), %rdx
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
vaddsd 8(%rax), %xmm7, %xmm10
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, (%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
leaq 16(%rax), %rdx
vaddsd 16(%rax), %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm3
vmovsd %xmm3, 8(%rax)
vmovsd (%rdx,%rsi,8), %xmm2
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
leaq 24(%rax), %rdx
vaddsd 24(%rax), %xmm2, %xmm4
vaddsd %xmm5, %xmm4, %xmm6
vmulsd %xmm9, %xmm6, %xmm8
vmovsd %xmm8, 16(%rax)
vmovsd (%rdx,%rsi,8), %xmm7
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
leaq 32(%rax), %rdx
vaddsd 32(%rax), %xmm7, %xmm10
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, 24(%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
leaq 40(%rax), %rdx
vaddsd 40(%rax), %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm3
vmovsd %xmm3, 32(%rax)
vmovsd (%rdx,%rsi,8), %xmm2
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
leaq 48(%rax), %rdx
vaddsd 48(%rax), %xmm2, %xmm4
vaddsd %xmm5, %xmm4, %xmm6
vmulsd %xmm9, %xmm6, %xmm8
vmovsd %xmm8, 40(%rax)
vmovsd (%rdx,%rsi,8), %xmm7
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
leaq 56(%rax), %rdx
vaddsd 56(%rax), %xmm7, %xmm10
addq $64, %rax
vaddsd %xmm11, %xmm10, %xmm12
vmulsd %xmm9, %xmm12, %xmm13
vmovsd %xmm13, -16(%rax)
vmovsd (%rdx,%rsi,8), %xmm14
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
vaddsd (%rax), %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm0
vmulsd %xmm9, %xmm0, %xmm8
vmovsd %xmm8, -8(%rax)
cmpq %r8, %rax
jne .L32
# OSACA-END

View File

@@ -0,0 +1,40 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L21:
vmovupd (%r8,%rax), %ymm11
vmovupd (%rsi,%rax), %ymm13
vaddpd (%r9,%rax), %ymm11, %ymm12
vaddpd (%rdi,%rax), %ymm13, %ymm14
vmovupd 32(%r8,%rax), %ymm1
vmovupd 32(%rsi,%rax), %ymm2
vaddpd %ymm14, %ymm12, %ymm15
vaddpd 32(%r9,%rax), %ymm1, %ymm5
vaddpd 32(%rdi,%rax), %ymm2, %ymm7
vmulpd %ymm8, %ymm15, %ymm0
vmovupd 64(%r8,%rax), %ymm10
vaddpd %ymm7, %ymm5, %ymm6
vmovupd 64(%rsi,%rax), %ymm12
vmovupd 96(%rsi,%rax), %ymm5
vmovupd %ymm0, (%rdx,%rax)
vmovupd 96(%r8,%rax), %ymm0
vaddpd 64(%r9,%rax), %ymm10, %ymm11
vaddpd 64(%rdi,%rax), %ymm12, %ymm13
vaddpd 96(%r9,%rax), %ymm0, %ymm1
vaddpd 96(%rdi,%rax), %ymm5, %ymm2
vaddpd %ymm13, %ymm11, %ymm14
vmulpd %ymm8, %ymm6, %ymm9
vaddpd %ymm2, %ymm1, %ymm7
vmulpd %ymm8, %ymm14, %ymm15
vmulpd %ymm8, %ymm7, %ymm6
vmovupd %ymm9, 32(%rdx,%rax)
vmovupd %ymm15, 64(%rdx,%rax)
vmovupd %ymm6, 96(%rdx,%rax)
subq $-128, %rax
cmpq %rax, %r15
jne .L21
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,46 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L28:
movupd 16(%r8,%rax), %xmm11
movupd 16(%rdi,%rax), %xmm12
movupd 16(%rsi,%rax), %xmm13
addpd %xmm11, %xmm15
addpd %xmm13, %xmm12
movupd 32(%rdi,%rax), %xmm14
movupd 32(%rsi,%rax), %xmm0
addpd %xmm15, %xmm12
movupd 32(%r8,%rax), %xmm15
addpd %xmm0, %xmm14
addpd %xmm15, %xmm11
movupd 48(%rdi,%rax), %xmm1
movupd 48(%rsi,%rax), %xmm7
addpd %xmm11, %xmm14
addpd %xmm7, %xmm1
mulpd %xmm2, %xmm12
mulpd %xmm2, %xmm14
movups %xmm12, 16(%rcx,%rax)
movups %xmm14, 32(%rcx,%rax)
movupd 48(%r8,%rax), %xmm14
addpd %xmm14, %xmm15
addpd %xmm15, %xmm1
mulpd %xmm2, %xmm1
movups %xmm1, 48(%rcx,%rax)
addq $64, %rax
.L21:
movupd (%r8,%rax), %xmm15
movupd (%rdi,%rax), %xmm0
movupd (%rsi,%rax), %xmm1
addpd %xmm15, %xmm14
addpd %xmm1, %xmm0
leaq 16(%rax), %r10
addpd %xmm0, %xmm14
mulpd %xmm2, %xmm14
movups %xmm14, (%rcx,%rax)
cmpq %r10, %r14
jne .L28
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,37 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.47: # Preds ..B1.47 ..B1.46
# Execution count [1.15e+04]
vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5
vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5
vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5
vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5
vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5
vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5
vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5
vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5
vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5
vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5
vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5
vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5
vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5
vmulpd %ymm4, %ymm0, %ymm5 #94.5
vmulpd %ymm9, %ymm0, %ymm10 #94.5
vmulpd %ymm14, %ymm0, %ymm15 #94.5
vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5
vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5
vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5
vmovupd %ymm15, 10080(%r9,%rcx,8) #94.5
vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5
vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5
vmulpd %ymm4, %ymm0, %ymm5 #94.5
vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5
addq $16, %rcx #94.5
cmpq %r14, %rcx #94.5
jb ..B1.47 # Prob 82% #94.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,69 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.47: # Preds ..B1.63 ..B1.46
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.48: # Preds ..B1.47
# Execution count [1.73e+04]
vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.51: # Preds ..B1.48
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.52: # Preds ..B1.51
# Execution count [1.73e+04]
vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.55: # Preds ..B1.52
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.56: # Preds ..B1.55
# Execution count [1.73e+04]
vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.59: # Preds ..B1.56
# Execution count [1.15e+04]
lea (%r12,%r11), %r8 #94.5
vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
..B1.60: # Preds ..B1.59
# Execution count [1.73e+04]
vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5
vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
..B1.63: # Preds ..B1.60
# Execution count [1.15e+04]
vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5
vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5
vaddpd %zmm2, %zmm1, %zmm2 #94.5
vmulpd %zmm2, %zmm4, %zmm3 #94.5
vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5
addq $32, %rcx #94.5
cmpq %r14, %rcx #94.5
jb ..B1.47 # Prob 82% #94.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,40 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.42: # Preds ..B1.42 ..B1.41
# Execution count [1.15e+04]
movups 10016(%r8,%rcx,8), %xmm0 #94.5
addpd 16(%r12,%rcx,8), %xmm0 #94.5
addpd 20032(%r10,%rcx,8), %xmm0 #94.5
movups 10032(%r8,%rcx,8), %xmm2 #94.5
movups 32(%r12,%rcx,8), %xmm1 #94.5
addpd %xmm2, %xmm0 #94.5
addpd %xmm1, %xmm2 #94.5
mulpd %xmm7, %xmm0 #94.5
addpd 20048(%r10,%rcx,8), %xmm2 #94.5
movups 10048(%r8,%rcx,8), %xmm4 #94.5
movups 48(%r12,%rcx,8), %xmm3 #94.5
addpd %xmm4, %xmm2 #94.5
addpd %xmm3, %xmm4 #94.5
mulpd %xmm7, %xmm2 #94.5
addpd 20064(%r10,%rcx,8), %xmm4 #94.5
movups 10064(%r8,%rcx,8), %xmm6 #94.5
movups 64(%r12,%rcx,8), %xmm5 #94.5
addpd %xmm6, %xmm4 #94.5
addpd %xmm5, %xmm6 #94.5
mulpd %xmm7, %xmm4 #94.5
addpd 20080(%r10,%rcx,8), %xmm6 #94.5
addpd 10080(%r8,%rcx,8), %xmm6 #94.5
mulpd %xmm7, %xmm6 #94.5
movups %xmm0, 10016(%r9,%rcx,8) #94.5
movups %xmm2, 10032(%r9,%rcx,8) #94.5
movups %xmm4, 10048(%r9,%rcx,8) #94.5
movups %xmm6, 10064(%r9,%rcx,8) #94.5
addq $8, %rcx #94.5
cmpq %r14, %rcx #94.5
jb ..B1.42 # Prob 82% #94.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,131 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_16 Depth=1
// Parent Loop BB1_19 Depth=2
// Parent Loop BB1_24 Depth=3
// => This Inner Loop Header: Depth=4
add x0, x5, x16
add x18, x21, x16
ldp q4, q5, [x0, #16]
ldp q6, q7, [x0, #48]
ldur q0, [x18, #8]
ldur q1, [x18, #24]
ldur q2, [x18, #40]
ldur q3, [x18, #56]
add x1, x28, x16
add x15, x15, #32 // =32
fadd v0.2d, v4.2d, v0.2d
fadd v4.2d, v5.2d, v1.2d
fadd v5.2d, v6.2d, v2.2d
fadd v6.2d, v7.2d, v3.2d
ldp q7, q16, [x1, #16]
fadd v1.2d, v7.2d, v1.2d
ldp q17, q18, [x1, #48]
ldur q19, [x18, #72]
fadd v0.2d, v0.2d, v1.2d
fadd v1.2d, v16.2d, v2.2d
fadd v2.2d, v17.2d, v3.2d
fadd v3.2d, v18.2d, v19.2d
ldp q16, q17, [x0, #80]
ldp q18, q19, [x0, #112]
fadd v1.2d, v4.2d, v1.2d
fadd v2.2d, v5.2d, v2.2d
fadd v3.2d, v6.2d, v3.2d
ldur q4, [x18, #72]
ldur q5, [x18, #88]
ldur q6, [x18, #104]
ldur q7, [x18, #120]
fadd v4.2d, v16.2d, v4.2d
fadd v16.2d, v17.2d, v5.2d
fadd v17.2d, v18.2d, v6.2d
fadd v18.2d, v19.2d, v7.2d
ldp q19, q20, [x1, #80]
fadd v5.2d, v19.2d, v5.2d
ldp q21, q22, [x1, #112]
ldur q23, [x18, #136]
fadd v4.2d, v4.2d, v5.2d
fadd v5.2d, v20.2d, v6.2d
fadd v6.2d, v21.2d, v7.2d
fadd v7.2d, v22.2d, v23.2d
ldp q20, q21, [x0, #144]
ldp q22, q23, [x0, #176]
fadd v5.2d, v16.2d, v5.2d
fadd v6.2d, v17.2d, v6.2d
fadd v7.2d, v18.2d, v7.2d
ldur q16, [x18, #136]
ldur q17, [x18, #152]
ldur q18, [x18, #168]
ldur q19, [x18, #184]
fadd v16.2d, v20.2d, v16.2d
fadd v20.2d, v21.2d, v17.2d
fadd v21.2d, v22.2d, v18.2d
fadd v22.2d, v23.2d, v19.2d
ldp q23, q24, [x1, #144]
fadd v17.2d, v23.2d, v17.2d
ldp q25, q26, [x1, #176]
fadd v16.2d, v16.2d, v17.2d
fadd v17.2d, v24.2d, v18.2d
fadd v18.2d, v25.2d, v19.2d
ldp q24, q25, [x0, #208]
ldur q23, [x18, #200]
fadd v17.2d, v20.2d, v17.2d
fadd v18.2d, v21.2d, v18.2d
ldur q20, [x18, #200]
ldur q21, [x18, #216]
fadd v19.2d, v26.2d, v23.2d
fadd v20.2d, v24.2d, v20.2d
fadd v24.2d, v25.2d, v21.2d
ldp q25, q26, [x1, #208]
fadd v21.2d, v25.2d, v21.2d
fadd v20.2d, v20.2d, v21.2d
ldp q21, q25, [x0, #240]
fadd v19.2d, v22.2d, v19.2d
ldur q22, [x18, #232]
fadd v21.2d, v21.2d, v22.2d
fadd v22.2d, v26.2d, v22.2d
fadd v22.2d, v24.2d, v22.2d
ldp q24, q26, [x1, #240]
ldur q23, [x18, #248]
fadd v25.2d, v25.2d, v23.2d
fadd v23.2d, v24.2d, v23.2d
add x18, x18, #264 // =264
fmul v0.2d, v0.2d, v28.2d
fmul v1.2d, v1.2d, v28.2d
fmul v2.2d, v2.2d, v28.2d
fmul v5.2d, v5.2d, v28.2d
fadd v21.2d, v21.2d, v23.2d
ldr q23, [x18]
add x18, x25, x16
stur q0, [x18, #8]
stur q1, [x18, #24]
fmul v3.2d, v3.2d, v28.2d
stur q2, [x18, #40]
fadd v23.2d, v26.2d, v23.2d
stur q5, [x18, #88]
fmul v4.2d, v4.2d, v28.2d
stur q3, [x18, #56]
fmul v6.2d, v6.2d, v28.2d
stur q4, [x18, #72]
fmul v0.2d, v7.2d, v28.2d
stur q6, [x18, #104]
fmul v1.2d, v16.2d, v28.2d
stur q0, [x18, #120]
fmul v2.2d, v17.2d, v28.2d
stur q1, [x18, #136]
fmul v4.2d, v19.2d, v28.2d
stur q2, [x18, #152]
fadd v5.2d, v25.2d, v23.2d
stur q4, [x18, #184]
fmul v3.2d, v18.2d, v28.2d
stur q3, [x18, #168]
fmul v6.2d, v20.2d, v28.2d
stur q6, [x18, #200]
fmul v0.2d, v22.2d, v28.2d
stur q0, [x18, #216]
fmul v1.2d, v21.2d, v28.2d
stur q1, [x18, #232]
add x16, x16, #256 // =256
fmul v2.2d, v5.2d, v28.2d
stur q2, [x18, #248]
adds x17, x17, #4 // =4
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,43 @@
// OSACA-BEGIN
.L93:
add x5, x0, 16
ldr q2, [x14, x0]
ldr q5, [x25, x0]
add x7, x0, 32
ldr q13, [x22, x0]
ldr q4, [x25, x5]
add x6, x0, 48
ldr x9, [sp, 144]
ldr q19, [x22, x5]
ldr q7, [x14, x5]
ldr q6, [x14, x7]
ldr q3, [x25, x7]
ldr q18, [x22, x7]
fadd v17.2d, v2.2d, v30.2d
ldr q16, [x14, x6]
ldr q20, [x25, x6]
fadd v23.2d, v5.2d, v13.2d
ldr q22, [x22, x6]
fadd v24.2d, v4.2d, v19.2d
fadd v25.2d, v7.2d, v2.2d
fadd v27.2d, v6.2d, v7.2d
fadd v26.2d, v3.2d, v18.2d
fadd v28.2d, v16.2d, v6.2d
mov v30.16b, v16.16b
fadd v29.2d, v20.2d, v22.2d
fadd v31.2d, v23.2d, v17.2d
fadd v0.2d, v24.2d, v25.2d
fadd v2.2d, v26.2d, v27.2d
fadd v1.2d, v29.2d, v28.2d
fmul v5.2d, v31.2d, v21.2d
fmul v13.2d, v0.2d, v21.2d
fmul v4.2d, v2.2d, v21.2d
fmul v19.2d, v1.2d, v21.2d
str q5, [x28, x0]
add x0, x0, 64
str q13, [x28, x5]
str q4, [x28, x7]
str q19, [x28, x6]
cmp x9, x0
bne .L93
// OSACA-END

View File

@@ -0,0 +1,36 @@
# OSACA-BEGIN
.L28:
vmovups (%r10,%rcx), %xmm5
vmovups 32(%r10,%rax), %xmm13
vmovups (%rdi,%rcx), %xmm1
vmovups 32(%rdi,%rax), %xmm14
vmovups 48(%rdi,%rax), %xmm9
vaddpd (%r8,%rcx), %xmm1, %xmm10
vaddpd 32(%r8,%rax), %xmm14, %xmm15
vaddpd 48(%r8,%rax), %xmm9, %xmm1
vaddpd %xmm5, %xmm8, %xmm8
vaddpd %xmm13, %xmm5, %xmm6
vmovups 48(%r10,%rax), %xmm5
vaddpd %xmm8, %xmm10, %xmm11
vaddpd %xmm6, %xmm15, %xmm0
vmulpd %xmm2, %xmm11, %xmm12
vaddpd %xmm5, %xmm13, %xmm4
vmulpd %xmm2, %xmm0, %xmm7
vaddpd %xmm4, %xmm1, %xmm10
vmovups %xmm12, (%rsi,%rcx)
vmovups %xmm7, 32(%rsi,%rax)
vmulpd %xmm2, %xmm10, %xmm8
vmovups %xmm8, 48(%rsi,%rax)
addq $64, %rax
.L21:
vmovups (%r10,%rax), %xmm8
leaq 16(%rax), %rcx
vmovups (%rdi,%rax), %xmm9
vaddpd (%r8,%rax), %xmm9, %xmm10
vaddpd %xmm8, %xmm5, %xmm11
vaddpd %xmm11, %xmm10, %xmm12
vmulpd %xmm2, %xmm12, %xmm13
vmovups %xmm13, (%rsi,%rax)
cmpq %rcx, %r14
jne .L28
# OSACA-END

View File

@@ -1,13 +0,0 @@
void scale(int N){
void dummy(double*, double*);
double a[N], b[N];
double s;
//STARTLOOP
for(int i=0; i<N; ++i){
a[i] = s * b[i];
}
dummy(&a[1],&b[1]);
}

View File

@@ -0,0 +1,44 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmovupd (%r15,%rax), %ymm5
vmovupd 0(%r13,%rax), %ymm6
vmovupd 32(%r15,%rax), %ymm8
vmovupd 32(%r13,%rax), %ymm7
vmovupd 64(%r15,%rax), %ymm9
vmovupd 64(%r13,%rax), %ymm10
vmovupd 96(%r15,%rax), %ymm11
vmovupd 96(%r13,%rax), %ymm12
vmovupd 128(%r15,%rax), %ymm13
vmovupd 128(%r13,%rax), %ymm14
vmovupd 160(%r15,%rax), %ymm15
vmovupd 160(%r13,%rax), %ymm2
vmovupd 192(%r15,%rax), %ymm0
vmovupd 192(%r13,%rax), %ymm1
vmovupd 224(%r15,%rax), %ymm3
vmovupd 224(%r13,%rax), %ymm4
vfmadd132pd (%r14,%rax), %ymm6, %ymm5
vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8
vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9
vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11
vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13
vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15
vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0
vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3
vmovupd %ymm5, (%r12,%rax)
vmovupd %ymm8, 32(%r12,%rax)
vmovupd %ymm9, 64(%r12,%rax)
vmovupd %ymm11, 96(%r12,%rax)
vmovupd %ymm13, 128(%r12,%rax)
vmovupd %ymm15, 160(%r12,%rax)
vmovupd %ymm0, 192(%r12,%rax)
vmovupd %ymm3, 224(%r12,%rax)
addq $256, %rax
cmpq %rax, %r8
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,21 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.41: # Preds ..B1.41 ..B1.40
# Execution count [2.22e+03]
vmovups (%rcx,%rax,8), %zmm2 #80.5
vmovups 64(%rcx,%rax,8), %zmm4 #80.5
vmovups (%r14,%rax,8), %zmm1 #80.5
vmovups 64(%r14,%rax,8), %zmm3 #80.5
vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5
vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5
vmovupd %zmm2, (%r13,%rax,8) #80.5
vmovupd %zmm4, 64(%r13,%rax,8) #80.5
addq $16, %rax #80.5
cmpq %r12, %rax #80.5
jb ..B1.41 # Prob 82% #80.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,112 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x9, #-256]
ldp q2, q3, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
ldp q16, q17, [x11, #-256]
ldp q18, q19, [x11, #-224]
fmla v0.2d, v16.2d, v4.2d
fmla v1.2d, v17.2d, v5.2d
stp q1, q0, [sp, #96] // 32-byte Folded Spill
fmla v2.2d, v18.2d, v6.2d
fmla v3.2d, v19.2d, v7.2d
ldp q4, q5, [x9, #-192]
ldp q6, q7, [x9, #-160]
ldp q16, q17, [x10, #-192]
ldp q18, q19, [x10, #-160]
ldp q20, q21, [x11, #-192]
ldp q22, q23, [x11, #-160]
fmla v4.2d, v20.2d, v16.2d
stp q3, q4, [x12, #-208]
fmla v5.2d, v21.2d, v17.2d
fmla v6.2d, v22.2d, v18.2d
stp q5, q6, [x12, #-176]
fmla v7.2d, v23.2d, v19.2d
ldp q16, q18, [x9, #-128]
ldp q17, q19, [x9, #-96]
ldp q20, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
ldp q24, q25, [x11, #-128]
ldp q26, q27, [x11, #-96]
fmla v16.2d, v24.2d, v20.2d
stp q7, q16, [x12, #-144]
fmla v18.2d, v25.2d, v21.2d
fmla v17.2d, v26.2d, v22.2d
stp q18, q17, [x12, #-112]
fmla v19.2d, v27.2d, v23.2d
ldp q22, q23, [x9, #-64]
ldp q20, q21, [x9, #-32]
ldp q24, q25, [x10, #-64]
ldp q26, q27, [x10, #-32]
ldp q28, q29, [x11, #-64]
ldp q30, q31, [x11, #-32]
fmla v22.2d, v28.2d, v24.2d
stp q19, q22, [x12, #-80]
fmla v23.2d, v29.2d, v25.2d
fmla v20.2d, v30.2d, v26.2d
stp q23, q20, [x12, #-48]
fmla v21.2d, v31.2d, v27.2d
stur q21, [x12, #-16]
ldp q24, q25, [x9]
ldp q26, q27, [x9, #32]
ldp q28, q29, [x10]
ldp q30, q31, [x10, #32]
ldp q8, q10, [x11]
ldp q11, q12, [x11, #32]
fmla v24.2d, v8.2d, v28.2d
fmla v25.2d, v10.2d, v29.2d
stp q24, q25, [x12]
fmla v26.2d, v11.2d, v30.2d
fmla v27.2d, v12.2d, v31.2d
stp q26, q27, [x12, #32]
ldp q28, q29, [x9, #64]
ldp q30, q31, [x9, #96]
ldp q8, q10, [x10, #64]
ldp q11, q12, [x10, #96]
ldp q13, q14, [x11, #64]
ldp q15, q9, [x11, #96]
fmla v28.2d, v13.2d, v8.2d
fmla v29.2d, v14.2d, v10.2d
stp q28, q29, [x12, #64]
fmla v30.2d, v15.2d, v11.2d
fmla v31.2d, v9.2d, v12.2d
stp q30, q31, [x12, #96]
ldp q8, q9, [x9, #128]
ldp q12, q13, [x10, #128]
ldp q14, q15, [x11, #128]
ldp q10, q11, [x9, #160]
fmla v8.2d, v14.2d, v12.2d
ldp q12, q14, [x10, #160]
fmla v9.2d, v15.2d, v13.2d
stp q8, q9, [x12, #128]
ldp q13, q15, [x11, #160]
fmla v10.2d, v13.2d, v12.2d
fmla v11.2d, v15.2d, v14.2d
stp q10, q11, [x12, #160]
ldp q12, q13, [x9, #192]
ldp q14, q15, [x10, #192]
ldp q0, q1, [x11, #192]
fmla v12.2d, v0.2d, v14.2d
ldr q0, [sp, #112] // 16-byte Folded Reload
stur q0, [x12, #-256]
ldr q0, [sp, #96] // 16-byte Folded Reload
stp q0, q2, [x12, #-240]
ldp q0, q2, [x9, #224]
ldp q3, q4, [x10, #224]
ldp q5, q6, [x11, #224]
fmla v13.2d, v1.2d, v15.2d
stp q12, q13, [x12, #192]
fmla v0.2d, v5.2d, v3.2d
fmla v2.2d, v6.2d, v4.2d
stp q0, q2, [x12, #224]
add x8, x8, #64 // =64
add x12, x12, #512 // =512
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x13, x13, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,53 @@
// OSACA-BEGIN
.L17:
add x12, x11, 16
ldr q29, [x22, x11]
ldr q30, [x20, x11]
add x7, x11, 32
ldr q31, [x21, x11]
ldr q7, [x22, x12]
add x6, x11, 48
add x5, x11, 64
ldr q6, [x20, x12]
ldr q2, [x21, x12]
add x8, x11, 80
add x0, x11, 96
ldr q9, [x22, x7]
ldr q5, [x20, x7]
add x13, x11, 112
ldr q1, [x21, x7]
ldr q16, [x22, x6]
ldr q4, [x20, x6]
ldr q0, [x21, x6]
fmla v30.2d, v29.2d, v31.2d
ldr q23, [x22, x5]
ldr q3, [x20, x5]
fmla v6.2d, v7.2d, v2.2d
ldr q22, [x21, x5]
ldr q21, [x22, x8]
ldr q24, [x20, x8]
ldr q20, [x21, x8]
fmla v5.2d, v9.2d, v1.2d
ldr q19, [x22, x0]
ldr q25, [x20, x0]
fmla v4.2d, v16.2d, v0.2d
ldr q18, [x21, x0]
ldr q17, [x22, x13]
ldr q26, [x20, x13]
ldr q27, [x21, x13]
fmla v3.2d, v23.2d, v22.2d
fmla v24.2d, v21.2d, v20.2d
str q30, [x19, x11]
add x11, x11, 128
str q6, [x19, x12]
fmla v25.2d, v19.2d, v18.2d
str q5, [x19, x7]
fmla v26.2d, v17.2d, v27.2d
str q4, [x19, x6]
str q3, [x19, x5]
str q24, [x19, x8]
str q25, [x19, x0]
str q26, [x19, x13]
cmp x25, x11
bne .L17
// OSACA-END

View File

@@ -0,0 +1,38 @@
# OSACA-BEGIN
.L19:
vmovups (%r14,%rax), %xmm0
vmovups (%r12,%rax), %xmm5
vmovups 16(%r14,%rax), %xmm3
vmovups 16(%r12,%rax), %xmm6
vmovups 32(%r14,%rax), %xmm4
vmovups 32(%r12,%rax), %xmm7
vmovups 48(%r14,%rax), %xmm8
vmovups 48(%r12,%rax), %xmm9
vmovups 64(%r14,%rax), %xmm10
vmovups 64(%r12,%rax), %xmm11
vmovups 80(%r14,%rax), %xmm12
vmovups 80(%r12,%rax), %xmm13
vmovups 96(%r14,%rax), %xmm14
vmovups 96(%r12,%rax), %xmm15
vmovups 112(%r14,%rax), %xmm2
vmovups 112(%r12,%rax), %xmm1
vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0
vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3
vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4
vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8
vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10
vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12
vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14
vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2
vmovups %xmm0, 0(%rbp,%rax)
vmovups %xmm3, 16(%rbp,%rax)
vmovups %xmm4, 32(%rbp,%rax)
vmovups %xmm8, 48(%rbp,%rax)
vmovups %xmm10, 64(%rbp,%rax)
vmovups %xmm12, 80(%rbp,%rax)
vmovups %xmm14, 96(%rbp,%rax)
vmovups %xmm2, 112(%rbp,%rax)
subq $-128, %rax
cmpq %rcx, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,46 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# LLVM-MCA-BEGIN
.L19:
vmovupd (%rcx), %ymm4
vmovupd 32(%rcx), %ymm13
vaddsd %xmm4, %xmm0, %xmm6
vunpckhpd %xmm4, %xmm4, %xmm3
vextractf64x2 $0x1, %ymm4, %xmm8
vaddsd %xmm6, %xmm3, %xmm7
vunpckhpd %xmm8, %xmm8, %xmm11
vunpckhpd %xmm13, %xmm13, %xmm1
vaddsd %xmm7, %xmm8, %xmm10
vextractf64x2 $0x1, %ymm13, %xmm2
vunpckhpd %xmm2, %xmm2, %xmm3
vaddsd %xmm11, %xmm10, %xmm12
vmovupd 64(%rcx), %ymm8
vmovupd 96(%rcx), %ymm5
vaddsd %xmm13, %xmm12, %xmm0
vunpckhpd %xmm8, %xmm8, %xmm12
vextractf64x2 $0x1, %ymm8, %xmm14
vaddsd %xmm0, %xmm1, %xmm4
vunpckhpd %xmm14, %xmm14, %xmm0
vextractf64x2 $0x1, %ymm5, %xmm9
vaddsd %xmm4, %xmm2, %xmm6
subq $-128, %rcx
vaddsd %xmm3, %xmm6, %xmm7
vaddsd %xmm8, %xmm7, %xmm11
vunpckhpd %xmm5, %xmm5, %xmm7
vaddsd %xmm11, %xmm12, %xmm13
vunpckhpd %xmm9, %xmm9, %xmm12
vaddsd %xmm13, %xmm14, %xmm1
vaddsd %xmm0, %xmm1, %xmm4
vaddsd %xmm5, %xmm4, %xmm3
vaddsd %xmm3, %xmm7, %xmm8
vaddsd %xmm8, %xmm9, %xmm11
vaddsd %xmm12, %xmm11, %xmm0
cmpq %rcx, %r15
jne .L19
# LLVM-MCA-END
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,20 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vaddpd (%rcx), %ymm3, %ymm4
addq $256, %rcx
vaddpd -224(%rcx), %ymm4, %ymm5
vaddpd -192(%rcx), %ymm5, %ymm6
vaddpd -160(%rcx), %ymm6, %ymm8
vaddpd -128(%rcx), %ymm8, %ymm9
vaddpd -96(%rcx), %ymm9, %ymm10
vaddpd -64(%rcx), %ymm10, %ymm11
vaddpd -32(%rcx), %ymm11, %ymm3
cmpq %rcx, %r15
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,17 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.38: # Preds ..B1.38 ..B1.37
# Execution count [2.22e+03]
vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5
vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5
vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5
vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5
addq $32, %rax #76.5
cmpq %r14, %rax #76.5
jb ..B1.38 # Prob 82% #76.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,57 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q4, q5, [x9, #-256]
fadd v0.2d, v4.2d, v0.2d
fadd v1.2d, v5.2d, v1.2d
ldp q4, q5, [x9, #-192]
ldp q16, q17, [x9, #-128]
fadd v4.2d, v4.2d, v16.2d
ldp q6, q7, [x9, #-224]
fadd v2.2d, v6.2d, v2.2d
fadd v3.2d, v7.2d, v3.2d
fadd v0.2d, v0.2d, v4.2d
fadd v4.2d, v5.2d, v17.2d
ldp q6, q7, [x9, #-160]
ldp q18, q19, [x9, #-96]
ldp q16, q17, [x9]
add x8, x8, #64 // =64
fadd v1.2d, v1.2d, v4.2d
fadd v4.2d, v6.2d, v18.2d
fadd v2.2d, v2.2d, v4.2d
fadd v4.2d, v7.2d, v19.2d
ldp q6, q7, [x9, #-32]
ldp q18, q19, [x9, #32]
fadd v6.2d, v6.2d, v18.2d
fadd v7.2d, v7.2d, v19.2d
fadd v3.2d, v3.2d, v4.2d
ldp q4, q5, [x9, #-64]
fadd v4.2d, v4.2d, v16.2d
fadd v5.2d, v5.2d, v17.2d
ldp q16, q17, [x9, #64]
fadd v4.2d, v4.2d, v16.2d
fadd v5.2d, v5.2d, v17.2d
ldp q16, q17, [x9, #128]
fadd v0.2d, v0.2d, v16.2d
fadd v1.2d, v1.2d, v17.2d
ldp q16, q17, [x9, #192]
ldp q18, q19, [x9, #96]
fadd v6.2d, v6.2d, v18.2d
fadd v7.2d, v7.2d, v19.2d
fadd v4.2d, v4.2d, v16.2d
ldp q18, q19, [x9, #160]
fadd v2.2d, v2.2d, v18.2d
fadd v3.2d, v3.2d, v19.2d
fadd v0.2d, v0.2d, v4.2d
fadd v4.2d, v5.2d, v17.2d
ldp q18, q19, [x9, #224]
add x9, x9, #512 // =512
fadd v1.2d, v1.2d, v4.2d
fadd v4.2d, v6.2d, v18.2d
fadd v2.2d, v2.2d, v4.2d
fadd v4.2d, v7.2d, v19.2d
fadd v3.2d, v3.2d, v4.2d
adds x10, x10, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,47 @@
// OSACA-BEGIN
.L17:
mov x17, x16
ldr q4, [x17], 16
ldr q5, [x16, 16]
add x16, x16, 128
ldr q3, [x16, -80]
ldr q2, [x16, -64]
ldr q0, [x16, -48]
ldr q1, [x16, -32]
ldr q7, [x16, -16]
dup d16, v4.d[0]
dup d6, v4.d[1]
ldr q4, [x17, 16]
dup d22, v5.d[0]
dup d5, v5.d[1]
dup d20, v3.d[0]
dup d3, v3.d[1]
dup d19, v2.d[0]
dup d2, v2.d[1]
dup d21, v4.d[0]
dup d4, v4.d[1]
fadd d10, d8, d16
dup d18, v0.d[0]
dup d0, v0.d[1]
dup d8, v1.d[0]
dup d1, v1.d[1]
dup d17, v7.d[0]
dup d7, v7.d[1]
fadd d23, d6, d10
fadd d24, d23, d22
fadd d25, d5, d24
fadd d26, d25, d21
fadd d27, d4, d26
fadd d28, d27, d20
fadd d29, d3, d28
fadd d30, d29, d19
fadd d31, d2, d30
fadd d16, d31, d18
fadd d6, d0, d16
fadd d22, d6, d8
fadd d5, d1, d22
fadd d20, d5, d17
fadd d8, d7, d20
cmp x22, x16
bne .L17
// OSACA-END

View File

@@ -0,0 +1,23 @@
// OSACA-BEGIN
.L17:
mov x17, x16
ldr q10, [x17], 16
ldr q16, [x16, 16]
add x16, x16, 128
ldr q17, [x16, -80]
ldr q18, [x16, -64]
ldr q19, [x16, -48]
ldr q20, [x16, -32]
ldr q21, [x16, -16]
fadd v22.2d, v1.2d, v10.2d
ldr q23, [x17, 16]
fadd v24.2d, v22.2d, v16.2d
fadd v25.2d, v24.2d, v23.2d
fadd v26.2d, v25.2d, v17.2d
fadd v27.2d, v26.2d, v18.2d
fadd v28.2d, v27.2d, v19.2d
fadd v29.2d, v28.2d, v20.2d
fadd v1.2d, v29.2d, v21.2d
cmp x22, x16
bne .L17
// OSACA-END

View File

@@ -0,0 +1,38 @@
# OSACA-BEGIN
.L19:
vmovsd (%r10), %xmm8
vmovsd 8(%r10), %xmm10
subq $-128, %r10
vmovsd -112(%r10), %xmm12
vmovsd -104(%r10), %xmm14
vmovsd -96(%r10), %xmm1
vmovsd -88(%r10), %xmm2
vmovsd -80(%r10), %xmm3
vmovsd -72(%r10), %xmm6
vaddsd %xmm8, %xmm7, %xmm9
vmovsd -64(%r10), %xmm8
vaddsd %xmm9, %xmm10, %xmm11
vmovsd -56(%r10), %xmm10
vaddsd %xmm12, %xmm11, %xmm13
vmovsd -48(%r10), %xmm12
vaddsd %xmm13, %xmm14, %xmm15
vmovsd -40(%r10), %xmm14
vaddsd %xmm1, %xmm15, %xmm4
vmovsd -32(%r10), %xmm1
vaddsd %xmm4, %xmm2, %xmm0
vmovsd -24(%r10), %xmm2
vaddsd %xmm3, %xmm0, %xmm5
vmovsd -16(%r10), %xmm3
vaddsd %xmm5, %xmm6, %xmm7
vmovsd -8(%r10), %xmm6
vaddsd %xmm8, %xmm7, %xmm9
vaddsd %xmm9, %xmm10, %xmm11
vaddsd %xmm12, %xmm11, %xmm13
vaddsd %xmm13, %xmm14, %xmm15
vaddsd %xmm1, %xmm15, %xmm4
vaddsd %xmm4, %xmm2, %xmm0
vaddsd %xmm3, %xmm0, %xmm5
vaddsd %xmm5, %xmm6, %xmm7
cmpq %r10, %r14
jne .L19
# OSACA-END

View File

@@ -0,0 +1,14 @@
# OSACA-BEGIN
.L19:
vaddpd (%r10), %xmm3, %xmm1
subq $-128, %r10
vaddpd -112(%r10), %xmm1, %xmm4
vaddpd -96(%r10), %xmm4, %xmm5
vaddpd -80(%r10), %xmm5, %xmm6
vaddpd -64(%r10), %xmm6, %xmm8
vaddpd -48(%r10), %xmm8, %xmm9
vaddpd -32(%r10), %xmm9, %xmm10
vaddpd -16(%r10), %xmm10, %xmm3
cmpq %r10, %r14
jne .L19
# OSACA-END

Binary file not shown.

Binary file not shown.

View File

@@ -1,199 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
.file "taxCalc.c"
.text
..TXTST0:
# -- Begin main
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main(void)
main:
..B1.1: # Preds ..B1.0
.cfi_startproc
..___tag_value_main.1:
..L2:
#4.15
pushq %rbp #4.15
.cfi_def_cfa_offset 16
movq %rsp, %rbp #4.15
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #4.15
subq $4096, %rsp #4.15
movl $104446, %esi #4.15
movl $3, %edi #4.15
call __intel_new_feature_proc_init #4.15
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
vstmxcsr (%rsp) #4.15
movl $.2.3_2_kmpc_loc_struct_pack.3, %edi #4.15
xorl %esi, %esi #4.15
orl $32832, (%rsp) #4.15
xorl %eax, %eax #4.15
vldmxcsr (%rsp) #4.15
..___tag_value_main.6:
call __kmpc_begin #4.15
..___tag_value_main.7:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
movl $il0_peep_printf_format_0, %edi #5.5
call puts #5.5
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
vmovss .L_2il0floatpacket.0(%rip), %xmm0 #8.15
xorl %eax, %eax #11.5
vxorps %xmm1, %xmm1, %xmm1 #9.5
vmovss %xmm1, (%rsp) #9.5
movl $111,%ebx #IACA START
.byte 100,103,144 #IACA START
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
..B1.4: # Preds ..B1.4 ..B1.3
lea 1(%rax,%rax), %edx #12.9
vcvtsi2ss %edx, %xmm2, %xmm2 #12.27
vmulss %xmm2, %xmm0, %xmm3 #12.29
lea 2(%rax,%rax), %ecx #12.9
vaddss %xmm3, %xmm1, %xmm4 #12.29
vxorps %xmm1, %xmm1, %xmm1 #12.27
vcvtsi2ss %ecx, %xmm1, %xmm1 #12.27
vmulss %xmm1, %xmm0, %xmm5 #12.29
vmovss %xmm4, 4(%rsp,%rax,8) #12.9
vaddss %xmm5, %xmm4, %xmm1 #12.29
vmovss %xmm1, 8(%rsp,%rax,8) #12.9
incq %rax #11.5
cmpq $499, %rax #11.5
jb ..B1.4 # Prob 99% #11.5
movl $222,%ebx #IACA END
.byte 100,103,144 #IACA END
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
..B1.5: # Preds ..B1.4
vmovss 3992(%rsp), %xmm0 #12.18
movl $il0_peep_printf_format_1, %edi #15.5
vaddss .L_2il0floatpacket.1(%rip), %xmm0, %xmm1 #12.29
vmovss %xmm1, 3996(%rsp) #12.9
call puts #15.5
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
movl $.2.3_2_kmpc_loc_struct_pack.14, %edi #16.12
xorl %eax, %eax #16.12
..___tag_value_main.8:
call __kmpc_end #16.12
..___tag_value_main.9:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
xorl %eax, %eax #16.12
movq %rbp, %rsp #16.12
popq %rbp #16.12
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #16.12
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.3:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.2
.align 4
.2.3_2__kmpc_loc_pack.2:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 52
.byte 59
.byte 52
.byte 59
.byte 59
.space 1, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.14:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.13
.align 4
.2.3_2__kmpc_loc_pack.13:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 49
.byte 54
.byte 59
.byte 49
.byte 54
.byte 59
.byte 59
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
il0_peep_printf_format_0:
.long 1128354639
.long 1702109249
.long 1931506803
.long 1953653108
.byte 0
.space 3, 0x00 # pad
.align 4
il0_peep_printf_format_1:
.long 1128354639
.long 1702109249
.long 1696625779
.word 25710
.byte 0
.data
# -- End main
.section .rodata, "a"
.align 4
.align 4
.L_2il0floatpacket.0:
.long 0x3e428f5c
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.align 4
.L_2il0floatpacket.1:
.long 0x433dcf5c
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,4
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,18 +0,0 @@
#include <stdio.h>
//#include "iacaMarks.h"
int main(void){
printf("OSACA test start\n");
int i = 1;
float arr[1000];
float tax = 0.19;
arr[0] = 0;
//STARTLOOP
while(i < 1000){
arr[i] = arr[i-1]+i*tax;
i += 1;
}
printf("OSACA test end\n");
return 0;
}

View File

@@ -1,12 +0,0 @@
void triad(int N){
void dummy(double*);
double a[N], b[N], c[N], d[N];
double s;
//STARTLOOP
for(int i=0; i<N; ++i)
a[i] = b[i] + c[i] * d[i];
dummy(&a[1]);
}

View File

@@ -0,0 +1,36 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmovupd (%r14,%rsi), %ymm14
vmovupd 32(%r14,%rsi), %ymm15
vmovupd 64(%r14,%rsi), %ymm1
vmovupd 96(%r14,%rsi), %ymm0
vmovupd 128(%r14,%rsi), %ymm3
vmovupd 160(%r14,%rsi), %ymm4
vmovupd 192(%r14,%rsi), %ymm5
vmovupd 224(%r14,%rsi), %ymm7
vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14
vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15
vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1
vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0
vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3
vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4
vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5
vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7
vmovupd %ymm14, (%r12,%rsi)
vmovupd %ymm15, 32(%r12,%rsi)
vmovupd %ymm1, 64(%r12,%rsi)
vmovupd %ymm0, 96(%r12,%rsi)
vmovupd %ymm3, 128(%r12,%rsi)
vmovupd %ymm4, 160(%r12,%rsi)
vmovupd %ymm5, 192(%r12,%rsi)
vmovupd %ymm7, 224(%r12,%rsi)
addq $256, %rsi
cmpq %rsi, %rcx
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,16 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.40: # Preds ..B1.40 ..B1.39
# Execution count [2.22e+03]
vmovups (%r13,%rax,8), %zmm1 #78.5
vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5
vmovupd %zmm1, (%r14,%rax,8) #78.5
addq $8, %rax #78.5
cmpq %r12, %rax #78.5
jb ..B1.40 # Prob 82% #78.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,118 @@
// OSACA-BEGIN
.LBB1_29: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q2, q3, [x9, #-256]
ldp q0, q1, [x9, #-224]
ldp q4, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
fmla v2.2d, v4.2d, v16.2d
fmla v3.2d, v5.2d, v16.2d
stp q2, q3, [x11, #-256]
fmla v0.2d, v6.2d, v16.2d
fmla v1.2d, v7.2d, v16.2d
stp q0, q1, [x11, #-224]
ldp q6, q7, [x9, #-192]
ldp q16, q17, [x10, #-192]
ldr q20, [sp, #80] // 16-byte Folded Reload
fmla v6.2d, v16.2d, v20.2d
ldr q16, [sp, #80] // 16-byte Folded Reload
ldp q4, q5, [x9, #-160]
ldp q18, q19, [x10, #-160]
fmla v7.2d, v17.2d, v16.2d
stp q6, q7, [x11, #-192]
ldr q16, [sp, #80] // 16-byte Folded Reload
fmla v4.2d, v18.2d, v16.2d
ldr q16, [sp, #80] // 16-byte Folded Reload
fmla v5.2d, v19.2d, v16.2d
stp q4, q5, [x11, #-160]
ldp q17, q19, [x9, #-128]
ldp q20, q21, [x10, #-128]
ldr q24, [sp, #80] // 16-byte Folded Reload
fmla v17.2d, v20.2d, v24.2d
ldr q20, [sp, #80] // 16-byte Folded Reload
ldp q16, q18, [x9, #-96]
ldp q22, q23, [x10, #-96]
fmla v19.2d, v21.2d, v20.2d
stp q17, q19, [x11, #-128]
ldr q20, [sp, #80] // 16-byte Folded Reload
fmla v16.2d, v22.2d, v20.2d
ldr q20, [sp, #80] // 16-byte Folded Reload
ldp q24, q25, [x10, #-64]
fmla v18.2d, v23.2d, v20.2d
stp q16, q18, [x11, #-96]
ldp q20, q22, [x9, #-64]
ldr q28, [sp, #80] // 16-byte Folded Reload
fmla v20.2d, v24.2d, v28.2d
ldr q24, [sp, #80] // 16-byte Folded Reload
ldp q21, q23, [x9, #-32]
ldp q26, q27, [x10, #-32]
fmla v22.2d, v25.2d, v24.2d
stp q20, q22, [x11, #-64]
ldr q24, [sp, #80] // 16-byte Folded Reload
fmla v21.2d, v26.2d, v24.2d
ldr q24, [sp, #80] // 16-byte Folded Reload
ldp q28, q29, [x10]
ldr q8, [sp, #80] // 16-byte Folded Reload
ldp q30, q31, [x10, #32]
ldr q9, [sp, #80] // 16-byte Folded Reload
fmla v23.2d, v27.2d, v24.2d
stp q21, q23, [x11, #-32]
ldp q24, q25, [x9]
fmla v24.2d, v28.2d, v8.2d
ldr q28, [sp, #80] // 16-byte Folded Reload
ldp q26, q27, [x9, #32]
ldp q8, q10, [x10, #64]
ldp q11, q12, [x10, #96]
fmla v25.2d, v29.2d, v28.2d
stp q24, q25, [x11]
ldr q28, [sp, #80] // 16-byte Folded Reload
fmla v26.2d, v30.2d, v28.2d
ldr q28, [sp, #80] // 16-byte Folded Reload
ldp q13, q14, [x10, #128]
ldr q2, [sp, #80] // 16-byte Folded Reload
ldp q1, q3, [x10, #192]
fmla v27.2d, v31.2d, v28.2d
stp q26, q27, [x11, #32]
ldp q28, q29, [x9, #64]
fmla v28.2d, v8.2d, v9.2d
ldr q8, [sp, #80] // 16-byte Folded Reload
ldp q30, q31, [x9, #96]
ldr q9, [sp, #80] // 16-byte Folded Reload
ldr q6, [sp, #80] // 16-byte Folded Reload
ldr q5, [sp, #80] // 16-byte Folded Reload
fmla v29.2d, v10.2d, v8.2d
stp q28, q29, [x11, #64]
ldr q8, [sp, #80] // 16-byte Folded Reload
fmla v30.2d, v11.2d, v8.2d
ldr q8, [sp, #80] // 16-byte Folded Reload
ldr q16, [sp, #80] // 16-byte Folded Reload
add x8, x8, #64 // =64
fmla v31.2d, v12.2d, v8.2d
stp q30, q31, [x11, #96]
ldp q8, q10, [x9, #128]
fmla v8.2d, v13.2d, v9.2d
ldr q9, [sp, #80] // 16-byte Folded Reload
ldp q11, q12, [x9, #160]
fmla v10.2d, v14.2d, v9.2d
stp q8, q10, [x11, #128]
ldp q13, q14, [x10, #160]
fmla v12.2d, v14.2d, v2.2d
ldp q2, q0, [x9, #192]
ldr q9, [sp, #80] // 16-byte Folded Reload
fmla v2.2d, v1.2d, v6.2d
ldp q1, q4, [x9, #224]
fmla v0.2d, v3.2d, v5.2d
stp q2, q0, [x11, #192]
ldp q3, q5, [x10, #224]
fmla v11.2d, v13.2d, v9.2d
stp q11, q12, [x11, #160]
fmla v1.2d, v3.2d, v16.2d
fmla v4.2d, v5.2d, v16.2d
stp q1, q4, [x11, #224]
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x12, x12, #8 // =8
b.ne .LBB1_29
// OSACA-END

View File

@@ -0,0 +1,45 @@
// OSACA-BEGIN
.L17:
add x0, x10, 16
ldr q23, [x20, x10]
ldr q24, [x21, x10]
add x7, x10, 32
ldr q25, [x20, x0]
ldr q26, [x21, x0]
add x6, x10, 48
add x5, x10, 64
ldr q27, [x20, x7]
ldr q28, [x21, x7]
add x4, x10, 80
add x11, x10, 96
ldr q29, [x20, x6]
ldr q30, [x21, x6]
add x2, x10, 112
fmla v23.2d, v3.2d, v24.2d
ldr q31, [x20, x5]
ldr q4, [x21, x5]
fmla v25.2d, v3.2d, v26.2d
ldr q2, [x20, x4]
ldr q5, [x21, x4]
fmla v27.2d, v3.2d, v28.2d
ldr q1, [x20, x11]
ldr q6, [x21, x11]
fmla v29.2d, v3.2d, v30.2d
ldr q0, [x20, x2]
ldr q7, [x21, x2]
fmla v31.2d, v3.2d, v4.2d
fmla v2.2d, v3.2d, v5.2d
fmla v1.2d, v3.2d, v6.2d
str q23, [x19, x10]
add x10, x10, 128
fmla v0.2d, v3.2d, v7.2d
str q25, [x19, x0]
str q27, [x19, x7]
str q29, [x19, x6]
str q31, [x19, x5]
str q2, [x19, x4]
str q1, [x19, x11]
str q0, [x19, x2]
cmp x24, x10
bne .L17
// OSACA-END

View File

@@ -0,0 +1,30 @@
# OSACA-BEGIN
.L19:
vmovups 0(%r13,%rax), %xmm12
vmovups 16(%r13,%rax), %xmm13
vmovups 32(%r13,%rax), %xmm14
vmovups 48(%r13,%rax), %xmm15
vmovups 64(%r13,%rax), %xmm1
vmovups 80(%r13,%rax), %xmm0
vmovups 96(%r13,%rax), %xmm4
vmovups 112(%r13,%rax), %xmm5
vfmadd213pd (%r12,%rax), %xmm3, %xmm12
vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13
vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14
vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15
vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1
vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0
vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4
vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5
vmovups %xmm12, 0(%rbp,%rax)
vmovups %xmm13, 16(%rbp,%rax)
vmovups %xmm14, 32(%rbp,%rax)
vmovups %xmm15, 48(%rbp,%rax)
vmovups %xmm1, 64(%rbp,%rax)
vmovups %xmm0, 80(%rbp,%rax)
vmovups %xmm4, 96(%rbp,%rax)
vmovups %xmm5, 112(%rbp,%rax)
subq $-128, %rax
cmpq %rbx, %rax
jne .L19
# OSACA-END

View File

@@ -0,0 +1,28 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L19:
vmulpd (%rcx), %ymm3, %ymm12
vmulpd 32(%rcx), %ymm3, %ymm13
vmulpd 64(%rcx), %ymm3, %ymm14
vmulpd 96(%rcx), %ymm3, %ymm15
vmulpd 128(%rcx), %ymm3, %ymm0
vmulpd 160(%rcx), %ymm3, %ymm1
vmulpd 192(%rcx), %ymm3, %ymm7
vmulpd 224(%rcx), %ymm3, %ymm4
vmovupd %ymm12, (%rcx)
vmovupd %ymm13, 32(%rcx)
vmovupd %ymm14, 64(%rcx)
vmovupd %ymm15, 96(%rcx)
vmovupd %ymm0, 128(%rcx)
vmovupd %ymm1, 160(%rcx)
vmovupd %ymm7, 192(%rcx)
vmovupd %ymm4, 224(%rcx)
addq $256, %rcx
cmpq %r15, %rcx
jne .L19
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,17 @@
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.38: # Preds ..B1.38 ..B1.37
# Execution count [2.22e+03]
vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5
vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5
vmovupd %zmm1, (%r13,%rax,8) #75.5
vmovupd %zmm2, 64(%r13,%rax,8) #75.5
addq $16, %rax #75.5
cmpq %r14, %rax #75.5
jb ..B1.38 # Prob 82% #75.5
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY

View File

@@ -0,0 +1,15 @@
// OSACA-BEGIN
.LBB1_32: // Parent Loop BB1_20 Depth=1
// Parent Loop BB1_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q0, q1, [x8]
ldp q2, q3, [x8, #-32]
fmul v2.2d, v2.2d, v26.2d
fmul v3.2d, v3.2d, v26.2d
stp q2, q3, [x8, #-32]
fmul v0.2d, v0.2d, v26.2d
fmul v1.2d, v1.2d, v26.2d
stp q0, q1, [x8], #64
adds x9, x9, #1 // =1
b.ne .LBB1_32
// OSACA-END

View File

@@ -0,0 +1,31 @@
// OSACA-BEGIN
.L17:
ldr q23, [x16]
mov x17, x16
add x16, x16, 128
fmul v24.2d, v23.2d, v2.2d
str q24, [x17], 16
ldr q25, [x16, -112]
fmul v26.2d, v25.2d, v2.2d
str q26, [x16, -112]
ldr q27, [x17, 16]
fmul v28.2d, v27.2d, v2.2d
str q28, [x17, 16]
ldr q29, [x16, -80]
ldr q30, [x16, -64]
ldr q31, [x16, -48]
ldr q1, [x16, -32]
ldr q0, [x16, -16]
fmul v5.2d, v29.2d, v2.2d
fmul v4.2d, v30.2d, v2.2d
fmul v3.2d, v31.2d, v2.2d
fmul v6.2d, v1.2d, v2.2d
fmul v7.2d, v0.2d, v2.2d
str q5, [x16, -80]
str q4, [x16, -64]
str q3, [x16, -48]
str q6, [x16, -32]
str q7, [x16, -16]
cmp x22, x16
bne .L17
// OSACA-END

View File

@@ -0,0 +1,22 @@
# OSACA-BEGIN
.L19:
vmulpd (%r10), %xmm3, %xmm11
subq $-128, %r10
vmulpd -112(%r10), %xmm3, %xmm12
vmulpd -96(%r10), %xmm3, %xmm13
vmulpd -80(%r10), %xmm3, %xmm14
vmulpd -64(%r10), %xmm3, %xmm15
vmulpd -48(%r10), %xmm3, %xmm0
vmovups %xmm11, -128(%r10)
vmulpd -32(%r10), %xmm3, %xmm7
vmovups %xmm12, -112(%r10)
vmulpd -16(%r10), %xmm3, %xmm1
vmovups %xmm13, -96(%r10)
vmovups %xmm14, -80(%r10)
vmovups %xmm15, -64(%r10)
vmovups %xmm0, -48(%r10)
vmovups %xmm7, -32(%r10)
vmovups %xmm1, -16(%r10)
cmpq %r10, %r14
jne .L19
# OSACA-END

View File

@@ -1,2 +1,11 @@
"""Open Source Architecture Code Analyzer"""
name = "osaca"
__version__ = '0.2.0'
__version__ = "0.6.1"
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__
# 2. commit to RRZE-HPC/osaca's master branch
# 3. wait for GitHub Actions to complete successfully (unless already tested)
# 4. tag commit with 'v{}'.format(__version__) (`git tag vX.Y.Z`)
# 5. push tag to github (`git push origin vX.Y.Z` or push all tags with `git push --tags`)

4
osaca/__main__.py Normal file
View File

@@ -0,0 +1,4 @@
#!/usr/bin/env python3
from .osaca import main
main()

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env python3
from param import Register, MemAddr, Parameter
from testcase import Testcase
# Choose out of various operands
reg8 = Register('al')
reg16 = Register('ax')
reg32 = Register('eax')
reg64 = Register('rax')
xmm = Register('xmm0')
ymm = Register('ymm0')
zmm = Register('zmm0')
mem0 = MemAddr('(%rax, %esi, 4)')
imd1 = Parameter('IMD')
# -----------------------------------------------
# -USER INPUT------------------------------------
# -----------------------------------------------
# Enter your mnemonic
mnemonic = 'add'
# Define your operands. If you don't need it, just type in None
dst = mem0
op1 = imd1
op2 = None
# Define the number of instructions per loop (default: 12)
per_loop = '32'
# -----------------------------------------------
# -----------------------------------------------
# Start
operands = [x for x in [dst, op1, op2] if x is not None]
opListStr = ', '.join([str(x) for x in operands])
print('Create Testcase for {} {}'.format(mnemonic, opListStr), end='')
tc = Testcase(mnemonic, operands, per_loop)
tc.write_testcase()
print(' --------> SUCCEEDED')

34
osaca/data/_build_cache.py Executable file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""Pre-build the machine-model caches for every YAML file shipped next to this script.

Every architecture description (``*.yml``) and every ISA description
(``isa/*.yml``) found in this script's directory is loaded once through
``MachineModel``.  One progress character is printed per file: ``.`` for an
architecture file and ``+`` for an ISA file.
"""
from glob import glob
import os.path
import sys

# Make the in-tree ``osaca`` package (two directory levels above this file)
# importable.  The path is derived from this file's location rather than from
# the current working directory, so the script works no matter where it is
# launched from.
sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)

try:
    from osaca.semantics.hw_model import MachineModel
except ModuleNotFoundError:
    # Best effort only: a missing dependency must not turn into a failure here.
    # ``sys.exit()`` without an argument terminates with exit status 0.
    print(
        "Unable to import MachineModel, probably some dependency is not yet installed. SKIPPING. "
        "First run of OSACA may take a while to build caches, subsequent runs will be as fast as "
        "ever."
    )
    sys.exit()

print("Building cache: ", end="")
sys.stdout.flush()

# Iterating architectures
# NOTE(review): constructing MachineModel is presumably what writes the cache
# (the constructor is defined elsewhere) -- confirm against hw_model.
for f in glob(os.path.join(os.path.dirname(__file__), "*.yml")):
    MachineModel(path_to_yaml=f)
    print(".", end="")
    # Flush after every file so the progress indicator shows up immediately.
    sys.stdout.flush()

# Iterating ISAs
for f in glob(os.path.join(os.path.dirname(__file__), "isa/*.yml")):
    MachineModel(path_to_yaml=f)
    print("+", end="")
    sys.stdout.flush()

# Terminate the progress line.
print()

2617
osaca/data/a64fx.yml Normal file

File diff suppressed because it is too large Load Diff

4180
osaca/data/a72.yml Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

808
osaca/data/a72/template.yml Normal file
View File

@@ -0,0 +1,808 @@
osaca_version: 0.3.11
micro_architecture: Cortex A-72
arch_code: a72
isa: aarch64
hidden_loads: false
ports: ['0', '1', '2', '3', '4', '5', '6', '7']
port_model_scheme: |
+-------------------------------------------------------------------------------------+
| scheduler |
+-------------------------------------------------------------------------------------+
0 |I 1 |L 2 |M 3 |S 4 |F1 5 |I 6 |F0 7 |B
\/ \/ \/ \/ \/ \/ \/ \/
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
|INT ALU| | LOAD | | MUL | | STORE | | ASIMD | |INT ALU| | ASIMD | | Branch|
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+-------+ +-------+ +-----------+ +-------+ +---------+
| AGU | | DIV | | FP ALU | | AGU | |ASIMD MUL|
+-------+ +-------+ +-----------+ +-------+ +---------+
+-------+ +-----------+ +---------+
| SHIFT | | FP MUL | | FP ALU |
+-------+ +-----------+ +---------+
+-------+ +-----------+ +---------+
| CRC | | FP DIV | | FP MUL |
+-------+ +-----------+ +---------+
+-------+ +-----------+ +---------+
| USAD | | FP SQRT | | FP DIV |
+-------+ +-----------+ +---------+
+-----------+ +---------+
|ASIMD SHIFT| | FP CONV |
+-----------+ +---------+
+---------+
| CRYPTO |
+---------+
# The port pressures do not always correctly match this schema, because most
# instructions are imported from an experimentally determined mapping, which
# is not always correct.
load_latency: {x: 4.0, s: 5.0, d: 5.0, h: 6.0, q: 6.0}
load_throughput: []
load_throughput_default: [[1, '1']]
store_throughput: []
store_throughput_default: [[2, '3']]
instruction_forms:
# Branch
- name: b
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: bne
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: b.ne
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: br
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: ret
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: bl
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '05'], [1, '7']]
throughput: 1.0
- name: blr
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '05'], [1, '7']]
throughput: 1.0
# Load GPR
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 5.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Load FP d
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 5.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 5.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
# Load FP q
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: false
pre_indexed: false
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: true
pre_indexed: false
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: false
pre_indexed: true
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 6.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 6.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 6.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
# Store GPR
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 1.0
port_pressure: [[1, '3']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Store FP d
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Store FP q
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: false
pre_indexed: false
latency: 4.0
port_pressure: [[2, '3']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: true
pre_indexed: false
latency: 4.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post_indexed: false
pre_indexed: true
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 4.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 4.0
port_pressure: [[2, '3'], [2, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 4.0
port_pressure: [[2, '3'], [2, '05']]
throughput: 2.0
# Load unscaled GPR
- name: ldur
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: '*'
pre_indexed: '*'
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
# Load unscaled FP q
- name: ldur
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: '*'
pre_indexed: '*'
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
# Store unscaled GPR
- name: stur
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: '*'
pre_indexed: '*'
latency: 1.0
port_pressure: [[1, '3']]
throughput: 1.0
# Store unscaled FP q
- name: stur
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: '*'
pre_indexed: '*'
latency: 2.0
port_pressure: [[2, '3']]
throughput: 2.0
# Load pair GPR
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 4.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 4.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
# Load pair FP q
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 6.0
port_pressure: [[2, '1']]
throughput: 2.0
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 6.0
port_pressure: [[2, '1'], [1, '05']]
throughput: 2.0
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 6.0
port_pressure: [[2, '1'], [1, '05']]
throughput: 2.0
# Store pair GPR
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 2.0
port_pressure: [[2, '3']]
throughput: 2.0
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
# Store pair FP q
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: false
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: true
pre_indexed: false
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post_indexed: false
pre_indexed: true
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
# Fast-forward (measures 4 cycles, but can be 3)
# Lower bound is used in order to ensure no over-estimates are possible.
# Ports do not match documentation, but "fixing" requires also "fixing" almost
# the entire rest of the model.
- name: fadd
operands:
- class: register
prefix: s
- class: register
prefix: s
- class: register
prefix: s
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fadd
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fadd
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fadd
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fsub
operands:
- class: register
prefix: s
- class: register
prefix: s
- class: register
prefix: s
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fsub
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fsub
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fsub
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
# Automatically generated instructions

44150
osaca/data/bdw.yml Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,39 +0,0 @@
instr,TP,LT,ports
jmp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
je-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jne-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jna-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
ja-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jng-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jg-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jpe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jnp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jpo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jcxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jecxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0.0, 0, 0)"
1 instr TP LT ports
2 jmp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
3 jo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
4 jno-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
5 js-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
6 jns-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
7 je-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
8 jz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
9 jne-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
10 jnz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
11 jb-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
12 jnae-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
13 jc-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
14 jnb-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
15 jae-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
16 jnc-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
17 jbe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
18 jna-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
19 ja-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
20 jnbe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
21 jl-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
22 jnge-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
23 jge-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
24 jnl-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
25 jle-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
26 jng-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
27 jg-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
28 jnle-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
29 jp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
30 jpe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
31 jnp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
32 jpo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
33 jcxz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
34 jecxz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
35 jo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
36 jno-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
37 js-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)
38 jns-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0.0, 0, 0)

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
from collections import defaultdict
from fractions import Fraction
class EntryBuilder:
    """Render instruction forms as YAML instruction database entries."""

    @staticmethod
    def compute_throughput(port_pressure):
        """
        Return the highest per-port occupancy implied by *port_pressure*.

        :param port_pressure: list of ``[uops, ports]`` pairs; ``ports`` is either a string
            (each character is treated as one port) or a list of port names
        :returns: occupancy of the busiest port as float, ``0.0`` if no port is used
        """
        port_occupancy = defaultdict(Fraction)
        for uops, ports in port_pressure:
            for p in ports:
                # The uops of one entry are spread evenly over all ports it may use.
                port_occupancy[p] += Fraction(uops, len(ports))
        return float(max(list(port_occupancy.values()) + [0]))

    @staticmethod
    def classify(operands_types):
        """
        Classify an operand list as load, store and/or vector instruction.

        An instruction counts as load if ``"mem"`` is any operand but the last one, as store
        if ``"mem"`` is the last operand, and as vector if one operand is exactly ``mm``,
        ``xmm``, ``ymm`` or ``zmm``. All checks are exact matches on the operand strings.

        :param operands_types: list of operand type strings
        :returns: tuple ``(load, store, vec)`` of booleans
        :raises AssertionError: if the instruction would be load and store at the same time
        """
        load = "mem" in operands_types[:-1]
        store = "mem" in operands_types[-1:]
        vec = any(vecr in operands_types for vecr in ("mm", "xmm", "ymm", "zmm"))
        assert not (load and store), "Can not process a combined load-store instruction."
        return load, store, vec

    def build_description(
        self, instruction_name, operand_types, port_pressure=None, latency=0, comment=None
    ):
        """
        Build the YAML text of one instruction entry.

        :param instruction_name: mnemonic written to the ``name`` field
        :param operand_types: list of operand type strings; ``imd`` becomes an immediate,
            anything starting with ``mem`` a memory operand (``mem_simple`` without index,
            ``mem_complex`` with ``gpr`` index, otherwise wildcard index) and everything else
            a register, where a ``{k}`` marker is stripped and turned into ``mask: True``
        :param port_pressure: list of ``[uops, ports]`` pairs, defaults to an empty list
        :param latency: value written to the ``latency`` field
        :param comment: optional comment appended to the ``name`` line
        :returns: the entry as newline-terminated string
        """
        # ``None`` instead of ``[]`` as default, so no list object is shared between calls.
        if port_pressure is None:
            port_pressure = []
        comment = " # " + comment if comment else ""

        # Keys of the entry are indented by two spaces below the leading "- ", keys of each
        # operand by four; without this nesting the result is not parseable as YAML.
        parts = [
            "- name: {}{}\n  operands: {}\n".format(
                instruction_name, comment, "[]" if len(operand_types) == 0 else ""
            )
        ]
        for ot in operand_types:
            if ot == "imd":
                parts.append("  - class: immediate\n    imd: int\n")
            elif ot.startswith("mem"):
                parts.append('  - class: memory\n    base: "*"\n    offset: "*"\n')
                if ot == "mem_simple":
                    parts.append("    index: ~\n")
                elif ot == "mem_complex":
                    parts.append("    index: gpr\n")
                else:
                    parts.append('    index: "*"\n')
                parts.append('    scale: "*"\n')
            elif "{k}" in ot:
                parts.append(
                    "  - class: register\n    name: {}\n    mask: True\n".format(
                        ot.replace("{k}", "")
                    )
                )
            else:
                parts.append("  - class: register\n    name: {}\n".format(ot))

        parts.append(
            (
                "  latency: {latency}\n"
                "  port_pressure: {port_pressure!r}\n"
                "  throughput: {throughput}\n"
                "  uops: {uops}\n"
            ).format(
                latency=latency,
                port_pressure=port_pressure,
                throughput=self.compute_throughput(port_pressure),
                uops=sum(uops for uops, _ in port_pressure),
            )
        )
        return "".join(parts)

    def parse_port_pressure(self, port_pressure_str):
        """
        Parse a port pressure string into a list of ``[uops, ports]`` pairs.

        Example:
        1*p45+2*p0+2*p10,11 -> [[1, '45'], [2, '0'], [2, ['10', '11']]]

        :param port_pressure_str: ``+``-separated terms of the form ``<uops>*p<ports>``;
            an empty string or ``None`` yields an empty list
        :returns: list of ``[int, ports]`` pairs, ``ports`` being a string if the term holds no
            comma and a list of the non-empty comma-separated names otherwise
        """
        port_pressure = []
        if port_pressure_str:
            for p in port_pressure_str.split("+"):
                cycles, ports = p.split("*p")
                ports = ports.split(",")
                if len(ports) == 1:
                    ports = ports[0]
                else:
                    # Drop empty names, e.g. caused by a trailing comma.
                    ports = [port for port in ports if len(port) > 0]
                port_pressure.append([int(cycles), ports])
        return port_pressure

    def process_item(self, instruction_form, resources):
        """
        Turn an instruction form and its resources into an entry description.

        Example:
        ('mov xmm mem', ('1*p45+2*p0', 7)) -> ('mov', ['xmm', 'mem'], [[1, '45'], [2, '0']], 7)
        is what gets handed to ``build_description``.

        :param instruction_form: whitespace-separated name and operand types; if it starts
            with a bracketed part, everything up to the first ``]`` is used as name
        :param resources: pair ``(port_pressure_str, latency)``; latency is passed to ``int``
        :returns: result of ``build_description`` for the parsed values
        """
        if instruction_form.startswith("[") and "]" in instruction_form:
            bracketed, _, rest = instruction_form.partition("]")
            instr_elements = [bracketed + "]"] + rest.split()
        else:
            # split() without argument ignores repeated, leading and trailing whitespace,
            # which would otherwise show up as operands with an empty name.
            instr_elements = instruction_form.split()
        latency = int(resources[1])
        port_pressure = self.parse_port_pressure(resources[0])
        instruction_name = instr_elements[0]
        operand_types = instr_elements[1:]
        return self.build_description(instruction_name, operand_types, port_pressure, latency)
class ArchEntryBuilder(EntryBuilder):
    """
    Entry builder that accounts for the memory operand of load and store instructions.

    The added port pressure and latency values are hard-coded below (labelled Zen3).
    """

    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """
        Build an entry, adding load or store resources if a ``mem`` operand is present.

        :param instruction_name: mnemonic written to the ``name`` field
        :param operand_types: list of operand type strings, see ``classify``
        :param port_pressure: list of ``[uops, ports]`` pairs of the instruction without its
            memory access, defaults to an empty list; the list is never modified
        :param latency: latency of the instruction without its memory access
        :returns: the entry as string, commented "with load"/"with store" where applicable
        :raises AssertionError: if ``classify`` detects a combined load-store instruction
        """
        # Intel ICX
        # LD_pressure = [[1, "23"], [1, ["2D", "3D"]]]
        # LD_pressure_vec = LD_pressure
        # ST_pressure = [[1, "79"], [1, "48"]]
        # ST_pressure_vec = ST_pressure
        # LD_lat = 5
        # ST_lat = 0
        # Zen3
        LD_pressure = [[1, ["11", "12", "13"]]]
        LD_pressure_vec = [[1, ["11", "12"]]]
        ST_pressure = [[1, ["12", "13"]]]
        ST_pressure_vec = [[1, ["4"]], [1, ["13"]]]
        LD_lat = 4
        ST_lat = 0

        # Work on a copy: an in-place "+=" would alter the caller's list and, with a list as
        # default argument, accumulate pressure over successive calls.
        port_pressure = [] if port_pressure is None else list(port_pressure)

        load, store, vec = self.classify(operand_types)
        if load:
            port_pressure = port_pressure + (LD_pressure_vec if vec else LD_pressure)
            latency += LD_lat
            return EntryBuilder.build_description(
                self, instruction_name, operand_types, port_pressure, latency, "with load"
            )
        if store:
            port_pressure = port_pressure + (ST_pressure_vec if vec else ST_pressure)
            latency += ST_lat
            return EntryBuilder.build_description(
                self, instruction_name, operand_types, port_pressure, latency, "with store"
            )
        # Register only:
        return EntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency
        )
def get_description(instruction_form, port_pressure, latency, rhs_comment=None):
    """
    Return the database entry for one instruction form, optionally commented per line.

    :param instruction_form: name and operand types, e.g. ``'mov xmm mem'``
    :param port_pressure: port pressure string, e.g. ``'1*p45+2*p0'``
    :param latency: latency, anything accepted by ``int``
    :param rhs_comment: if not ``None``, every line of the entry is padded to the length of
        the longest one and followed by ``" # " + rhs_comment``
    :returns: the entry as newline-terminated string
    """
    entry = ArchEntryBuilder().process_item(instruction_form, (port_pressure, latency))
    if rhs_comment is None:
        return entry
    # splitlines() does not yield an empty element after the final newline, so no
    # comment-only line made of padding is appended to the entry.
    lines = entry.splitlines()
    max_length = max((len(line) for line in lines), default=0)
    return "".join("{:<{}} # {}\n".format(line, max_length, rhs_comment) for line in lines)
if __name__ == "__main__":
    import sys

    # Three mandatory arguments, optionally followed by a comment.
    if len(sys.argv) not in (4, 5):
        usage = "Usage: {} <INSTRUCTION> <PORT_PRESSURE> <LATENCY> [COMMENT]"
        print(usage.format(sys.argv[0]))
        sys.exit(0)

    cli_arguments = sys.argv[1:]
    try:
        description = get_description(*cli_arguments)
    except KeyError:
        print("Unknown architecture.")
        sys.exit(1)
    print(description)

Some files were not shown because too many files have changed in this diff Show More