mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-16 09:00:05 +01:00
Compare commits
291 Commits
v0.3.1.dev
...
v0.3.11
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
75edfc808a | ||
|
|
c8c077a834 | ||
|
|
26ee005adc | ||
|
|
207c53aaad | ||
|
|
fafd7bc526 | ||
|
|
b986d7eba0 | ||
|
|
6b0adb5d68 | ||
|
|
f9f382a948 | ||
|
|
c6b58c63ab | ||
|
|
78530bfdb0 | ||
|
|
5aa0899961 | ||
|
|
7f0abd7d10 | ||
|
|
9ba9bab107 | ||
|
|
983e66938c | ||
|
|
1c889fa785 | ||
|
|
022598d94f | ||
|
|
1f5c9d1c61 | ||
|
|
30e0ad038d | ||
|
|
decec86e56 | ||
|
|
9af689b28c | ||
|
|
3aea3f2b49 | ||
|
|
a6cb09cf1f | ||
|
|
9d2ea8603f | ||
|
|
a7918db145 | ||
|
|
b5b1a1f2b2 | ||
|
|
dd59af16b2 | ||
|
|
d9325724e2 | ||
|
|
7e7269c2bc | ||
|
|
c64a24ae1b | ||
|
|
e8b78e4cc6 | ||
|
|
cd5a706f56 | ||
|
|
13426358d0 | ||
|
|
c80088b628 | ||
|
|
748474cd81 | ||
|
|
2fec0bf810 | ||
|
|
711a41d18e | ||
|
|
cf4a9cddcb | ||
|
|
5a5a1e74f5 | ||
|
|
4865e7ea72 | ||
|
|
d03398ddf9 | ||
|
|
edb8df3205 | ||
|
|
489050723c | ||
|
|
0cc0d35ce9 | ||
|
|
7f65bdb022 | ||
|
|
04360cc897 | ||
|
|
5e7a12f9bb | ||
|
|
1def12ee79 | ||
|
|
7269156854 | ||
|
|
d6529ced73 | ||
|
|
eac728dc9f | ||
|
|
451ba62959 | ||
|
|
57cf1bfe6f | ||
|
|
44b921aa73 | ||
|
|
accb52ce53 | ||
|
|
9e78f85475 | ||
|
|
64da89ec3d | ||
|
|
adeae88665 | ||
|
|
1698ed1776 | ||
|
|
2ef6051e64 | ||
|
|
3308f5d68f | ||
|
|
bd61b94669 | ||
|
|
0db8b6bcbf | ||
|
|
40755b2080 | ||
|
|
269148c2a1 | ||
|
|
12a8506530 | ||
|
|
e715badcf9 | ||
|
|
d6b4355a77 | ||
|
|
5361b63b52 | ||
|
|
cc39342047 | ||
|
|
addcdeda85 | ||
|
|
23d36a651b | ||
|
|
b052ab4151 | ||
|
|
673da99fba | ||
|
|
6c72281d65 | ||
|
|
5520362e65 | ||
|
|
93060eee43 | ||
|
|
0e77b7bc9a | ||
|
|
ce8c3ff9ab | ||
|
|
acbde7a19c | ||
|
|
34e978d2ae | ||
|
|
6294e2e9da | ||
|
|
6801229275 | ||
|
|
d3d1a89600 | ||
|
|
93c1951097 | ||
|
|
7211dd0799 | ||
|
|
5258d65c8e | ||
|
|
379fe80169 | ||
|
|
94d7d35c0b | ||
|
|
1009c60d2d | ||
|
|
229b316b6d | ||
|
|
c0753be899 | ||
|
|
eaa56792ab | ||
|
|
3425fa3024 | ||
|
|
38924b6ec1 | ||
|
|
d6ae457de4 | ||
|
|
a5c2ab1a4a | ||
|
|
e4393189dc | ||
|
|
3016fc7c46 | ||
|
|
82f47d217c | ||
|
|
1754df42d2 | ||
|
|
ac1295aac2 | ||
|
|
9624e6c109 | ||
|
|
2d16037c44 | ||
|
|
c5801cfe2f | ||
|
|
3e960dd4ac | ||
|
|
680774267d | ||
|
|
1aa710f195 | ||
|
|
71206897fd | ||
|
|
af247e64b6 | ||
|
|
2973f543b7 | ||
|
|
0b7f1ed6e7 | ||
|
|
17e7f0e0d8 | ||
|
|
c30ad4fb33 | ||
|
|
666512d54d | ||
|
|
381e9e9f76 | ||
|
|
8f63621d6d | ||
|
|
e41d05868a | ||
|
|
8013a40e52 | ||
|
|
3db330de66 | ||
|
|
4e73e24b99 | ||
|
|
dcd5b8fd61 | ||
|
|
3fb053fa79 | ||
|
|
cfd16aa079 | ||
|
|
1bf9a858ad | ||
|
|
5fc660484c | ||
|
|
c194f57f09 | ||
|
|
40a35ce067 | ||
|
|
4e58552c03 | ||
|
|
280f9c5790 | ||
|
|
d861d66206 | ||
|
|
3348afe219 | ||
|
|
f8ae6599c5 | ||
|
|
ffb016af45 | ||
|
|
51586cdaa1 | ||
|
|
c9000f74bc | ||
|
|
b06570ed45 | ||
|
|
b4682d16fb | ||
|
|
6c08a98418 | ||
|
|
2d30d190f4 | ||
|
|
8cce680bd7 | ||
|
|
9a60aa2c28 | ||
|
|
03b4cd1686 | ||
|
|
5bdc61aa09 | ||
|
|
04db2bfa79 | ||
|
|
5a0365ab35 | ||
|
|
4cdee8b621 | ||
|
|
248829141f | ||
|
|
131646b01a | ||
|
|
3cf40d9cd0 | ||
|
|
0adde7b9fc | ||
|
|
9888ef2da4 | ||
|
|
cadedeba7b | ||
|
|
f5489621fa | ||
|
|
77aa7f8fe0 | ||
|
|
760e3a9846 | ||
|
|
d0436838de | ||
|
|
c204b19caa | ||
|
|
731f1f9636 | ||
|
|
edae1720dc | ||
|
|
79afcba61d | ||
|
|
559c95a34a | ||
|
|
9c7907ee21 | ||
|
|
3243455ec5 | ||
|
|
5574a93a5e | ||
|
|
24583de74e | ||
|
|
530ad8484e | ||
|
|
421cf55af7 | ||
|
|
2fc1f3a186 | ||
|
|
092403c529 | ||
|
|
2d82c32f02 | ||
|
|
53135a03da | ||
|
|
02233f627e | ||
|
|
662ad829ec | ||
|
|
cb34733abe | ||
|
|
aa92234e5d | ||
|
|
1fd2453a50 | ||
|
|
4eea686e8b | ||
|
|
daa566329c | ||
|
|
b202bdfdb0 | ||
|
|
534eda8015 | ||
|
|
b2bb2cd003 | ||
|
|
97dbefdb6f | ||
|
|
789406c863 | ||
|
|
5341a2e94d | ||
|
|
c2d8742ac0 | ||
|
|
5b1c984552 | ||
|
|
6d6d3b7ccb | ||
|
|
3656b222ca | ||
|
|
60b6b603b7 | ||
|
|
70c66dbd0f | ||
|
|
d85daa9ecc | ||
|
|
3f55ae2368 | ||
|
|
7e4fcf5399 | ||
|
|
571d090344 | ||
|
|
55652f84e0 | ||
|
|
6e25da6c08 | ||
|
|
0269fc7085 | ||
|
|
76469f7898 | ||
|
|
3ff8a695b6 | ||
|
|
cb100d118f | ||
|
|
0c22634601 | ||
|
|
54ae9f4d26 | ||
|
|
e8fab533db | ||
|
|
0e05bd66d8 | ||
|
|
e635b2b015 | ||
|
|
383b720cc5 | ||
|
|
b6572720af | ||
|
|
354ab8e148 | ||
|
|
4f4a53c3be | ||
|
|
2a50207045 | ||
|
|
03f544638e | ||
|
|
bfe45f09bc | ||
|
|
8e30cd583a | ||
|
|
e86803df02 | ||
|
|
184751cf9e | ||
|
|
d99522583e | ||
|
|
cafe4c5bf8 | ||
|
|
623c4ea113 | ||
|
|
3ca2586bac | ||
|
|
36d6a82da5 | ||
|
|
b1444cf352 | ||
|
|
4d6d8d9379 | ||
|
|
1687ba8be9 | ||
|
|
59402a0837 | ||
|
|
f5b6611474 | ||
|
|
262fa4b288 | ||
|
|
0fdbb7f52c | ||
|
|
bad230fa7b | ||
|
|
dc02192d04 | ||
|
|
c23e52cdf6 | ||
|
|
b2b4aba0f3 | ||
|
|
bbb004a2aa | ||
|
|
4628d52210 | ||
|
|
99781b4171 | ||
|
|
04a1433f02 | ||
|
|
d88617109f | ||
|
|
cbed2c46f4 | ||
|
|
9d069c39d9 | ||
|
|
c9d3a90cd0 | ||
|
|
9ea2c5f46d | ||
|
|
f18a48653f | ||
|
|
ff68f03aed | ||
|
|
a16fee9fb1 | ||
|
|
2fcfc01542 | ||
|
|
fe9cd6c0c9 | ||
|
|
e7838cac54 | ||
|
|
4dc4323e2e | ||
|
|
6f5b8adadd | ||
|
|
744e1d83cc | ||
|
|
47e39f1f77 | ||
|
|
6d814d416b | ||
|
|
f8ed85b7c9 | ||
|
|
db6e40ee88 | ||
|
|
8359aa4807 | ||
|
|
697c5b5f4b | ||
|
|
f3f91536b5 | ||
|
|
687693d2a5 | ||
|
|
eb55693871 | ||
|
|
abf4fc391f | ||
|
|
4da262a902 | ||
|
|
a91413c270 | ||
|
|
6c56a77967 | ||
|
|
224e24d5e9 | ||
|
|
0b78d290ec | ||
|
|
a839af76c5 | ||
|
|
15da6044dd | ||
|
|
0f5d3a0370 | ||
|
|
1c8067545d | ||
|
|
484d6da85e | ||
|
|
0ecc656055 | ||
|
|
af0c8fc953 | ||
|
|
5fe983f4ef | ||
|
|
8b4acf0508 | ||
|
|
0c63d4f1cd | ||
|
|
b1e4cb90a7 | ||
|
|
159a1fa343 | ||
|
|
db862441b0 | ||
|
|
de5479a06c | ||
|
|
d92523e133 | ||
|
|
cb7cec20a8 | ||
|
|
1c673382b4 | ||
|
|
14abed8f85 | ||
|
|
f6d12cae2a | ||
|
|
646490ac2a | ||
|
|
8e13432318 | ||
|
|
b683bf7ce3 | ||
|
|
64a7cb8196 | ||
|
|
d14ccee0b4 | ||
|
|
171b57b381 | ||
|
|
d6042b4006 | ||
|
|
c1cf539c45 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,5 @@
|
||||
# OSACA specific files and folders
|
||||
osaca/taxCalc/
|
||||
*.*.pickle
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
19
.travis.yml
19
.travis.yml
@@ -1,13 +1,17 @@
|
||||
sudo: false
|
||||
os: linux
|
||||
language: python
|
||||
python:
|
||||
- "3.5"
|
||||
- "3.6"
|
||||
# Python 3.7 not working yet
|
||||
# - "3.7"
|
||||
- "3.7"
|
||||
- "3.8"
|
||||
- "3.9"
|
||||
before_install:
|
||||
# - pip install tox-travis
|
||||
- pip install codecov
|
||||
- pip install bs4
|
||||
- pip install pygraphviz
|
||||
- pip install kerncraft
|
||||
install:
|
||||
- pip install -e .
|
||||
cache: pip
|
||||
@@ -19,12 +23,13 @@ after_success:
|
||||
- codecov
|
||||
deploy:
|
||||
provider: pypi
|
||||
user: "__token__"
|
||||
username: "__token__"
|
||||
password:
|
||||
secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
|
||||
distributions: sdist
|
||||
distributions: "sdist bdist_wheel"
|
||||
skip_existing: true
|
||||
skip_cleanup: true
|
||||
cleanup: false
|
||||
on:
|
||||
repo: RRZE-HPC/OSACA
|
||||
tag: true
|
||||
branch: master
|
||||
tags: true
|
||||
|
||||
@@ -2,6 +2,8 @@ include README.rst
|
||||
include LICENSE
|
||||
include tox.ini
|
||||
recursive-include osaca/data/ *.yml
|
||||
recursive-include osaca/data/ *.pickle
|
||||
include osaca/data/_build_cache.py
|
||||
include examples/*
|
||||
recursive-include tests *.py *.out
|
||||
recursive-include tests/testfiles/ *
|
||||
|
||||
353
README.rst
353
README.rst
@@ -1,4 +1,4 @@
|
||||
.. image:: doc/osaca-logo.png
|
||||
.. image:: docs/img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
@@ -6,31 +6,35 @@ OSACA
|
||||
=====
|
||||
|
||||
Open Source Architecture Code Analyzer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
--------------------------------------
|
||||
|
||||
This tool allows automatic instruction fetching of assembly code,
|
||||
auto-generating of testcases for assembly instructions creating latency
|
||||
and throughput benchmarks on a specific instruction form and throughput
|
||||
analysis and throughput prediction for a innermost loop kernel.
|
||||
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
|
||||
|
||||
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?token=393L6z2HEXNiGLtZ43s6&branch=master
|
||||
:target: https://travis-ci.com/RRZE-HPC/OSACA
|
||||
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?branch=master
|
||||
:target: https://travis-ci.com/github/RRZE-HPC/OSACA
|
||||
:alt: Build Status
|
||||
|
||||
.. ..image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
|
||||
.. :target: https://landscape.io/github/RRZE-HPC/OSACA/master
|
||||
.. :alt: Code Health
|
||||
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
|
||||
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
|
||||
:alt: Code Coverage
|
||||
|
||||
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=v3
|
||||
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=v3
|
||||
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
|
||||
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
|
||||
:alt: Documentation Status
|
||||
|
||||
.. image:: https://img.shields.io/badge/read-the_docs-blue
|
||||
:target: https://osaca.readthedocs.io/
|
||||
:alt: Docs
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/ambv/black
|
||||
:alt: Code Style
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Installation
|
||||
~~~~~~~~~~~~
|
||||
------------
|
||||
On most systems with Python pip and setuptools installed, just run:
|
||||
|
||||
.. code:: bash
|
||||
@@ -48,19 +52,23 @@ To build OSACA from source, clone this repository using ``git clone https://gith
|
||||
After installation, OSACA can be started with the command ``osaca`` in the CLI.
|
||||
|
||||
Dependencies:
|
||||
~~~~~~~~~~~~~~~
|
||||
-------------
|
||||
Additional requirements are:
|
||||
|
||||
- `Python3 <https://www.python.org/>`_
|
||||
- `Graphviz <https://www.graphviz.org/>`_ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`_ for marker insertion
|
||||
- `ibench <https://github.com/hofm/ibench>`_ for throughput/latency measurements
|
||||
- `Python3 <https://www.python.org/>`__
|
||||
- `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
|
||||
|
||||
Optional requirements are:
|
||||
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
|
||||
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
|
||||
- `BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ for scraping instruction form information for the x86 ISA (experimental)
|
||||
|
||||
Design
|
||||
======
|
||||
A schematic design of OSACA's workflow is shown below:
|
||||
|
||||
.. image:: doc/osaca-workflow.png
|
||||
.. image:: docs/img/osaca-workflow.png
|
||||
:alt: OSACA workflow
|
||||
:width: 80%
|
||||
|
||||
@@ -71,26 +79,39 @@ The usage of OSACA can be listed as:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
osaca [-h] [-V] [--arch ARCH] [--export-graph GRAPHNAME] FILEPATH
|
||||
osaca [-h] [-V] [--arch ARCH] [--fixed] [--db-check]
|
||||
[--import MICROBENCH] [--insert-marker]
|
||||
[--export-graph GRAPHNAME] [--ignore-unknown] [--verbose]
|
||||
FILEPATH
|
||||
|
||||
-h, --help
|
||||
prints out the help message.
|
||||
-V, --version
|
||||
shows the program’s version number.
|
||||
--arch ARCH
|
||||
needs to be replaced with the wished architecture abbreviation.
|
||||
This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``).
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1`` for AMD Zen (17h family) architecture.
|
||||
Furthermore, `VULCAN` for Marvell`s ARM-based ThunderX2 architecture is available.
|
||||
--insert-marker
|
||||
OSACA calls the Kerncraft module for the interactively insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ marker in suggested assembly blocks.
|
||||
needs to be replaced with the target architecture abbreviation.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2 architecture is available.
|
||||
--fixed
|
||||
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
|
||||
Otherwise, OSACA will print out the optimal port utilization for the kernel.
|
||||
--db-check
|
||||
Run a sanity check on the database specified by "--arch".
|
||||
The output depends on the verbosity level.
|
||||
Keep in mind you have to provide a (dummy) filename in anyway.
|
||||
Keep in mind you have to provide an existing (dummy) filename anyway.
|
||||
--import MICROBENCH
|
||||
Import a given microbenchmark output file into the corresponding architecture instruction database.
|
||||
Define the type of microbenchmark either as "ibench" or "asmbench".
|
||||
--insert-marker
|
||||
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
|
||||
--export-graph EXPORT_PATH
|
||||
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
|
||||
After the file was created, you can convert it to a PDF file using dot: `dot -Tpdf osaca_dg.dot -o osaca_dependency_graph.pdf`
|
||||
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
|
||||
--ignore-unknown
|
||||
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
|
||||
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
|
||||
-v, --verbose
|
||||
Increases verbosity level
|
||||
|
||||
The **FILEPATH** describes the filepath to the file to work with and is always necessary
|
||||
|
||||
@@ -99,50 +120,184 @@ ______________________
|
||||
Hereinafter OSACA's scope of function will be described.
|
||||
|
||||
Throughput & Latency analysis
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ZEN`` or ``VULCAN``.
|
||||
-----------------------------
|
||||
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
|
||||
|
||||
For extracting the right kernel, one has to mark it beforehand.
|
||||
Currently, only the detechtion of markers in the assembly code and therefore the analysis of assemly files is supported by OSACA.
|
||||
.. code-block:: bash
|
||||
|
||||
**Assembly code**
|
||||
osaca --arch ARCH [--fixed] [--ignore-unknown]
|
||||
[--export-graph EXPORT_PATH]
|
||||
file
|
||||
|
||||
The ``file`` parameter specifies the target assembly file and is always mandatory.
|
||||
|
||||
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
|
||||
|
||||
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
|
||||
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
|
||||
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
|
||||
This behavior can be enforced by using the ``--fixed`` flag.
|
||||
|
||||
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
|
||||
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
|
||||
This is done so the user does not miss out on this unrecognized instruction and might assume an incorrect runtime prediction.
|
||||
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
|
||||
|
||||
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
|
||||
The tool highlights all LCDs and the CP.
|
||||
The graph generation is done by running OSACA with the ``--export-graph EXPORT_GRAPH`` flag.
|
||||
OSACA either stores the DOT file at the filepath specified by ``EXPORT_GRAPH`` or uses the default filename "osaca_dg.dot" in the current working directory.
|
||||
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
|
||||
graph.pdf`` to generate a PDF document.
|
||||
|
||||
Marker insertion
|
||||
----------------
|
||||
For extracting the right kernel, one has to mark it beforehand.
|
||||
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
|
||||
|
||||
Marking a kernel means inserting the byte markers in the assembly file before and after the loop.
|
||||
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
|
||||
For the convenience of the user, in x86 assembly IACA byte markers are used.
|
||||
IACA requires byte markers since it operates on opcode-level.
|
||||
To provide a trade-off between reusability for such tool and convenient usability, OSACA supports both byte markers and comment line markers.
|
||||
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
|
||||
|
||||
**x86 Byte Markers**
|
||||
|
||||
.. code-block:: gas
|
||||
|
||||
movl $111,%ebx #IACA/OSACA START MARKER
|
||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||
Loop:
|
||||
# ...
|
||||
movl $222,%ebx #IACA/OSACA END MARKER
|
||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||
|
||||
**AArch64 Byte Markers**
|
||||
x86 markers
|
||||
^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
.. code-block:: asm
|
||||
|
||||
mov x1, #111 // OSACA START
|
||||
.byte 213,3,32,31 // OSACA START
|
||||
\\ ...
|
||||
mov x1, #222 // OSACA END
|
||||
.byte 213,3,32,31 // OSACA END
|
||||
movl $111,%ebx #IACA/OSACA START MARKER
|
||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
movl $222,%ebx #IACA/OSACA END MARKER
|
||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||
|
||||
.. Include new measurements into the data file
|
||||
.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
.. Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value. The handling of ibench is shortly described in the example section below.
|
||||
**Comment line markers**
|
||||
|
||||
Insert IACA markers
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
Using the ``--insert-marker`` flags for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
|
||||
Note that this currrently only works for x86 loop kernels
|
||||
.. code-block:: asm
|
||||
|
||||
Example
|
||||
=======
|
||||
# OSACA-BEGIN
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
# OSACA-END
|
||||
|
||||
AArch64 markers
|
||||
^^^^^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
::
|
||||
|
||||
mov x1, #111 // OSACA START
|
||||
.byte 213,3,32,31 // OSACA START
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
mov x1, #222 // OSACA END
|
||||
.byte 213,3,32,31 // OSACA END
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
::
|
||||
|
||||
// OSACA-BEGIN
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
// OSACA-END
|
||||
|
||||
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
|
||||
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
|
||||
|
||||
Benchmark import
|
||||
----------------
|
||||
OSACA supports the automatic integration of new instruction forms by parsing the output of the micro-
|
||||
benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
|
||||
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --import MICROBENCH file
|
||||
|
||||
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
|
||||
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added and file must be the path to the generated output file of the benchmark.
|
||||
The format of this file has to match either the basic command line output of ibench, e.g.,
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
|
||||
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
or the command line output of asmbench including the name of the instruction form in a separate line at the
|
||||
beginning, e.g.:
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]
|
||||
Latency: 4.00 cycle
|
||||
Throughput: 0.50 cycle
|
||||
|
||||
|
||||
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
|
||||
|
||||
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
|
||||
|
||||
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
|
||||
|
||||
* The second part of the name are the operands. Each operand must be separated from another operand by the character "``_``".
|
||||
|
||||
* For each **x86** operand, one of the following symbols must be used:
|
||||
|
||||
* "``r``" for general purpose registers (rax, edi, r9, ...)
|
||||
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
|
||||
|
||||
* For each **AArch64** operand, one of the following symbols must be used:
|
||||
|
||||
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
|
||||
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
|
||||
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
|
||||
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
|
||||
|
||||
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
|
||||
|
||||
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
|
||||
|
||||
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
|
||||
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
|
||||
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
|
||||
|
||||
* If there is only one operand, it is considered as source operand
|
||||
|
||||
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
|
||||
destination operand, all others are considered as source operands.
|
||||
|
||||
Database check
|
||||
--------------
|
||||
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
|
||||
OSACA provides a database sanity check using the --db-check flag. It can be executed via:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --db-check [-v] file
|
||||
|
||||
``ARCH`` defines the abbreviation of the target architecture of the database to check.
|
||||
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
|
||||
When called, OSACA prints a summary of database information containing the amount of missing throughput values, latency values or μ-ops assignments for an instruction form.
|
||||
Furthermore, it shows the amount of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
|
||||
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
|
||||
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
|
||||
|
||||
Examples
|
||||
========
|
||||
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:
|
||||
|
||||
.. code-block:: c
|
||||
@@ -158,7 +313,7 @@ The code shows a simple scalar multiplication of a vector ``b`` and a floating-p
|
||||
The result is written in vector ``a``.
|
||||
After including the OSACA byte marker into the assembly, one can start the analysis typing
|
||||
|
||||
.. code:: bash
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch CSX PATH/TO/FILE
|
||||
|
||||
@@ -166,7 +321,7 @@ in the command line.
|
||||
|
||||
The output is:
|
||||
|
||||
.. code-block::
|
||||
::
|
||||
|
||||
Open Source Architecture Code Analyzer (OSACA) - v0.3
|
||||
Analyzed file: scale.s.csx.O3.s
|
||||
@@ -178,63 +333,31 @@ The output is:
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------
|
||||
170 | | | | | | | | | .L22:
|
||||
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | | | 0.50 | 0.50 | 1.00 | | | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | | addq $32, %rax
|
||||
174 | 0.25 | 0.25 | | | | 0.25 | 0.25 | | cmpq %rax, %r14
|
||||
175 | | | | | | | | | * jne .L22
|
||||
Combined Analysis Report
|
||||
-----------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
170 | | | | | | | | || | | .L22:
|
||||
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
|
||||
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
|
||||
175 | | | | | | | | || | | * jne .L22
|
||||
|
||||
1.00 1.00 1.00 0.50 1.00 0.50 1.00 0.50 0.50
|
||||
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
|
||||
|
||||
|
||||
Latency Analysis Report
|
||||
-----------------------
|
||||
171 | 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
|
||||
13.0
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
173 | 1.0 | addq $32, %rax | [173]
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
173 | 1.0 | addq $32, %rax | [173]
|
||||
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
|
||||
Furthermore, the two columns on the right show the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel.
|
||||
At the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right.
|
||||
|
||||
|
||||
It shows the whole kernel together with the average port pressure of each instruction form and the overall port binding.
|
||||
Furthermore, the critical path of the loop kernel and all loop-carried dependencies, each with a list of line numbers being part of this dependency chain on the right.
|
||||
|
||||
.. For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node, so there is no other workload falsifying the results. For the correct function of ibench the benchmark files from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the subdirectory’s name and the shared objects. For running the tests the frequencies of all cores must set to a constant value and this has to be given as an argument together with the directory of the shared objects to ibench, e.g.:
|
||||
|
||||
.. .. code:: bash
|
||||
|
||||
./ibench ./AVX 2.2
|
||||
|
||||
.. for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz. We get an output like:
|
||||
|
||||
.. .. code:: bash
|
||||
|
||||
Using frequency 2.20GHz.
|
||||
add-mem_imd-TP: 1.023 (clock cycles) [DEBUG - result: 1.000000]
|
||||
add-mem_imd: 6.050 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
.. The debug output as resulting value of register ``xmm0`` is additional validation information depending on the executed instruction form meant for the user and is not considered by OSACA. The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just ``-i`` and the specify micro architecture:
|
||||
|
||||
.. .. code-block:: bash
|
||||
|
||||
osaca --arch IVB -i PATH/TO/IBENCH-OUTPUTFILE
|
||||
|
||||
.. For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand. We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
|
||||
|
||||
.. .. code:: bash
|
||||
|
||||
addl-mem_imd,1.0,6.0,"(0.33,0.33,1.00,1.00,1.00,0.33)"
|
||||
|
||||
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
|
||||
|
||||
Credits
|
||||
=======
|
||||
@@ -242,4 +365,4 @@ Implementation: Jan Laukemann
|
||||
|
||||
License
|
||||
=======
|
||||
`AGPL-3.0 </LICENSE>`_
|
||||
`AGPL-3.0 </LICENSE>`__
|
||||
|
||||
3
codecov.yml
Normal file
3
codecov.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
ignore:
|
||||
- "tests" # ignore test folder and all its contents
|
||||
- "**/__init__.py" # ignore init files
|
||||
@@ -1,134 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="459.03406mm"
|
||||
height="217.28152mm"
|
||||
viewBox="0 0 1626.4986 769.8951"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.92.1 r15371"
|
||||
sodipodi:docname="OSACA-Logo_05.svg"
|
||||
inkscape:export-filename="/home/cip/2014/ol68umur/Desktop/logo/OSACA-Logo_03b.png"
|
||||
inkscape:export-xdpi="1104"
|
||||
inkscape:export-ydpi="1104">
|
||||
<defs
|
||||
id="defs4" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="0.5"
|
||||
inkscape:cx="858.02629"
|
||||
inkscape:cy="511.52256"
|
||||
inkscape:document-units="px"
|
||||
inkscape:current-layer="layer1"
|
||||
showgrid="false"
|
||||
fit-margin-top="0.5"
|
||||
fit-margin-left="0.5"
|
||||
fit-margin-right="0.5"
|
||||
fit-margin-bottom="0.5"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1081"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="49"
|
||||
inkscape:window-maximized="1" />
|
||||
<metadata
|
||||
id="metadata7">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Ebene 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(263.39161,902.34721)">
|
||||
<g
|
||||
id="g4583">
|
||||
<text
|
||||
transform="scale(1.0341487,0.96697893)"
|
||||
id="text4147"
|
||||
y="-333.24573"
|
||||
x="542.02954"
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
xml:space="preserve"><tspan
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk'"
|
||||
y="-333.24573"
|
||||
x="542.02954"
|
||||
id="tspan4149"
|
||||
sodipodi:role="line">ACA</tspan></text>
|
||||
<text
|
||||
transform="scale(1.0341487,0.96697893)"
|
||||
id="text4147-3"
|
||||
y="-417.88809"
|
||||
x="-266.53079"
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:0%;font-family:'Open Sans book';-inkscape-font-specification:'Open Sans book, ';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
xml:space="preserve"><tspan
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:374.99996948px;line-height:1.25;font-family:'Futura Bk';-inkscape-font-specification:'Futura Bk';stroke-width:1px"
|
||||
y="-417.88809"
|
||||
x="-266.53079"
|
||||
id="tspan4149-6"
|
||||
sodipodi:role="line">OS</tspan></text>
|
||||
<g
|
||||
id="g4571">
|
||||
<rect
|
||||
style="fill:#4dd9ff;fill-opacity:1;stroke:none;stroke-width:4.99469042;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-6-3"
|
||||
width="501.93356"
|
||||
height="46.874996"
|
||||
x="-900.57556"
|
||||
y="-486.72452"
|
||||
transform="rotate(90)" />
|
||||
<rect
|
||||
style="fill:#7fff00;fill-opacity:1;stroke:none;stroke-width:7.58152723;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-5"
|
||||
width="676.86932"
|
||||
height="46.874996"
|
||||
x="134.22374"
|
||||
y="-425.20099"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
<rect
|
||||
style="fill:#f2ff19;fill-opacity:1;stroke:none;stroke-width:5.0525918;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-7-6"
|
||||
width="456.4649"
|
||||
height="46.874996"
|
||||
x="-682.95709"
|
||||
y="-363.67743"
|
||||
transform="rotate(90)" />
|
||||
<rect
|
||||
style="fill:#8071ff;fill-opacity:1;stroke:none;stroke-width:5.53460026;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-3-2"
|
||||
width="360.71481"
|
||||
height="46.874996"
|
||||
x="322.24228"
|
||||
y="-548.24811"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
<rect
|
||||
style="fill:#ff2a2a;fill-opacity:1;stroke:none;stroke-width:5.43282366;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
id="rect4162-67-9"
|
||||
width="347.57031"
|
||||
height="46.874996"
|
||||
x="398.60016"
|
||||
y="-302.15387"
|
||||
transform="matrix(0,-1,-1,0,0,0)" />
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 5.4 KiB |
Binary file not shown.
67
docs/conf.py
Normal file
67
docs/conf.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys


# The current directory must be on sys.path before the import below, because
# the version helper is imported as a top-level module.
sys.path.insert(0, os.path.abspath('.'))
from version_from_src import get_version  # noqa: E402

# -- Project information -----------------------------------------------------

project = 'OSACA'
copyright = '2020, Jan Laukemann'
author = 'Jan Laukemann'
html_logo = 'img/osaca-logo.png'

# The full version, including alpha/beta/rc tags.
# Evaluated once and used for both settings so they cannot diverge.
release = get_version()
version = release

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinx.ext.todo',
    'sphinx.ext.viewcode',
]

add_module_names = False
source_suffix = '.rst'
master_doc = 'index'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# e.g., 'alabaster', 'sphinx_rtd_theme'
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []
htmlhelp_basename = 'osaca_doc'
html_sidebars = {
    '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'],
}

autodoc_member_order = 'bysource'
|
||||
BIN
docs/img/osaca-logo.pdf
Normal file
BIN
docs/img/osaca-logo.pdf
Normal file
Binary file not shown.
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 45 KiB |
|
Before Width: | Height: | Size: 206 KiB After Width: | Height: | Size: 206 KiB |
14
docs/index.rst
Normal file
14
docs/index.rst
Normal file
@@ -0,0 +1,14 @@
|
||||
OSACA -- Open Source Architecture Code Analyzer
|
||||
=================================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
sphinx/home.rst
|
||||
sphinx/api.rst
|
||||
|
||||
.. image:: /img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
7
docs/sphinx/api.rst
Normal file
7
docs/sphinx/api.rst
Normal file
@@ -0,0 +1,7 @@
|
||||
API Reference
|
||||
=============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
osaca
|
||||
364
docs/sphinx/home.rst
Normal file
364
docs/sphinx/home.rst
Normal file
@@ -0,0 +1,364 @@
|
||||
.. image:: /img/osaca-logo.png
|
||||
:alt: OSACA logo
|
||||
:width: 80%
|
||||
|
||||
OSACA
|
||||
=====
|
||||
|
||||
Open Source Architecture Code Analyzer
|
||||
--------------------------------------
|
||||
|
||||
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
|
||||
|
||||
.. image:: https://travis-ci.org/RRZE-HPC/OSACA.svg?branch=master
|
||||
:target: https://travis-ci.org/RRZE-HPC/OSACA
|
||||
:alt: Build Status
|
||||
|
||||
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
|
||||
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
|
||||
:alt: Code Coverage
|
||||
|
||||
.. image:: https://readthedocs.org/projects/osaca/badge/?version=latest
|
||||
:target: https://osaca.readthedocs.io/en/latest/?badge=latest
|
||||
:alt: Documentation Status
|
||||
|
||||
.. image:: https://img.shields.io/badge/read-the_docs-blue
|
||||
:target: https://osaca.readthedocs.io/
|
||||
:alt: Docs
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/ambv/black
|
||||
:alt: Code Style
|
||||
|
||||
Getting started
|
||||
===============
|
||||
|
||||
Installation
|
||||
------------
|
||||
On most systems with python pip and setuptools installed, just run:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install --user osaca
|
||||
|
||||
for the latest release.
|
||||
|
||||
To build OSACA from source, clone this repository using ``git clone https://github.com/RRZE-HPC/OSACA`` and run in the root directory:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
python ./setup.py install
|
||||
|
||||
After installation, OSACA can be started with the command ``osaca`` in the CLI.
|
||||
|
||||
Dependencies:
|
||||
-------------
|
||||
Additional requirements are:
|
||||
|
||||
- `Python3 <https://www.python.org/>`__
|
||||
- `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
|
||||
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
|
||||
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
|
||||
|
||||
Design
|
||||
======
|
||||
A schematic design of OSACA's workflow is shown below:
|
||||
|
||||
.. image:: /img/osaca-workflow.png
|
||||
:alt: OSACA workflow
|
||||
:width: 80%
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
The usage of OSACA can be listed as:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
osaca [-h] [-V] [--arch ARCH] [--fixed] [--db-check]
|
||||
[--import MICROBENCH] [--insert-marker]
|
||||
[--export-graph EXPORT_PATH] [--ignore-unknown] [--verbose]
|
||||
FILEPATH
|
||||
|
||||
-h, --help
|
||||
prints out the help message.
|
||||
-V, --version
|
||||
shows the program’s version number.
|
||||
--arch ARCH
|
||||
needs to be replaced with the target architecture abbreviation.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2 architecture is available.
|
||||
--fixed
|
||||
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
|
||||
Otherwise, OSACA will print out the optimal port utilization for the kernel.
|
||||
--db-check
|
||||
Run a sanity check on the database specified by ``--arch``.
|
||||
The output depends on the verbosity level.
|
||||
Keep in mind you have to provide an existing (dummy) filename anyway.
|
||||
--import MICROBENCH
|
||||
Import a given microbenchmark output file into the corresponding architecture instruction database.
|
||||
Define the type of microbenchmark either as "ibench" or "asmbench".
|
||||
--insert-marker
|
||||
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`__ byte markers or OSACA AArch64 byte markers in suggested assembly blocks.
|
||||
--export-graph EXPORT_PATH
|
||||
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
|
||||
After the file was created, you can convert it to a PDF file using `dot <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__.
|
||||
--ignore-unknown
|
||||
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
|
||||
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
|
||||
-v, --verbose
|
||||
Increases verbosity level
|
||||
|
||||
The **FILEPATH** describes the filepath to the file to work with and is always necessary.
|
||||
|
||||
______________________
|
||||
|
||||
Hereinafter OSACA's scope of function will be described.
|
||||
|
||||
Throughput & Latency analysis
|
||||
-----------------------------
|
||||
As main functionality of OSACA, the tool starts the analysis on a marked assembly file by running the following command with one or more of the optional parameters:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH [--fixed] [--ignore-unknown]
|
||||
[--export-graph EXPORT_PATH]
|
||||
file
|
||||
|
||||
The ``file`` parameter specifies the target assembly file and is always mandatory.
|
||||
|
||||
The parameter ``ARCH`` is positional for the analysis and must be replaced by the target architecture abbreviation.
|
||||
|
||||
OSACA assumes an optimal scheduling for all instructions and assumes the processor to be able to schedule instructions in a way that it achieves a minimal reciprocal throughput.
|
||||
However, in older versions (<=v0.2.2) of OSACA, a fixed probability for port utilization was assumed.
|
||||
This means, instructions with *N* available ports for execution were scheduled with a probability of *1/N* to each of the ports.
|
||||
This behavior can be enforced by using the ``--fixed`` flag.
|
||||
|
||||
If one or more instruction forms are unknown to OSACA, it refuses to print an overall throughput, CP and
|
||||
LCD analysis and marks all unknown instruction forms with ``X`` next to the mnemonic.
|
||||
This is done so the user does not miss out on this unrecognized instruction and might assume an incorrect runtime prediction.
|
||||
To force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms, the flag ``--ignore-unknown`` can be specified.
|
||||
|
||||
To get a visualization of the analyzed kernel and its dependency chains, OSACA provides the option to additionally produce a graph as DOT file, which represents the kernel and all register dependencies inside of it.
|
||||
The tool highlights all LCDs and the CP.
|
||||
The graph generation is done by running OSACA with the ``--export-graph EXPORT_PATH`` flag.
|
||||
OSACA stores the DOT file either at the filepath specified by ``EXPORT_PATH`` or uses the default filename "osaca_dg.dot" in the current working directory.
|
||||
Subsequently, the DOT-graph can be adjusted in its appearance and converted to various output formats such as PDF, SVG, or PNG using the `dot command <https://graphviz.gitlab.io/_pages/pdf/dotguide.pdf>`__, e.g., ``dot -Tpdf osaca_dg.dot -o
|
||||
graph.pdf`` to generate a PDF document.
|
||||
|
||||
Marker insertion
|
||||
----------------
|
||||
For extracting the right kernel, one has to mark it beforehand.
|
||||
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
|
||||
|
||||
Marking a kernel means to insert the byte markers in the assembly file before and after the loop.
|
||||
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
|
||||
IACA requires byte markers since it operates on opcode-level.
|
||||
To provide a trade-off between reusability for such tools and convenient usability, OSACA supports both byte markers and comment line markers.
|
||||
While the byte markers for x86 are equivalent to IACA byte markers, the comment keywords ``OSACA-BEGIN`` and ``OSACA-END`` are based on LLVM-MCA's markers.
|
||||
|
||||
x86 markers
|
||||
^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
.. code-block:: asm
|
||||
|
||||
movl $111,%ebx #IACA/OSACA START MARKER
|
||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
movl $222,%ebx #IACA/OSACA END MARKER
|
||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
.. code-block:: asm
|
||||
|
||||
# OSACA-BEGIN
|
||||
.loop:
|
||||
# loop body
|
||||
jb .loop
|
||||
# OSACA-END
|
||||
|
||||
AArch64 markers
|
||||
^^^^^^^^^^^^^^^
|
||||
**Byte markers**
|
||||
|
||||
::
|
||||
|
||||
mov x1, #111 // OSACA START
|
||||
.byte 213,3,32,31 // OSACA START
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
mov x1, #222 // OSACA END
|
||||
.byte 213,3,32,31 // OSACA END
|
||||
|
||||
**Comment line markers**
|
||||
|
||||
::
|
||||
|
||||
// OSACA-BEGIN
|
||||
.loop:
|
||||
// loop body
|
||||
b.ne .loop
|
||||
// OSACA-END
|
||||
|
||||
OSACA in combination with Kerncraft provides a functionality for the automatic detection of possible loop kernels and inserting markers.
|
||||
This can be done by using the ``--insert-marker`` flag together with the path to the target assembly file and the target architecture.
|
||||
|
||||
Benchmark import
|
||||
----------------
|
||||
OSACA supports the automatic integration of new instruction forms by parsing the output of the
|
||||
micro-benchmark tools `asmbench <https://github.com/RRZE-HPC/asmbench>`__ and `ibench <https://github.com/RRZE-HPC/ibench>`__.
|
||||
This can be achieved by running OSACA with the command line option ``--import MICROBENCH``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --import MICROBENCH file
|
||||
|
||||
``MICROBENCH`` specifies one of the currently supported benchmark tools, i.e., "asmbench" or "ibench".
|
||||
``ARCH`` defines the abbreviation of the target architecture for which the instructions will be added and file must be the path to the generated output file of the benchmark.
|
||||
The format of this file has to match either the basic command line output of ibench, e.g.,
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]-TP: 0.500 (clock cycles) [DEBUG - result: 1.000000]
|
||||
[INSTRUCTION FORM]-LT: 4.000 (clock cycles) [DEBUG - result: 1.000000]
|
||||
|
||||
or the command line output of asmbench including the name of the instruction form in a separate line at the
|
||||
beginning, e.g.:
|
||||
|
||||
::
|
||||
|
||||
[INSTRUCTION FORM]
|
||||
Latency: 4.00 cycle
|
||||
Throughput: 0.50 cycle
|
||||
|
||||
|
||||
Note that there must be an empty line after each throughput measurement as part of the output so that one instruction form entry consists of four (4) lines.
|
||||
|
||||
To let OSACA import the instruction form with the correct operands, the naming conventions for the instruction form name must be followed:
|
||||
|
||||
* The first part of the name is the mnemonic and ends with the character "``-``" (not part of the mnemonic in the DB).
|
||||
|
||||
* The second part of the name consists of the operands. Each operand must be separated from another operand by the character "``_``".
|
||||
|
||||
* For each **x86** operand, one of the following symbols must be used:
|
||||
|
||||
* "``r``" for general purpose registers (rax, edi, r9, ...)
|
||||
* "``x``", "``y``", or "``z``" for xmm, ymm, or zmm registers, respectively
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more* than 1.
|
||||
|
||||
* For each **AArch64** operand, one of the following symbols must be used:
|
||||
|
||||
* "``w``", "``x``", "``b``", "``h``", "``s``", "``d``", or "``q``" for registers with the corresponding prefix.
|
||||
* "``v``" followed by a single character ("``b``", "``h``", "``s``", or "``d``") for vector registers with the corresponding lane width of the second character.
|
||||
If no second character is given, OSACA assumes a lane width of 64 bit (``d``) as default.
|
||||
* "``i``" for immediates
|
||||
* "``m``" for a memory address. Add "``b``" if the memory address contains a base register, "``o``" if it contains an offset,
|
||||
"``i``" if it contains an index register, and "``s``" if the index register additionally has a scale factor of *more*
|
||||
than 1. Add "``r``" if the address format uses pre-indexing and "``p``" if it uses post-indexing.
|
||||
|
||||
Valid instruction form examples for x86 are ``vaddpd-x_x_x``, ``mov-r_mboi``, and ``vfmadd213pd-mbis_y_y``.
|
||||
|
||||
Valid instruction form examples for AArch64 are ``fadd-vd_vd_v``, ``ldp-d_d_mo``, and ``fmov-s_i``.
|
||||
|
||||
Note that the options to define operands are limited, therefore, one might need to adjust the instruction forms in the architecture DB after importing.
|
||||
OSACA parses the output for an arbitrary number of instruction forms and adds them as entries to the architecture DB.
|
||||
The user must edit the ISA DB in case the instruction form shows irregular source and destination operands for its ISA syntax. OSACA applies the following rules by default:
|
||||
|
||||
* If there is only one operand, it is considered as source operand
|
||||
|
||||
* In case of multiple operands the target operand (depending on the ISA syntax the last or first one) is considered to be the
|
||||
destination operand, all others are considered as source operands.
|
||||
|
||||
Database check
|
||||
--------------
|
||||
Since a manual adjustment of the ISA DB is currently indispensable when adding new instruction forms,
|
||||
OSACA provides a database sanity check using the ``--db-check`` flag. It can be executed via:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch ARCH --db-check [-v] file
|
||||
|
||||
``ARCH`` defines the abbreviation of the target architecture of the database to check.
|
||||
The ``file`` argument needs to be specified as it is positional but may be any existing dummy path.
|
||||
When called, OSACA prints a summary of database information containing the amount of missing throughput values, latency values or μ-ops assignments for an instruction form.
|
||||
Furthermore, it shows the amount of duplicate instruction forms in both the architecture DB and the ISA DB and checks how many instruction forms in the ISA DB are non-existent in the architecture DB.
|
||||
Finally, it checks via simple heuristics how many of the instruction forms contained in the architecture DB might miss an ISA DB entry.
|
||||
Running the database check including the ``-v`` verbosity flag, OSACA prints in addition the specific name of the identified instruction forms so that the user can check the mentioned incidents.
|
||||
|
||||
Examples
|
||||
========
|
||||
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
double a[N], b[N];
|
||||
double s;
|
||||
|
||||
// loop
|
||||
for(int i = 0; i < N; ++i)
|
||||
a[i] = s * b[i];
|
||||
|
||||
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
|
||||
The result is written in vector ``a``.
|
||||
After including the OSACA byte marker into the assembly, one can start the analysis by typing
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
osaca --arch CSX PATH/TO/FILE
|
||||
|
||||
in the command line.
|
||||
|
||||
The output is:
|
||||
|
||||
::
|
||||
|
||||
Open Source Architecture Code Analyzer (OSACA) - v0.3
|
||||
Analyzed file: scale.s.csx.O3.s
|
||||
Architecture: csx
|
||||
Timestamp: 2019-10-03 23:36:21
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
-----------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
170 | | | | | | | | || | | .L22:
|
||||
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
|
||||
172 | | | 0.50 | 0.50 | 1.00 | | | || 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
|
||||
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | || | 1.0 | addq $32, %rax
|
||||
174 | 0.00 | 0.00 | | | | 0.50 | 0.50 | || | | cmpq %rax, %r14
|
||||
175 | | | | | | | | || | | * jne .L22
|
||||
|
||||
0.75 0.75 1.00 0.50 1.00 0.50 1.00 0.75 0.75 13.0 1.0
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
173 | 1.0 | addq $32, %rax | [173]
|
||||
|
||||
|
||||
It shows the whole kernel together with the optimized port pressure of each instruction form and the overall port binding.
|
||||
Furthermore, the two columns on the right show the critical path (CP) and the longest loop-carried dependency (LCD) of the loop kernel.
|
||||
In the bottom, all loop-carried dependencies are shown, each with a list of line numbers being part of this dependency chain on the right.
|
||||
|
||||
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
|
||||
|
||||
Credits
|
||||
=======
|
||||
Implementation: Jan Laukemann
|
||||
|
||||
License
|
||||
=======
|
||||
`AGPL-3.0 </LICENSE>`__
|
||||
20
docs/sphinx/osaca.api.rst
Normal file
20
docs/sphinx/osaca.api.rst
Normal file
@@ -0,0 +1,20 @@
|
||||
osaca.api package
|
||||
=================
|
||||
Provides interfaces to other tools.
|
||||
|
||||
osaca.api.kerncraft\_interface module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: osaca.api.kerncraft_interface
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.api
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
44
docs/sphinx/osaca.parser.rst
Normal file
44
docs/sphinx/osaca.parser.rst
Normal file
@@ -0,0 +1,44 @@
|
||||
osaca.parser package
|
||||
====================
|
||||
Parser module for parsing the assembly code.
|
||||
|
||||
osaca.parser.attr\_dict module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.attr_dict
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.base\_parser module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.base_parser
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.parser\_AArch64v81 module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.parser_AArch64v81
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.parser.parser\_x86att module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: osaca.parser.parser_x86att
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.parser
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
46
docs/sphinx/osaca.rst
Normal file
46
docs/sphinx/osaca.rst
Normal file
@@ -0,0 +1,46 @@
|
||||
osaca package
|
||||
=============
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
|
||||
osaca.api
|
||||
osaca.parser
|
||||
osaca.semantics
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
osaca.db\_interface module
|
||||
--------------------------
|
||||
|
||||
.. automodule:: osaca.db_interface
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.frontend module
|
||||
---------------------
|
||||
|
||||
.. automodule:: osaca.frontend
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.osaca module
|
||||
------------------
|
||||
|
||||
.. automodule:: osaca.osaca
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.utils module
|
||||
------------------
|
||||
|
||||
.. automodule:: osaca.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
52
docs/sphinx/osaca.semantics.rst
Normal file
52
docs/sphinx/osaca.semantics.rst
Normal file
@@ -0,0 +1,52 @@
|
||||
osaca.semantics package
|
||||
=======================
|
||||
Semantic part of OSACA.
|
||||
|
||||
osaca.semantics.arch\_semantics module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.arch_semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.hw\_model module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.hw_model
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.isa\_semantics module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.isa_semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.kernel\_dg module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.kernel_dg
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
osaca.semantics.marker\_utils module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: osaca.semantics.marker_utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: osaca.semantics
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
31
docs/version_from_src.py
Normal file
31
docs/version_from_src.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
def __read(*names, **kwargs):
    """
    Return the text content of a file addressed relative to this module.

    :param names: path components that are joined onto the directory of this file
    :param kwargs: may carry ``encoding``; ``"utf8"`` is used when it is absent
    :returns: str -- the complete content of the file
    """
    target = os.path.join(os.path.dirname(__file__), *names)
    encoding = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=encoding) as handle:
        content = handle.read()
    return content
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
def __find_version(*file_paths):
    """
    Extract the value assigned to ``__version__`` in the given file.

    :param file_paths: path components that are handed on to ``__read``
    :returns: str -- the text between the quotes of the first matching line
    :raises RuntimeError: if no ``__version__ = '...'`` line is found
    """
    content = __read(*file_paths)
    # re.M makes ``^`` match at the start of every line, not only at the file start.
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", content, re.M)
    if not found:
        raise RuntimeError('Unable to find version string.')
    return found.group(1)
|
||||
|
||||
|
||||
def get_version():
    """
    Look up the OSACA version declared in ``../osaca/__init__.py``.

    :returns: str -- the version string.
    """
    version = __find_version('../osaca/__init__.py')
    return version
|
||||
114
examples/README.md
Normal file
114
examples/README.md
Normal file
@@ -0,0 +1,114 @@
|
||||
# Examples
|
||||
We collected sample kernels for the user to run examples with OSACA.
|
||||
The assembly files contain only the extracted and already marked kernels for code compiled on Intel Cascade Lake (CSX), AMD Zen and Marvell ThunderX2 (TX2), but they can be run on any system supporting the ISA and supported by OSACA.
|
||||
The compilers used were Intel Parallel Studio 19.0up05 and GNU 9.1.0 in the case of the x86 systems, and ARM HPC Compiler for Linux version 19.2 and GNU 8.2.0 for the ARM-based TX2.
|
||||
|
||||
To analyze the kernels with OSACA, run
|
||||
```
|
||||
osaca --arch ARCH FILE
|
||||
```
|
||||
While all Zen and TX2 kernels use the comment-style OSACA markers, the kernels for Intel Cascade Lake (`*.csx.*.s`) use the byte markers so that they can be analyzed by IACA as well.
|
||||
For this use
|
||||
```
|
||||
gcc -c FILE.s
|
||||
iaca -arch SKX FILE.o
|
||||
```
|
||||
|
||||
------------
|
||||
The kernels currently contained in the examples are shown briefly in the following.
|
||||
|
||||
### Copy (`copy/`)
|
||||
```c
|
||||
double * restrict a, * restrict b;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Vector add (`add/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + c[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Vector update (`update/`)
|
||||
```c
|
||||
double * restrict a;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = scale * a[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Sum reduction (`sum_reduction/`)
|
||||
```c
|
||||
double * restrict a;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
scale = scale + a[i];
|
||||
}
|
||||
```
|
||||
For this kernel we noticed an overlap of the loop bodies when using gcc with the `-Ofast` flag (see this [blog post](https://blogs.fau.de/hager/archives/7658) for more information).
|
||||
We therefore additionally compiled all gcc versions with the `-O3` flag.
|
||||
These versions are named accordingly.
|
||||
|
||||
### DAXPY (`daxpy/`)
|
||||
```c
|
||||
double * restrict a, * restrict b;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = a[i] + scale * b[i];
|
||||
}
|
||||
```
|
||||
|
||||
### STREAM triad (`triad/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + scale * c[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Schönauer triad (`striad/`)
|
||||
```c
|
||||
double * restrict a, * restrict b, * restrict c, * restrict d;
|
||||
|
||||
for(long i=0; i < size; ++i){
|
||||
a[i] = b[i] + c[i] * d[i];
|
||||
}
|
||||
```
|
||||
|
||||
### Gauss-Seidel method (`gs/`)
|
||||
```c
|
||||
double ** restrict a;
|
||||
|
||||
for(long k=1; k < size_k-1; ++k){
|
||||
for(long i=1; i < size_i-1; ++i){
|
||||
a[k][i] = scale * (
|
||||
a[k][i-1] + a[k+1][i]
|
||||
+ a[k][i+1] + a[k-1][i]
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Jacobi 2D (`j2d/`)
|
||||
```c
|
||||
double ** restrict a, ** restrict b;
|
||||
|
||||
for(long k=1; k < size_k-1; ++k){
|
||||
for(long i=1; i < size_i-1; ++i){
|
||||
a[k][i] = 0.25 * (
|
||||
b[k][i-1] + b[k+1][i]
|
||||
+ b[k][i+1] + b[k-1][i]
|
||||
);
|
||||
}
|
||||
}
|
||||
```
|
||||
For this kernel we noticed a discrepancy between measurements and predictions, especially when using AVX-512 instructions.
|
||||
We therefore additionally compiled the x86 kernels with AVX/SSE instructions and marked those kernels accordingly.
|
||||
36
examples/add/add.s.csx.gcc.s
Normal file
36
examples/add/add.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r14,%rax), %ymm3
|
||||
vmovupd 32(%r14,%rax), %ymm4
|
||||
vmovupd 64(%r14,%rax), %ymm6
|
||||
vmovupd 96(%r14,%rax), %ymm9
|
||||
vmovupd 128(%r14,%rax), %ymm11
|
||||
vmovupd 160(%r14,%rax), %ymm13
|
||||
vmovupd 192(%r14,%rax), %ymm15
|
||||
vmovupd 224(%r14,%rax), %ymm0
|
||||
vaddpd 0(%r13,%rax), %ymm3, %ymm7
|
||||
vaddpd 32(%r13,%rax), %ymm4, %ymm5
|
||||
vaddpd 64(%r13,%rax), %ymm6, %ymm8
|
||||
vaddpd 96(%r13,%rax), %ymm9, %ymm10
|
||||
vaddpd 128(%r13,%rax), %ymm11, %ymm12
|
||||
vaddpd 160(%r13,%rax), %ymm13, %ymm14
|
||||
vaddpd 192(%r13,%rax), %ymm15, %ymm1
|
||||
vaddpd 224(%r13,%rax), %ymm0, %ymm2
|
||||
vmovupd %ymm7, (%r12,%rax)
|
||||
vmovupd %ymm5, 32(%r12,%rax)
|
||||
vmovupd %ymm8, 64(%r12,%rax)
|
||||
vmovupd %ymm10, 96(%r12,%rax)
|
||||
vmovupd %ymm12, 128(%r12,%rax)
|
||||
vmovupd %ymm14, 160(%r12,%rax)
|
||||
vmovupd %ymm1, 192(%r12,%rax)
|
||||
vmovupd %ymm2, 224(%r12,%rax)
|
||||
addq $256, %rax
|
||||
cmpq %rax, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
19
examples/add/add.s.csx.icc.s
Normal file
19
examples/add/add.s.csx.icc.s
Normal file
@@ -0,0 +1,19 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%rcx,%rax,8), %zmm1 #78.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm3 #78.5
|
||||
vaddpd (%r13,%rax,8), %zmm1, %zmm2 #78.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm4 #78.5
|
||||
vmovupd %zmm2, (%r14,%rax,8) #78.5
|
||||
vmovupd %zmm4, 64(%r14,%rax,8) #78.5
|
||||
addq $16, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
91
examples/add/add.s.tx2.clang.s
Normal file
91
examples/add/add.s.tx2.clang.s
Normal file
@@ -0,0 +1,91 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q4, q5, [x9, #-224]
|
||||
ldp q2, q3, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fadd v2.2d, v2.2d, v0.2d
|
||||
fadd v3.2d, v3.2d, v1.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fadd v0.2d, v6.2d, v4.2d
|
||||
fadd v1.2d, v7.2d, v5.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-160]
|
||||
ldp q6, q7, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fadd v6.2d, v6.2d, v4.2d
|
||||
fadd v7.2d, v7.2d, v5.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
fadd v4.2d, v18.2d, v16.2d
|
||||
fadd v5.2d, v19.2d, v17.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
ldp q16, q17, [x9, #-128]
|
||||
ldp q19, q20, [x9, #-96]
|
||||
ldp q18, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fadd v16.2d, v18.2d, v16.2d
|
||||
fadd v18.2d, v21.2d, v17.2d
|
||||
stp q16, q18, [x11, #-128]
|
||||
fadd v17.2d, v22.2d, v19.2d
|
||||
fadd v19.2d, v23.2d, v20.2d
|
||||
stp q17, q19, [x11, #-96]
|
||||
ldp q20, q21, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q22, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
stp q20, q21, [x11, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v23.2d, v27.2d, v23.2d
|
||||
stp q22, q23, [x11, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q30, q31, [x10, #32]
|
||||
fadd v24.2d, v28.2d, v24.2d
|
||||
fadd v25.2d, v29.2d, v25.2d
|
||||
stp q24, q25, [x11]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q8, q10, [x10, #64]
|
||||
fadd v26.2d, v30.2d, v26.2d
|
||||
fadd v27.2d, v31.2d, v27.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fadd v28.2d, v8.2d, v28.2d
|
||||
fadd v29.2d, v10.2d, v29.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldp q8, q10, [x9, #128]
|
||||
ldp q13, q14, [x10, #128]
|
||||
ldp q3, q0, [x9, #192]
|
||||
ldp q1, q6, [x10, #192]
|
||||
fadd v30.2d, v11.2d, v30.2d
|
||||
fadd v31.2d, v12.2d, v31.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q11, q12, [x9, #160]
|
||||
fadd v8.2d, v13.2d, v8.2d
|
||||
fadd v10.2d, v14.2d, v10.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fadd v1.2d, v1.2d, v3.2d
|
||||
ldp q3, q4, [x9, #224]
|
||||
fadd v0.2d, v6.2d, v0.2d
|
||||
stp q1, q0, [x11, #192]
|
||||
ldp q5, q6, [x10, #224]
|
||||
fadd v11.2d, v13.2d, v11.2d
|
||||
fadd v2.2d, v14.2d, v12.2d
|
||||
stp q11, q2, [x11, #160]
|
||||
fadd v3.2d, v5.2d, v3.2d
|
||||
fadd v4.2d, v6.2d, v4.2d
|
||||
stp q3, q4, [x11, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/add/add.s.tx2.gcc.s
Normal file
45
examples/add/add.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x0, x10, 16
|
||||
ldr q29, [x21, x10]
|
||||
ldr q30, [x20, x10]
|
||||
add x7, x10, 32
|
||||
ldr q31, [x21, x0]
|
||||
ldr q2, [x20, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q5, [x21, x7]
|
||||
ldr q1, [x20, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q4, [x21, x6]
|
||||
ldr q0, [x20, x6]
|
||||
add x2, x10, 112
|
||||
fadd v7.2d, v29.2d, v30.2d
|
||||
ldr q3, [x21, x5]
|
||||
ldr q9, [x20, x5]
|
||||
fadd v6.2d, v31.2d, v2.2d
|
||||
ldr q19, [x21, x4]
|
||||
ldr q18, [x20, x4]
|
||||
fadd v20.2d, v5.2d, v1.2d
|
||||
ldr q21, [x21, x11]
|
||||
ldr q17, [x20, x11]
|
||||
fadd v22.2d, v4.2d, v0.2d
|
||||
ldr q23, [x21, x2]
|
||||
ldr q16, [x20, x2]
|
||||
fadd v24.2d, v3.2d, v9.2d
|
||||
fadd v25.2d, v19.2d, v18.2d
|
||||
fadd v26.2d, v21.2d, v17.2d
|
||||
str q7, [x19, x10]
|
||||
add x10, x10, 128
|
||||
fadd v27.2d, v23.2d, v16.2d
|
||||
str q6, [x19, x0]
|
||||
str q20, [x19, x7]
|
||||
str q22, [x19, x6]
|
||||
str q24, [x19, x5]
|
||||
str q25, [x19, x4]
|
||||
str q26, [x19, x11]
|
||||
str q27, [x19, x2]
|
||||
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/add/add.s.zen.gcc.s
Normal file
30
examples/add/add.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%r13,%rax), %xmm0
|
||||
vmovups 16(%r13,%rax), %xmm3
|
||||
vmovups 32(%r13,%rax), %xmm4
|
||||
vmovups 48(%r13,%rax), %xmm6
|
||||
vmovups 64(%r13,%rax), %xmm9
|
||||
vmovups 80(%r13,%rax), %xmm11
|
||||
vmovups 96(%r13,%rax), %xmm13
|
||||
vmovups 112(%r13,%rax), %xmm15
|
||||
vaddpd (%r12,%rax), %xmm0, %xmm7
|
||||
vaddpd 16(%r12,%rax), %xmm3, %xmm2
|
||||
vaddpd 32(%r12,%rax), %xmm4, %xmm5
|
||||
vaddpd 48(%r12,%rax), %xmm6, %xmm8
|
||||
vaddpd 64(%r12,%rax), %xmm9, %xmm10
|
||||
vaddpd 80(%r12,%rax), %xmm11, %xmm12
|
||||
vaddpd 96(%r12,%rax), %xmm13, %xmm14
|
||||
vaddpd 112(%r12,%rax), %xmm15, %xmm1
|
||||
vmovups %xmm7, 0(%rbp,%rax)
|
||||
vmovups %xmm2, 16(%rbp,%rax)
|
||||
vmovups %xmm5, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm1, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/copy/copy.s.csx.gcc.s
Normal file
28
examples/copy/copy.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r12,%rcx), %ymm10
|
||||
vmovupd 32(%r12,%rcx), %ymm11
|
||||
vmovupd 64(%r12,%rcx), %ymm12
|
||||
vmovupd 96(%r12,%rcx), %ymm13
|
||||
vmovupd 128(%r12,%rcx), %ymm14
|
||||
vmovupd 160(%r12,%rcx), %ymm15
|
||||
vmovupd 192(%r12,%rcx), %ymm0
|
||||
vmovupd 224(%r12,%rcx), %ymm1
|
||||
vmovupd %ymm10, 0(%r13,%rcx)
|
||||
vmovupd %ymm11, 32(%r13,%rcx)
|
||||
vmovupd %ymm12, 64(%r13,%rcx)
|
||||
vmovupd %ymm13, 96(%r13,%rcx)
|
||||
vmovupd %ymm14, 128(%r13,%rcx)
|
||||
vmovupd %ymm15, 160(%r13,%rcx)
|
||||
vmovupd %ymm0, 192(%r13,%rcx)
|
||||
vmovupd %ymm1, 224(%r13,%rcx)
|
||||
addq $256, %rcx
|
||||
cmpq %rcx, %r10
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/copy/copy.s.csx.icc.s
Normal file
15
examples/copy/copy.s.csx.icc.s
Normal file
@@ -0,0 +1,15 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r14,%rax,8), %zmm1 #79.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #79.5
|
||||
addq $8, %rax #79.5
|
||||
cmpq %r12, %rax #79.5
|
||||
jb ..B1.39 # Prob 82% #79.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
42
examples/copy/copy.s.tx2.clang.s
Normal file
42
examples/copy/copy.s.tx2.clang.s
Normal file
@@ -0,0 +1,42 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
stp q0, q1, [x10, #-256]
|
||||
stp q2, q3, [x10, #-224]
|
||||
add x8, x8, #64 // =64
|
||||
ldp q0, q1, [x9]
|
||||
ldp q2, q3, [x9, #32]
|
||||
stp q0, q1, [x10]
|
||||
stp q2, q3, [x10, #32]
|
||||
ldp q0, q1, [x9, #-192]
|
||||
ldp q2, q3, [x9, #-160]
|
||||
stp q0, q1, [x10, #-192]
|
||||
stp q2, q3, [x10, #-160]
|
||||
ldp q0, q1, [x9, #64]
|
||||
ldp q2, q3, [x9, #96]
|
||||
stp q0, q1, [x10, #64]
|
||||
stp q2, q3, [x10, #96]
|
||||
ldp q0, q1, [x9, #-128]
|
||||
ldp q2, q3, [x9, #-96]
|
||||
stp q0, q1, [x10, #-128]
|
||||
stp q2, q3, [x10, #-96]
|
||||
ldp q0, q1, [x9, #128]
|
||||
ldp q2, q3, [x9, #160]
|
||||
stp q0, q1, [x10, #128]
|
||||
stp q2, q3, [x10, #160]
|
||||
ldp q0, q1, [x9, #-64]
|
||||
ldp q2, q3, [x9, #-32]
|
||||
stp q0, q1, [x10, #-64]
|
||||
stp q2, q3, [x10, #-32]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
add x9, x9, #512 // =512
|
||||
stp q0, q1, [x10, #192]
|
||||
stp q2, q3, [x10, #224]
|
||||
add x10, x10, #512 // =512
|
||||
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
29
examples/copy/copy.s.tx2.gcc.s
Normal file
29
examples/copy/copy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,29 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x16, x15, 16
|
||||
ldr q9, [x19, x15]
|
||||
add x30, x15, 32
|
||||
add x17, x15, 48
|
||||
ldr q16, [x19, x16]
|
||||
ldr q18, [x19, x30]
|
||||
add x18, x15, 64
|
||||
add x1, x15, 80
|
||||
ldr q17, [x19, x17]
|
||||
ldr q19, [x19, x18]
|
||||
add x3, x15, 96
|
||||
add x2, x15, 112
|
||||
ldr q20, [x19, x1]
|
||||
ldr q21, [x19, x3]
|
||||
str q9, [x20, x15]
|
||||
ldr q22, [x19, x2]
|
||||
add x15, x15, 128
|
||||
str q16, [x20, x16]
|
||||
str q18, [x20, x30]
|
||||
str q17, [x20, x17]
|
||||
str q19, [x20, x18]
|
||||
str q20, [x20, x1]
|
||||
str q21, [x20, x3]
|
||||
str q22, [x20, x2]
|
||||
cmp x23, x15
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/copy/copy.s.zen.gcc.s
Normal file
22
examples/copy/copy.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%rbp,%r10), %xmm9
|
||||
vmovups 16(%rbp,%r10), %xmm10
|
||||
vmovups 32(%rbp,%r10), %xmm11
|
||||
vmovups 48(%rbp,%r10), %xmm12
|
||||
vmovups 64(%rbp,%r10), %xmm13
|
||||
vmovups 80(%rbp,%r10), %xmm14
|
||||
vmovups 96(%rbp,%r10), %xmm15
|
||||
vmovups 112(%rbp,%r10), %xmm0
|
||||
vmovups %xmm9, (%r12,%r10)
|
||||
vmovups %xmm10, 16(%r12,%r10)
|
||||
vmovups %xmm11, 32(%r12,%r10)
|
||||
vmovups %xmm12, 48(%r12,%r10)
|
||||
vmovups %xmm13, 64(%r12,%r10)
|
||||
vmovups %xmm14, 80(%r12,%r10)
|
||||
vmovups %xmm15, 96(%r12,%r10)
|
||||
vmovups %xmm0, 112(%r12,%r10)
|
||||
subq $-128, %r10
|
||||
cmpq %r10, %r15
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
36
examples/daxpy/daxpy.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd 0(%r13,%rsi), %ymm14
|
||||
vmovupd 32(%r13,%rsi), %ymm15
|
||||
vmovupd 64(%r13,%rsi), %ymm1
|
||||
vmovupd 96(%r13,%rsi), %ymm0
|
||||
vmovupd 128(%r13,%rsi), %ymm3
|
||||
vmovupd 160(%r13,%rsi), %ymm4
|
||||
vmovupd 192(%r13,%rsi), %ymm5
|
||||
vmovupd 224(%r13,%rsi), %ymm7
|
||||
vfmadd213pd (%r12,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r12,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r12,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r12,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r12,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r12,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r12,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r12,%rsi), %ymm6, %ymm7
|
||||
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
addq $256, %rsi
|
||||
cmpq %rsi, %r10
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
16
examples/daxpy/daxpy.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.39: # Preds ..B1.39 ..B1.38
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r13,%rax,8), %zmm1 #77.5
|
||||
vfmadd213pd (%r14,%rax,8), %zmm2, %zmm1 #77.5
|
||||
vmovupd %zmm1, (%r14,%rax,8) #77.5
|
||||
addq $8, %rax #77.5
|
||||
cmpq %rbx, %rax #77.5
|
||||
jb ..B1.39 # Prob 82% #77.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
90
examples/daxpy/daxpy.s.tx2.clang.s
Normal file
@@ -0,0 +1,90 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q1, q2, [x9, #-256]
|
||||
ldp q3, q0, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v1.2d, v4.2d, v31.2d
|
||||
fmla v2.2d, v5.2d, v31.2d
|
||||
stp q1, q2, [x9, #-256]
|
||||
fmla v3.2d, v6.2d, v31.2d
|
||||
fmla v0.2d, v7.2d, v31.2d
|
||||
stp q3, q0, [x9, #-224]
|
||||
ldp q5, q6, [x9, #-192]
|
||||
ldp q7, q4, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v5.2d, v16.2d, v31.2d
|
||||
fmla v6.2d, v17.2d, v31.2d
|
||||
stp q5, q6, [x9, #-192]
|
||||
fmla v7.2d, v18.2d, v31.2d
|
||||
fmla v4.2d, v19.2d, v31.2d
|
||||
stp q7, q4, [x9, #-160]
|
||||
ldp q19, q18, [x9, #-128]
|
||||
ldp q16, q17, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v18.2d, v21.2d, v31.2d
|
||||
fmla v16.2d, v22.2d, v31.2d
|
||||
ldp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v19.2d, v20.2d, v31.2d
|
||||
stp q19, q18, [x9, #-128]
|
||||
fmla v17.2d, v23.2d, v31.2d
|
||||
stp q16, q17, [x9, #-96]
|
||||
ldp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v21.2d, v24.2d, v31.2d
|
||||
fmla v22.2d, v25.2d, v31.2d
|
||||
stp q21, q22, [x9, #-64]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q28, q29, [x10]
|
||||
fmla v23.2d, v26.2d, v31.2d
|
||||
fmla v20.2d, v27.2d, v31.2d
|
||||
stp q23, q20, [x9, #-32]
|
||||
ldp q26, q27, [x9, #32]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9]
|
||||
ldp q28, q29, [x10, #32]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #32]
|
||||
ldp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #64]
|
||||
ldp q26, q27, [x9, #96]
|
||||
fmla v24.2d, v28.2d, v31.2d
|
||||
fmla v25.2d, v29.2d, v31.2d
|
||||
stp q24, q25, [x9, #64]
|
||||
ldp q28, q29, [x10, #96]
|
||||
fmla v26.2d, v28.2d, v31.2d
|
||||
fmla v27.2d, v29.2d, v31.2d
|
||||
stp q26, q27, [x9, #96]
|
||||
ldp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x10, #128]
|
||||
fmla v24.2d, v26.2d, v31.2d
|
||||
fmla v25.2d, v27.2d, v31.2d
|
||||
stp q24, q25, [x9, #128]
|
||||
ldp q26, q27, [x9, #160]
|
||||
ldp q1, q2, [x10, #160]
|
||||
fmla v26.2d, v1.2d, v31.2d
|
||||
fmla v27.2d, v2.2d, v31.2d
|
||||
stp q26, q27, [x9, #160]
|
||||
ldp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x10, #192]
|
||||
fmla v0.2d, v2.2d, v31.2d
|
||||
fmla v1.2d, v3.2d, v31.2d
|
||||
stp q0, q1, [x9, #192]
|
||||
ldp q2, q3, [x9, #224]
|
||||
ldp q4, q5, [x10, #224]
|
||||
fmla v2.2d, v4.2d, v31.2d
|
||||
fmla v3.2d, v5.2d, v31.2d
|
||||
stp q2, q3, [x9, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x11, x11, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
41
examples/daxpy/daxpy.s.tx2.gcc.s
Normal file
@@ -0,0 +1,41 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x5, x3
|
||||
ldr q23, [x10]
|
||||
ldr q24, [x5], 16
|
||||
mov x6, x10
|
||||
ldr q25, [x3, 16]
|
||||
ldr q26, [x3, 48]
|
||||
add x10, x10, 128
|
||||
add x3, x3, 128
|
||||
ldr q27, [x3, -64]
|
||||
ldr q28, [x3, -48]
|
||||
ldr q29, [x3, -32]
|
||||
ldr q30, [x3, -16]
|
||||
fmla v23.2d, v3.2d, v24.2d
|
||||
ldr q31, [x5, 16]
|
||||
str q23, [x6], 16
|
||||
ldr q0, [x10, -112]
|
||||
fmla v0.2d, v3.2d, v25.2d
|
||||
str q0, [x10, -112]
|
||||
ldr q2, [x6, 16]
|
||||
fmla v2.2d, v3.2d, v31.2d
|
||||
str q2, [x6, 16]
|
||||
ldr q5, [x10, -80]
|
||||
ldr q4, [x10, -64]
|
||||
ldr q6, [x10, -48]
|
||||
ldr q1, [x10, -32]
|
||||
ldr q7, [x10, -16]
|
||||
fmla v5.2d, v3.2d, v26.2d
|
||||
fmla v4.2d, v3.2d, v27.2d
|
||||
fmla v6.2d, v3.2d, v28.2d
|
||||
fmla v1.2d, v3.2d, v29.2d
|
||||
fmla v7.2d, v3.2d, v30.2d
|
||||
str q5, [x10, -80]
|
||||
str q4, [x10, -64]
|
||||
str q6, [x10, -48]
|
||||
str q1, [x10, -32]
|
||||
str q7, [x10, -16]
|
||||
cmp x23, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
30
examples/daxpy/daxpy.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups (%r12,%rax), %xmm12
|
||||
vmovups 16(%r12,%rax), %xmm13
|
||||
vmovups 32(%r12,%rax), %xmm14
|
||||
vmovups 48(%r12,%rax), %xmm15
|
||||
vmovups 64(%r12,%rax), %xmm1
|
||||
vmovups 80(%r12,%rax), %xmm0
|
||||
vmovups 96(%r12,%rax), %xmm4
|
||||
vmovups 112(%r12,%rax), %xmm5
|
||||
vfmadd213pd 0(%rbp,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%rbp,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%rbp,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%rbp,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%rbp,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%rbp,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%rbp,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%rbp,%rax), %xmm3, %xmm5
|
||||
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %r15, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
67
examples/gs/gs.s.csx.gcc.s
Normal file
67
examples/gs/gs.s.csx.gcc.s
Normal file
@@ -0,0 +1,67 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L31:
|
||||
vmovsd (%rax,%rsi,8), %xmm7
|
||||
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 8(%rax), %xmm7, %xmm10
|
||||
leaq 8(%rax), %rdx
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, (%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd 16(%rax), %xmm14, %xmm15
|
||||
leaq 16(%rax), %rdx
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 8(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
vaddsd 24(%rax), %xmm2, %xmm4
|
||||
leaq 24(%rax), %rdx
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 32(%rax), %xmm7, %xmm10
|
||||
leaq 32(%rax), %rdx
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, 24(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd 40(%rax), %xmm14, %xmm15
|
||||
leaq 40(%rax), %rdx
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 32(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
vaddsd 48(%rax), %xmm2, %xmm4
|
||||
leaq 48(%rax), %rdx
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 40(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 56(%rax), %xmm7, %xmm10
|
||||
leaq 56(%rax), %rdx
|
||||
addq $64, %rax
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, -16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd (%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm8
|
||||
vmovsd %xmm8, -8(%rax)
|
||||
cmpq %r8, %rax
|
||||
jne .L31
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
36
examples/gs/gs.s.csx.icc.s
Normal file
36
examples/gs/gs.s.csx.icc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.58: # Preds ..B1.58 ..B1.57
|
||||
# Execution count [9.36e+01]
|
||||
vmovsd 8(%r11,%r10), %xmm2 #55.35
|
||||
incq %r15 #54.9
|
||||
vaddsd 16(%r11,%r12), %xmm2, %xmm3 #55.12
|
||||
vaddsd 8(%r11,%rbx), %xmm3, %xmm4 #55.12
|
||||
vaddsd %xmm1, %xmm4, %xmm1 #55.12
|
||||
vmulsd %xmm1, %xmm0, %xmm5 #55.12
|
||||
vmovsd %xmm5, 8(%r11,%r12) #55.12
|
||||
vaddsd 16(%r11,%r10), %xmm5, %xmm6 #55.48
|
||||
vaddsd 24(%r11,%r12), %xmm6, %xmm7 #55.63
|
||||
vaddsd 16(%r11,%rbx), %xmm7, %xmm8 #55.79
|
||||
vmulsd %xmm8, %xmm0, %xmm9 #55.12
|
||||
vmovsd %xmm9, 16(%r11,%r12) #55.12
|
||||
vaddsd 24(%r11,%r10), %xmm9, %xmm10 #55.48
|
||||
vaddsd 32(%r11,%r12), %xmm10, %xmm11 #55.63
|
||||
vaddsd 24(%r11,%rbx), %xmm11, %xmm12 #55.79
|
||||
vmulsd %xmm12, %xmm0, %xmm13 #55.12
|
||||
vmovsd %xmm13, 24(%r11,%r12) #55.12
|
||||
vaddsd 32(%r11,%r10), %xmm13, %xmm14 #55.48
|
||||
vaddsd 40(%r11,%r12), %xmm14, %xmm15 #55.63
|
||||
vaddsd 32(%r11,%rbx), %xmm15, %xmm16 #55.79
|
||||
vmulsd %xmm16, %xmm0, %xmm1 #55.12
|
||||
vmovsd %xmm1, 32(%r11,%r12) #55.12
|
||||
addq $32, %r11 #54.9
|
||||
cmpq %r14, %r15 #54.9
|
||||
jb ..B1.58 # Prob 28% #54.9
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
|
||||
19
examples/gs/gs.s.tx2.clang.s
Normal file
19
examples/gs/gs.s.tx2.clang.s
Normal file
@@ -0,0 +1,19 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB0_62: // %L.LB1_398.1
|
||||
// Parent Loop BB0_50 Depth=1
|
||||
// Parent Loop BB0_55 Depth=2
|
||||
// Parent Loop BB0_59 Depth=3
|
||||
// => This Inner Loop Header: Depth=4
|
||||
ldr d1, [x7], #8
|
||||
fadd d0, d1, d0
|
||||
ldr d2, [x22]
|
||||
ldr d3, [x23], #8
|
||||
fadd d2, d2, d3
|
||||
fadd d0, d0, d2
|
||||
sub w26, w26, #1 // =1
|
||||
fmul d0, d0, d9
|
||||
stur d0, [x22, #-8]
|
||||
add x22, x22, #8 // =8
|
||||
cmp w26, #2 // =2
|
||||
b.gt .LBB0_62
|
||||
// OSACA-END
|
||||
41
examples/gs/gs.s.tx2.gcc.s
Normal file
41
examples/gs/gs.s.tx2.gcc.s
Normal file
@@ -0,0 +1,41 @@
|
||||
// OSACA-BEGIN
|
||||
.L20:
|
||||
ldr d31, [x15, x18, lsl 3]
|
||||
ldr d0, [x15, 8]
|
||||
mov x14, x15
|
||||
add x16, x15, 24
|
||||
ldr d2, [x15, x30, lsl 3]
|
||||
add x15, x15, 32
|
||||
fadd d1, d31, d0
|
||||
fadd d3, d1, d30
|
||||
fadd d4, d3, d2
|
||||
fmul d5, d4, d9
|
||||
str d5, [x14], 8
|
||||
ldr d6, [x14, x18, lsl 3]
|
||||
ldr d16, [x14, 8]
|
||||
add x13, x14, 8
|
||||
ldr d7, [x14, x30, lsl 3]
|
||||
fadd d17, d6, d16
|
||||
fadd d18, d17, d5
|
||||
fadd d19, d18, d7
|
||||
fmul d20, d19, d9
|
||||
str d20, [x15, -24]
|
||||
ldr d21, [x13, x18, lsl 3]
|
||||
ldr d23, [x14, 16]
|
||||
ldr d22, [x13, x30, lsl 3]
|
||||
fadd d24, d21, d23
|
||||
fadd d25, d24, d20
|
||||
fadd d26, d25, d22
|
||||
fmul d27, d26, d9
|
||||
str d27, [x14, 8]
|
||||
ldr d30, [x15]
|
||||
ldr d28, [x16, x18, lsl 3]
|
||||
ldr d29, [x16, x30, lsl 3]
|
||||
fadd d31, d28, d30
|
||||
fadd d2, d31, d27
|
||||
fadd d0, d2, d29
|
||||
fmul d30, d0, d9
|
||||
str d30, [x15, -8]
|
||||
cmp x7, x15
|
||||
bne .L20
|
||||
// OSACA-END
|
||||
61
examples/gs/gs.s.zen.gcc.s
Normal file
61
examples/gs/gs.s.zen.gcc.s
Normal file
@@ -0,0 +1,61 @@
|
||||
# OSACA-BEGIN
|
||||
.L32:
|
||||
vmovsd (%rax,%rsi,8), %xmm7
|
||||
leaq 8(%rax), %rdx
|
||||
vaddsd (%rax,%rcx,8), %xmm8, %xmm11
|
||||
vaddsd 8(%rax), %xmm7, %xmm10
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, (%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
leaq 16(%rax), %rdx
|
||||
vaddsd 16(%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 8(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
leaq 24(%rax), %rdx
|
||||
vaddsd 24(%rax), %xmm2, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
leaq 32(%rax), %rdx
|
||||
vaddsd 32(%rax), %xmm7, %xmm10
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, 24(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
leaq 40(%rax), %rdx
|
||||
vaddsd 40(%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm3
|
||||
vmovsd %xmm3, 32(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm2
|
||||
vaddsd (%rdx,%rcx,8), %xmm3, %xmm5
|
||||
leaq 48(%rax), %rdx
|
||||
vaddsd 48(%rax), %xmm2, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm6
|
||||
vmulsd %xmm9, %xmm6, %xmm8
|
||||
vmovsd %xmm8, 40(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm7
|
||||
vaddsd (%rdx,%rcx,8), %xmm8, %xmm11
|
||||
leaq 56(%rax), %rdx
|
||||
vaddsd 56(%rax), %xmm7, %xmm10
|
||||
addq $64, %rax
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmulsd %xmm9, %xmm12, %xmm13
|
||||
vmovsd %xmm13, -16(%rax)
|
||||
vmovsd (%rdx,%rsi,8), %xmm14
|
||||
vaddsd (%rdx,%rcx,8), %xmm13, %xmm1
|
||||
vaddsd (%rax), %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm0
|
||||
vmulsd %xmm9, %xmm0, %xmm8
|
||||
vmovsd %xmm8, -8(%rax)
|
||||
cmpq %r8, %rax
|
||||
jne .L32
|
||||
# OSACA-END
|
||||
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
40
examples/j2d/j2d.s.csx.gcc.AVX.s
Normal file
@@ -0,0 +1,40 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L21:
|
||||
vmovupd (%r8,%rax), %ymm11
|
||||
vmovupd (%rsi,%rax), %ymm13
|
||||
vaddpd (%r9,%rax), %ymm11, %ymm12
|
||||
vaddpd (%rdi,%rax), %ymm13, %ymm14
|
||||
vmovupd 32(%r8,%rax), %ymm1
|
||||
vmovupd 32(%rsi,%rax), %ymm2
|
||||
vaddpd %ymm14, %ymm12, %ymm15
|
||||
vaddpd 32(%r9,%rax), %ymm1, %ymm5
|
||||
vaddpd 32(%rdi,%rax), %ymm2, %ymm7
|
||||
vmulpd %ymm8, %ymm15, %ymm0
|
||||
vmovupd 64(%r8,%rax), %ymm10
|
||||
vaddpd %ymm7, %ymm5, %ymm6
|
||||
vmovupd 64(%rsi,%rax), %ymm12
|
||||
vmovupd 96(%rsi,%rax), %ymm5
|
||||
vmovupd %ymm0, (%rdx,%rax)
|
||||
vmovupd 96(%r8,%rax), %ymm0
|
||||
vaddpd 64(%r9,%rax), %ymm10, %ymm11
|
||||
vaddpd 64(%rdi,%rax), %ymm12, %ymm13
|
||||
vaddpd 96(%r9,%rax), %ymm0, %ymm1
|
||||
vaddpd 96(%rdi,%rax), %ymm5, %ymm2
|
||||
vaddpd %ymm13, %ymm11, %ymm14
|
||||
vmulpd %ymm8, %ymm6, %ymm9
|
||||
vaddpd %ymm2, %ymm1, %ymm7
|
||||
vmulpd %ymm8, %ymm14, %ymm15
|
||||
vmulpd %ymm8, %ymm7, %ymm6
|
||||
vmovupd %ymm9, 32(%rdx,%rax)
|
||||
vmovupd %ymm15, 64(%rdx,%rax)
|
||||
vmovupd %ymm6, 96(%rdx,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rax, %r15
|
||||
jne .L21
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
46
examples/j2d/j2d.s.csx.gcc.SSE.s
Normal file
@@ -0,0 +1,46 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L28:
|
||||
movupd 16(%r8,%rax), %xmm11
|
||||
movupd 16(%rdi,%rax), %xmm12
|
||||
movupd 16(%rsi,%rax), %xmm13
|
||||
addpd %xmm11, %xmm15
|
||||
addpd %xmm13, %xmm12
|
||||
movupd 32(%rdi,%rax), %xmm14
|
||||
movupd 32(%rsi,%rax), %xmm0
|
||||
addpd %xmm15, %xmm12
|
||||
movupd 32(%r8,%rax), %xmm15
|
||||
addpd %xmm0, %xmm14
|
||||
addpd %xmm15, %xmm11
|
||||
movupd 48(%rdi,%rax), %xmm1
|
||||
movupd 48(%rsi,%rax), %xmm7
|
||||
addpd %xmm11, %xmm14
|
||||
addpd %xmm7, %xmm1
|
||||
mulpd %xmm2, %xmm12
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm12, 16(%rcx,%rax)
|
||||
movups %xmm14, 32(%rcx,%rax)
|
||||
movupd 48(%r8,%rax), %xmm14
|
||||
addpd %xmm14, %xmm15
|
||||
addpd %xmm15, %xmm1
|
||||
mulpd %xmm2, %xmm1
|
||||
movups %xmm1, 48(%rcx,%rax)
|
||||
addq $64, %rax
|
||||
.L21:
|
||||
movupd (%r8,%rax), %xmm15
|
||||
movupd (%rdi,%rax), %xmm0
|
||||
movupd (%rsi,%rax), %xmm1
|
||||
addpd %xmm15, %xmm14
|
||||
addpd %xmm1, %xmm0
|
||||
leaq 16(%rax), %r10
|
||||
addpd %xmm0, %xmm14
|
||||
mulpd %xmm2, %xmm14
|
||||
movups %xmm14, (%rcx,%rax)
|
||||
cmpq %r10, %r14
|
||||
jne .L28
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
37
examples/j2d/j2d.s.csx.icc.AVX.s
Normal file
@@ -0,0 +1,37 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.47 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
vmovupd 10016(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmovupd 10048(%r8,%rcx,8), %ymm6 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %ymm11 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vaddpd 48(%r12,%rcx,8), %ymm6, %ymm7 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %ymm11, %ymm12 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 20064(%r10,%rcx,8), %ymm7, %ymm8 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %ymm12, %ymm13 #94.5
|
||||
vaddpd 10032(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vaddpd 10064(%r8,%rcx,8), %ymm8, %ymm9 #94.5
|
||||
vaddpd 10096(%r8,%rcx,8), %ymm13, %ymm14 #94.5
|
||||
vmovupd 10112(%r8,%rcx,8), %ymm1 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmulpd %ymm9, %ymm0, %ymm10 #94.5
|
||||
vmulpd %ymm14, %ymm0, %ymm15 #94.5
|
||||
vaddpd 112(%r12,%rcx,8), %ymm1, %ymm2 #94.5
|
||||
vmovupd %ymm5, 10016(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm10, 10048(%r9,%rcx,8) #94.5
|
||||
vmovupd %ymm15, 10080(%r9,%rcx,8) #94.5
|
||||
vaddpd 20128(%r10,%rcx,8), %ymm2, %ymm3 #94.5
|
||||
vaddpd 10128(%r8,%rcx,8), %ymm3, %ymm4 #94.5
|
||||
vmulpd %ymm4, %ymm0, %ymm5 #94.5
|
||||
vmovupd %ymm5, 10112(%r9,%rcx,8) #94.5
|
||||
addq $16, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
69
examples/j2d/j2d.s.csx.icc.AVX512.s
Normal file
@@ -0,0 +1,69 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.47: # Preds ..B1.63 ..B1.46
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.48: # Preds ..B1.47
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10032(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10016(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.51: # Preds ..B1.48
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 16(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20032(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10016(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.52: # Preds ..B1.51
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10096(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10080(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.55: # Preds ..B1.52
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 80(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20096(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10080(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.56: # Preds ..B1.55
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10160(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10144(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.59: # Preds ..B1.56
|
||||
# Execution count [1.15e+04]
|
||||
lea (%r12,%r11), %r8 #94.5
|
||||
vaddpd 144(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20160(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10144(%r9,%rcx,8) #94.5
|
||||
# LOE rcx rbx r8 r9 r10 r11 r12 r14 r13d r15d zmm4
|
||||
..B1.60: # Preds ..B1.59
|
||||
# Execution count [1.73e+04]
|
||||
vmovupd 10224(%r8,%rcx,8), %zmm2 #94.5
|
||||
vmovupd 10208(%r8,%rcx,8), %zmm0 #94.5
|
||||
# LOE rcx rbx r9 r10 r11 r12 r14 r13d r15d zmm0 zmm2 zmm4
|
||||
..B1.63: # Preds ..B1.60
|
||||
# Execution count [1.15e+04]
|
||||
vaddpd 208(%r12,%rcx,8), %zmm0, %zmm0 #94.5
|
||||
vaddpd 20224(%r10,%rcx,8), %zmm0, %zmm1 #94.5
|
||||
vaddpd %zmm2, %zmm1, %zmm2 #94.5
|
||||
vmulpd %zmm2, %zmm4, %zmm3 #94.5
|
||||
vmovupd %zmm3, 10208(%r9,%rcx,8) #94.5
|
||||
addq $32, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.47 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
40
examples/j2d/j2d.s.csx.icc.SSE.s
Normal file
@@ -0,0 +1,40 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.42: # Preds ..B1.42 ..B1.41
|
||||
# Execution count [1.15e+04]
|
||||
movups 10016(%r8,%rcx,8), %xmm0 #94.5
|
||||
addpd 16(%r12,%rcx,8), %xmm0 #94.5
|
||||
addpd 20032(%r10,%rcx,8), %xmm0 #94.5
|
||||
movups 10032(%r8,%rcx,8), %xmm2 #94.5
|
||||
movups 32(%r12,%rcx,8), %xmm1 #94.5
|
||||
addpd %xmm2, %xmm0 #94.5
|
||||
addpd %xmm1, %xmm2 #94.5
|
||||
mulpd %xmm7, %xmm0 #94.5
|
||||
addpd 20048(%r10,%rcx,8), %xmm2 #94.5
|
||||
movups 10048(%r8,%rcx,8), %xmm4 #94.5
|
||||
movups 48(%r12,%rcx,8), %xmm3 #94.5
|
||||
addpd %xmm4, %xmm2 #94.5
|
||||
addpd %xmm3, %xmm4 #94.5
|
||||
mulpd %xmm7, %xmm2 #94.5
|
||||
addpd 20064(%r10,%rcx,8), %xmm4 #94.5
|
||||
movups 10064(%r8,%rcx,8), %xmm6 #94.5
|
||||
movups 64(%r12,%rcx,8), %xmm5 #94.5
|
||||
addpd %xmm6, %xmm4 #94.5
|
||||
addpd %xmm5, %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm4 #94.5
|
||||
addpd 20080(%r10,%rcx,8), %xmm6 #94.5
|
||||
addpd 10080(%r8,%rcx,8), %xmm6 #94.5
|
||||
mulpd %xmm7, %xmm6 #94.5
|
||||
movups %xmm0, 10016(%r9,%rcx,8) #94.5
|
||||
movups %xmm2, 10032(%r9,%rcx,8) #94.5
|
||||
movups %xmm4, 10048(%r9,%rcx,8) #94.5
|
||||
movups %xmm6, 10064(%r9,%rcx,8) #94.5
|
||||
addq $8, %rcx #94.5
|
||||
cmpq %r14, %rcx #94.5
|
||||
jb ..B1.42 # Prob 82% #94.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
131
examples/j2d/j2d.s.tx2.clang.s
Normal file
@@ -0,0 +1,131 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_16 Depth=1
|
||||
// Parent Loop BB1_19 Depth=2
|
||||
// Parent Loop BB1_24 Depth=3
|
||||
// => This Inner Loop Header: Depth=4
|
||||
add x0, x5, x16
|
||||
add x18, x21, x16
|
||||
ldp q4, q5, [x0, #16]
|
||||
ldp q6, q7, [x0, #48]
|
||||
ldur q0, [x18, #8]
|
||||
ldur q1, [x18, #24]
|
||||
ldur q2, [x18, #40]
|
||||
ldur q3, [x18, #56]
|
||||
add x1, x28, x16
|
||||
add x15, x15, #32 // =32
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v4.2d, v5.2d, v1.2d
|
||||
fadd v5.2d, v6.2d, v2.2d
|
||||
fadd v6.2d, v7.2d, v3.2d
|
||||
ldp q7, q16, [x1, #16]
|
||||
fadd v1.2d, v7.2d, v1.2d
|
||||
ldp q17, q18, [x1, #48]
|
||||
ldur q19, [x18, #72]
|
||||
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v1.2d, v16.2d, v2.2d
|
||||
fadd v2.2d, v17.2d, v3.2d
|
||||
fadd v3.2d, v18.2d, v19.2d
|
||||
ldp q16, q17, [x0, #80]
|
||||
ldp q18, q19, [x0, #112]
|
||||
fadd v1.2d, v4.2d, v1.2d
|
||||
fadd v2.2d, v5.2d, v2.2d
|
||||
fadd v3.2d, v6.2d, v3.2d
|
||||
ldur q4, [x18, #72]
|
||||
ldur q5, [x18, #88]
|
||||
ldur q6, [x18, #104]
|
||||
ldur q7, [x18, #120]
|
||||
fadd v4.2d, v16.2d, v4.2d
|
||||
fadd v16.2d, v17.2d, v5.2d
|
||||
fadd v17.2d, v18.2d, v6.2d
|
||||
fadd v18.2d, v19.2d, v7.2d
|
||||
ldp q19, q20, [x1, #80]
|
||||
fadd v5.2d, v19.2d, v5.2d
|
||||
ldp q21, q22, [x1, #112]
|
||||
ldur q23, [x18, #136]
|
||||
fadd v4.2d, v4.2d, v5.2d
|
||||
fadd v5.2d, v20.2d, v6.2d
|
||||
fadd v6.2d, v21.2d, v7.2d
|
||||
fadd v7.2d, v22.2d, v23.2d
|
||||
ldp q20, q21, [x0, #144]
|
||||
ldp q22, q23, [x0, #176]
|
||||
fadd v5.2d, v16.2d, v5.2d
|
||||
fadd v6.2d, v17.2d, v6.2d
|
||||
fadd v7.2d, v18.2d, v7.2d
|
||||
ldur q16, [x18, #136]
|
||||
ldur q17, [x18, #152]
|
||||
ldur q18, [x18, #168]
|
||||
ldur q19, [x18, #184]
|
||||
fadd v16.2d, v20.2d, v16.2d
|
||||
fadd v20.2d, v21.2d, v17.2d
|
||||
fadd v21.2d, v22.2d, v18.2d
|
||||
fadd v22.2d, v23.2d, v19.2d
|
||||
ldp q23, q24, [x1, #144]
|
||||
fadd v17.2d, v23.2d, v17.2d
|
||||
ldp q25, q26, [x1, #176]
|
||||
fadd v16.2d, v16.2d, v17.2d
|
||||
fadd v17.2d, v24.2d, v18.2d
|
||||
fadd v18.2d, v25.2d, v19.2d
|
||||
ldp q24, q25, [x0, #208]
|
||||
ldur q23, [x18, #200]
|
||||
fadd v17.2d, v20.2d, v17.2d
|
||||
fadd v18.2d, v21.2d, v18.2d
|
||||
ldur q20, [x18, #200]
|
||||
ldur q21, [x18, #216]
|
||||
fadd v19.2d, v26.2d, v23.2d
|
||||
fadd v20.2d, v24.2d, v20.2d
|
||||
fadd v24.2d, v25.2d, v21.2d
|
||||
ldp q25, q26, [x1, #208]
|
||||
fadd v21.2d, v25.2d, v21.2d
|
||||
fadd v20.2d, v20.2d, v21.2d
|
||||
ldp q21, q25, [x0, #240]
|
||||
fadd v19.2d, v22.2d, v19.2d
|
||||
ldur q22, [x18, #232]
|
||||
fadd v21.2d, v21.2d, v22.2d
|
||||
fadd v22.2d, v26.2d, v22.2d
|
||||
fadd v22.2d, v24.2d, v22.2d
|
||||
ldp q24, q26, [x1, #240]
|
||||
ldur q23, [x18, #248]
|
||||
fadd v25.2d, v25.2d, v23.2d
|
||||
fadd v23.2d, v24.2d, v23.2d
|
||||
add x18, x18, #264 // =264
|
||||
fmul v0.2d, v0.2d, v28.2d
|
||||
fmul v1.2d, v1.2d, v28.2d
|
||||
fmul v2.2d, v2.2d, v28.2d
|
||||
fmul v5.2d, v5.2d, v28.2d
|
||||
fadd v21.2d, v21.2d, v23.2d
|
||||
ldr q23, [x18]
|
||||
add x18, x25, x16
|
||||
stur q0, [x18, #8]
|
||||
stur q1, [x18, #24]
|
||||
fmul v3.2d, v3.2d, v28.2d
|
||||
stur q2, [x18, #40]
|
||||
fadd v23.2d, v26.2d, v23.2d
|
||||
stur q5, [x18, #88]
|
||||
fmul v4.2d, v4.2d, v28.2d
|
||||
stur q3, [x18, #56]
|
||||
fmul v6.2d, v6.2d, v28.2d
|
||||
stur q4, [x18, #72]
|
||||
fmul v0.2d, v7.2d, v28.2d
|
||||
stur q6, [x18, #104]
|
||||
fmul v1.2d, v16.2d, v28.2d
|
||||
stur q0, [x18, #120]
|
||||
fmul v2.2d, v17.2d, v28.2d
|
||||
stur q1, [x18, #136]
|
||||
fmul v4.2d, v19.2d, v28.2d
|
||||
stur q2, [x18, #152]
|
||||
fadd v5.2d, v25.2d, v23.2d
|
||||
stur q4, [x18, #184]
|
||||
fmul v3.2d, v18.2d, v28.2d
|
||||
stur q3, [x18, #168]
|
||||
fmul v6.2d, v20.2d, v28.2d
|
||||
stur q6, [x18, #200]
|
||||
fmul v0.2d, v22.2d, v28.2d
|
||||
stur q0, [x18, #216]
|
||||
fmul v1.2d, v21.2d, v28.2d
|
||||
stur q1, [x18, #232]
|
||||
add x16, x16, #256 // =256
|
||||
fmul v2.2d, v5.2d, v28.2d
|
||||
stur q2, [x18, #248]
|
||||
adds x17, x17, #4 // =4
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
43
examples/j2d/j2d.s.tx2.gcc.s
Normal file
@@ -0,0 +1,43 @@
|
||||
// OSACA-BEGIN
|
||||
.L93:
|
||||
add x5, x0, 16
|
||||
ldr q2, [x14, x0]
|
||||
ldr q5, [x25, x0]
|
||||
add x7, x0, 32
|
||||
ldr q13, [x22, x0]
|
||||
ldr q4, [x25, x5]
|
||||
add x6, x0, 48
|
||||
ldr x9, [sp, 144]
|
||||
ldr q19, [x22, x5]
|
||||
ldr q7, [x14, x5]
|
||||
ldr q6, [x14, x7]
|
||||
ldr q3, [x25, x7]
|
||||
ldr q18, [x22, x7]
|
||||
fadd v17.2d, v2.2d, v30.2d
|
||||
ldr q16, [x14, x6]
|
||||
ldr q20, [x25, x6]
|
||||
fadd v23.2d, v5.2d, v13.2d
|
||||
ldr q22, [x22, x6]
|
||||
fadd v24.2d, v4.2d, v19.2d
|
||||
fadd v25.2d, v7.2d, v2.2d
|
||||
fadd v27.2d, v6.2d, v7.2d
|
||||
fadd v26.2d, v3.2d, v18.2d
|
||||
fadd v28.2d, v16.2d, v6.2d
|
||||
mov v30.16b, v16.16b
|
||||
fadd v29.2d, v20.2d, v22.2d
|
||||
fadd v31.2d, v23.2d, v17.2d
|
||||
fadd v0.2d, v24.2d, v25.2d
|
||||
fadd v2.2d, v26.2d, v27.2d
|
||||
fadd v1.2d, v29.2d, v28.2d
|
||||
fmul v5.2d, v31.2d, v21.2d
|
||||
fmul v13.2d, v0.2d, v21.2d
|
||||
fmul v4.2d, v2.2d, v21.2d
|
||||
fmul v19.2d, v1.2d, v21.2d
|
||||
str q5, [x28, x0]
|
||||
add x0, x0, 64
|
||||
str q13, [x28, x5]
|
||||
str q4, [x28, x7]
|
||||
str q19, [x28, x6]
|
||||
cmp x9, x0
|
||||
bne .L93
|
||||
// OSACA-END
|
||||
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
36
examples/j2d/j2d.s.zen.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
# OSACA-BEGIN
|
||||
.L28:
|
||||
vmovups (%r10,%rcx), %xmm5
|
||||
vmovups 32(%r10,%rax), %xmm13
|
||||
vmovups (%rdi,%rcx), %xmm1
|
||||
vmovups 32(%rdi,%rax), %xmm14
|
||||
vmovups 48(%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rcx), %xmm1, %xmm10
|
||||
vaddpd 32(%r8,%rax), %xmm14, %xmm15
|
||||
vaddpd 48(%r8,%rax), %xmm9, %xmm1
|
||||
vaddpd %xmm5, %xmm8, %xmm8
|
||||
vaddpd %xmm13, %xmm5, %xmm6
|
||||
vmovups 48(%r10,%rax), %xmm5
|
||||
vaddpd %xmm8, %xmm10, %xmm11
|
||||
vaddpd %xmm6, %xmm15, %xmm0
|
||||
vmulpd %xmm2, %xmm11, %xmm12
|
||||
vaddpd %xmm5, %xmm13, %xmm4
|
||||
vmulpd %xmm2, %xmm0, %xmm7
|
||||
vaddpd %xmm4, %xmm1, %xmm10
|
||||
vmovups %xmm12, (%rsi,%rcx)
|
||||
vmovups %xmm7, 32(%rsi,%rax)
|
||||
vmulpd %xmm2, %xmm10, %xmm8
|
||||
vmovups %xmm8, 48(%rsi,%rax)
|
||||
addq $64, %rax
|
||||
.L21:
|
||||
vmovups (%r10,%rax), %xmm8
|
||||
leaq 16(%rax), %rcx
|
||||
vmovups (%rdi,%rax), %xmm9
|
||||
vaddpd (%r8,%rax), %xmm9, %xmm10
|
||||
vaddpd %xmm8, %xmm5, %xmm11
|
||||
vaddpd %xmm11, %xmm10, %xmm12
|
||||
vmulpd %xmm2, %xmm12, %xmm13
|
||||
vmovups %xmm13, (%rsi,%rax)
|
||||
cmpq %rcx, %r14
|
||||
jne .L28
|
||||
# OSACA-END
|
||||
44
examples/striad/striad.s.csx.gcc.s
Normal file
44
examples/striad/striad.s.csx.gcc.s
Normal file
@@ -0,0 +1,44 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r15,%rax), %ymm5
|
||||
vmovupd 0(%r13,%rax), %ymm6
|
||||
vmovupd 32(%r15,%rax), %ymm8
|
||||
vmovupd 32(%r13,%rax), %ymm7
|
||||
vmovupd 64(%r15,%rax), %ymm9
|
||||
vmovupd 64(%r13,%rax), %ymm10
|
||||
vmovupd 96(%r15,%rax), %ymm11
|
||||
vmovupd 96(%r13,%rax), %ymm12
|
||||
vmovupd 128(%r15,%rax), %ymm13
|
||||
vmovupd 128(%r13,%rax), %ymm14
|
||||
vmovupd 160(%r15,%rax), %ymm15
|
||||
vmovupd 160(%r13,%rax), %ymm2
|
||||
vmovupd 192(%r15,%rax), %ymm0
|
||||
vmovupd 192(%r13,%rax), %ymm1
|
||||
vmovupd 224(%r15,%rax), %ymm3
|
||||
vmovupd 224(%r13,%rax), %ymm4
|
||||
vfmadd132pd (%r14,%rax), %ymm6, %ymm5
|
||||
vfmadd132pd 32(%r14,%rax), %ymm7, %ymm8
|
||||
vfmadd132pd 64(%r14,%rax), %ymm10, %ymm9
|
||||
vfmadd132pd 96(%r14,%rax), %ymm12, %ymm11
|
||||
vfmadd132pd 128(%r14,%rax), %ymm14, %ymm13
|
||||
vfmadd132pd 160(%r14,%rax), %ymm2, %ymm15
|
||||
vfmadd132pd 192(%r14,%rax), %ymm1, %ymm0
|
||||
vfmadd132pd 224(%r14,%rax), %ymm4, %ymm3
|
||||
vmovupd %ymm5, (%r12,%rax)
|
||||
vmovupd %ymm8, 32(%r12,%rax)
|
||||
vmovupd %ymm9, 64(%r12,%rax)
|
||||
vmovupd %ymm11, 96(%r12,%rax)
|
||||
vmovupd %ymm13, 128(%r12,%rax)
|
||||
vmovupd %ymm15, 160(%r12,%rax)
|
||||
vmovupd %ymm0, 192(%r12,%rax)
|
||||
vmovupd %ymm3, 224(%r12,%rax)
|
||||
addq $256, %rax
|
||||
cmpq %rax, %r8
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
21
examples/striad/striad.s.csx.icc.s
Normal file
21
examples/striad/striad.s.csx.icc.s
Normal file
@@ -0,0 +1,21 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.41: # Preds ..B1.41 ..B1.40
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%rcx,%rax,8), %zmm2 #80.5
|
||||
vmovups 64(%rcx,%rax,8), %zmm4 #80.5
|
||||
vmovups (%r14,%rax,8), %zmm1 #80.5
|
||||
vmovups 64(%r14,%rax,8), %zmm3 #80.5
|
||||
vfmadd213pd (%r8,%rax,8), %zmm1, %zmm2 #80.5
|
||||
vfmadd213pd 64(%r8,%rax,8), %zmm3, %zmm4 #80.5
|
||||
vmovupd %zmm2, (%r13,%rax,8) #80.5
|
||||
vmovupd %zmm4, 64(%r13,%rax,8) #80.5
|
||||
addq $16, %rax #80.5
|
||||
cmpq %r12, %rax #80.5
|
||||
jb ..B1.41 # Prob 82% #80.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
112
examples/striad/striad.s.tx2.clang.s
Normal file
112
examples/striad/striad.s.tx2.clang.s
Normal file
@@ -0,0 +1,112 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x9, #-256]
|
||||
ldp q2, q3, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
ldp q16, q17, [x11, #-256]
|
||||
ldp q18, q19, [x11, #-224]
|
||||
fmla v0.2d, v16.2d, v4.2d
|
||||
fmla v1.2d, v17.2d, v5.2d
|
||||
stp q1, q0, [sp, #96] // 32-byte Folded Spill
|
||||
fmla v2.2d, v18.2d, v6.2d
|
||||
fmla v3.2d, v19.2d, v7.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
ldp q20, q21, [x11, #-192]
|
||||
ldp q22, q23, [x11, #-160]
|
||||
fmla v4.2d, v20.2d, v16.2d
|
||||
stp q3, q4, [x12, #-208]
|
||||
fmla v5.2d, v21.2d, v17.2d
|
||||
fmla v6.2d, v22.2d, v18.2d
|
||||
stp q5, q6, [x12, #-176]
|
||||
fmla v7.2d, v23.2d, v19.2d
|
||||
ldp q16, q18, [x9, #-128]
|
||||
ldp q17, q19, [x9, #-96]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
ldp q24, q25, [x11, #-128]
|
||||
ldp q26, q27, [x11, #-96]
|
||||
fmla v16.2d, v24.2d, v20.2d
|
||||
stp q7, q16, [x12, #-144]
|
||||
fmla v18.2d, v25.2d, v21.2d
|
||||
fmla v17.2d, v26.2d, v22.2d
|
||||
stp q18, q17, [x12, #-112]
|
||||
fmla v19.2d, v27.2d, v23.2d
|
||||
ldp q22, q23, [x9, #-64]
|
||||
ldp q20, q21, [x9, #-32]
|
||||
ldp q24, q25, [x10, #-64]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
ldp q28, q29, [x11, #-64]
|
||||
ldp q30, q31, [x11, #-32]
|
||||
fmla v22.2d, v28.2d, v24.2d
|
||||
stp q19, q22, [x12, #-80]
|
||||
fmla v23.2d, v29.2d, v25.2d
|
||||
fmla v20.2d, v30.2d, v26.2d
|
||||
stp q23, q20, [x12, #-48]
|
||||
fmla v21.2d, v31.2d, v27.2d
|
||||
stur q21, [x12, #-16]
|
||||
ldp q24, q25, [x9]
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q28, q29, [x10]
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldp q8, q10, [x11]
|
||||
ldp q11, q12, [x11, #32]
|
||||
fmla v24.2d, v8.2d, v28.2d
|
||||
fmla v25.2d, v10.2d, v29.2d
|
||||
stp q24, q25, [x12]
|
||||
fmla v26.2d, v11.2d, v30.2d
|
||||
fmla v27.2d, v12.2d, v31.2d
|
||||
stp q26, q27, [x12, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
ldp q13, q14, [x11, #64]
|
||||
ldp q15, q9, [x11, #96]
|
||||
fmla v28.2d, v13.2d, v8.2d
|
||||
fmla v29.2d, v14.2d, v10.2d
|
||||
stp q28, q29, [x12, #64]
|
||||
fmla v30.2d, v15.2d, v11.2d
|
||||
fmla v31.2d, v9.2d, v12.2d
|
||||
stp q30, q31, [x12, #96]
|
||||
ldp q8, q9, [x9, #128]
|
||||
ldp q12, q13, [x10, #128]
|
||||
ldp q14, q15, [x11, #128]
|
||||
ldp q10, q11, [x9, #160]
|
||||
fmla v8.2d, v14.2d, v12.2d
|
||||
ldp q12, q14, [x10, #160]
|
||||
fmla v9.2d, v15.2d, v13.2d
|
||||
stp q8, q9, [x12, #128]
|
||||
ldp q13, q15, [x11, #160]
|
||||
fmla v10.2d, v13.2d, v12.2d
|
||||
fmla v11.2d, v15.2d, v14.2d
|
||||
stp q10, q11, [x12, #160]
|
||||
ldp q12, q13, [x9, #192]
|
||||
ldp q14, q15, [x10, #192]
|
||||
ldp q0, q1, [x11, #192]
|
||||
fmla v12.2d, v0.2d, v14.2d
|
||||
ldr q0, [sp, #112] // 16-byte Folded Reload
|
||||
stur q0, [x12, #-256]
|
||||
ldr q0, [sp, #96] // 16-byte Folded Reload
|
||||
stp q0, q2, [x12, #-240]
|
||||
ldp q0, q2, [x9, #224]
|
||||
ldp q3, q4, [x10, #224]
|
||||
ldp q5, q6, [x11, #224]
|
||||
fmla v13.2d, v1.2d, v15.2d
|
||||
stp q12, q13, [x12, #192]
|
||||
fmla v0.2d, v5.2d, v3.2d
|
||||
fmla v2.2d, v6.2d, v4.2d
|
||||
stp q0, q2, [x12, #224]
|
||||
add x8, x8, #64 // =64
|
||||
add x12, x12, #512 // =512
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x13, x13, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
53
examples/striad/striad.s.tx2.gcc.s
Normal file
53
examples/striad/striad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,53 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x12, x11, 16
|
||||
ldr q29, [x22, x11]
|
||||
ldr q30, [x20, x11]
|
||||
add x7, x11, 32
|
||||
ldr q31, [x21, x11]
|
||||
ldr q7, [x22, x12]
|
||||
add x6, x11, 48
|
||||
add x5, x11, 64
|
||||
ldr q6, [x20, x12]
|
||||
ldr q2, [x21, x12]
|
||||
add x8, x11, 80
|
||||
add x0, x11, 96
|
||||
ldr q9, [x22, x7]
|
||||
ldr q5, [x20, x7]
|
||||
add x13, x11, 112
|
||||
ldr q1, [x21, x7]
|
||||
ldr q16, [x22, x6]
|
||||
ldr q4, [x20, x6]
|
||||
ldr q0, [x21, x6]
|
||||
fmla v30.2d, v29.2d, v31.2d
|
||||
ldr q23, [x22, x5]
|
||||
ldr q3, [x20, x5]
|
||||
fmla v6.2d, v7.2d, v2.2d
|
||||
ldr q22, [x21, x5]
|
||||
ldr q21, [x22, x8]
|
||||
ldr q24, [x20, x8]
|
||||
ldr q20, [x21, x8]
|
||||
fmla v5.2d, v9.2d, v1.2d
|
||||
ldr q19, [x22, x0]
|
||||
ldr q25, [x20, x0]
|
||||
fmla v4.2d, v16.2d, v0.2d
|
||||
ldr q18, [x21, x0]
|
||||
ldr q17, [x22, x13]
|
||||
ldr q26, [x20, x13]
|
||||
ldr q27, [x21, x13]
|
||||
fmla v3.2d, v23.2d, v22.2d
|
||||
fmla v24.2d, v21.2d, v20.2d
|
||||
str q30, [x19, x11]
|
||||
add x11, x11, 128
|
||||
str q6, [x19, x12]
|
||||
fmla v25.2d, v19.2d, v18.2d
|
||||
str q5, [x19, x7]
|
||||
fmla v26.2d, v17.2d, v27.2d
|
||||
str q4, [x19, x6]
|
||||
str q3, [x19, x5]
|
||||
str q24, [x19, x8]
|
||||
str q25, [x19, x0]
|
||||
str q26, [x19, x13]
|
||||
cmp x25, x11
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/striad/striad.s.zen.gcc.s
Normal file
38
examples/striad/striad.s.zen.gcc.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups (%r14,%rax), %xmm0
|
||||
vmovups (%r12,%rax), %xmm5
|
||||
vmovups 16(%r14,%rax), %xmm3
|
||||
vmovups 16(%r12,%rax), %xmm6
|
||||
vmovups 32(%r14,%rax), %xmm4
|
||||
vmovups 32(%r12,%rax), %xmm7
|
||||
vmovups 48(%r14,%rax), %xmm8
|
||||
vmovups 48(%r12,%rax), %xmm9
|
||||
vmovups 64(%r14,%rax), %xmm10
|
||||
vmovups 64(%r12,%rax), %xmm11
|
||||
vmovups 80(%r14,%rax), %xmm12
|
||||
vmovups 80(%r12,%rax), %xmm13
|
||||
vmovups 96(%r14,%rax), %xmm14
|
||||
vmovups 96(%r12,%rax), %xmm15
|
||||
vmovups 112(%r14,%rax), %xmm2
|
||||
vmovups 112(%r12,%rax), %xmm1
|
||||
vfmadd132pd 0(%r13,%rax), %xmm5, %xmm0
|
||||
vfmadd132pd 16(%r13,%rax), %xmm6, %xmm3
|
||||
vfmadd132pd 32(%r13,%rax), %xmm7, %xmm4
|
||||
vfmadd132pd 48(%r13,%rax), %xmm9, %xmm8
|
||||
vfmadd132pd 64(%r13,%rax), %xmm11, %xmm10
|
||||
vfmadd132pd 80(%r13,%rax), %xmm13, %xmm12
|
||||
vfmadd132pd 96(%r13,%rax), %xmm15, %xmm14
|
||||
vfmadd132pd 112(%r13,%rax), %xmm1, %xmm2
|
||||
vmovups %xmm0, 0(%rbp,%rax)
|
||||
vmovups %xmm3, 16(%rbp,%rax)
|
||||
vmovups %xmm4, 32(%rbp,%rax)
|
||||
vmovups %xmm8, 48(%rbp,%rax)
|
||||
vmovups %xmm10, 64(%rbp,%rax)
|
||||
vmovups %xmm12, 80(%rbp,%rax)
|
||||
vmovups %xmm14, 96(%rbp,%rax)
|
||||
vmovups %xmm2, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rcx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
46
examples/sum_reduction/sum_reduction.s.csx.gcc.O3.s
Normal file
@@ -0,0 +1,46 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
# LLVM-MCA-BEGIN
|
||||
.L19:
|
||||
vmovupd (%rcx), %ymm4
|
||||
vmovupd 32(%rcx), %ymm13
|
||||
vaddsd %xmm4, %xmm0, %xmm6
|
||||
vunpckhpd %xmm4, %xmm4, %xmm3
|
||||
vextractf64x2 $0x1, %ymm4, %xmm8
|
||||
vaddsd %xmm6, %xmm3, %xmm7
|
||||
vunpckhpd %xmm8, %xmm8, %xmm11
|
||||
vunpckhpd %xmm13, %xmm13, %xmm1
|
||||
vaddsd %xmm7, %xmm8, %xmm10
|
||||
vextractf64x2 $0x1, %ymm13, %xmm2
|
||||
vunpckhpd %xmm2, %xmm2, %xmm3
|
||||
vaddsd %xmm11, %xmm10, %xmm12
|
||||
vmovupd 64(%rcx), %ymm8
|
||||
vmovupd 96(%rcx), %ymm5
|
||||
vaddsd %xmm13, %xmm12, %xmm0
|
||||
vunpckhpd %xmm8, %xmm8, %xmm12
|
||||
vextractf64x2 $0x1, %ymm8, %xmm14
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vunpckhpd %xmm14, %xmm14, %xmm0
|
||||
vextractf64x2 $0x1, %ymm5, %xmm9
|
||||
vaddsd %xmm4, %xmm2, %xmm6
|
||||
subq $-128, %rcx
|
||||
vaddsd %xmm3, %xmm6, %xmm7
|
||||
vaddsd %xmm8, %xmm7, %xmm11
|
||||
vunpckhpd %xmm5, %xmm5, %xmm7
|
||||
vaddsd %xmm11, %xmm12, %xmm13
|
||||
vunpckhpd %xmm9, %xmm9, %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm1
|
||||
vaddsd %xmm0, %xmm1, %xmm4
|
||||
vaddsd %xmm5, %xmm4, %xmm3
|
||||
vaddsd %xmm3, %xmm7, %xmm8
|
||||
vaddsd %xmm8, %xmm9, %xmm11
|
||||
vaddsd %xmm12, %xmm11, %xmm0
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
20
examples/sum_reduction/sum_reduction.s.csx.gcc.s
Normal file
@@ -0,0 +1,20 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vaddpd (%rcx), %ymm3, %ymm4
|
||||
addq $256, %rcx
|
||||
vaddpd -224(%rcx), %ymm4, %ymm5
|
||||
vaddpd -192(%rcx), %ymm5, %ymm6
|
||||
vaddpd -160(%rcx), %ymm6, %ymm8
|
||||
vaddpd -128(%rcx), %ymm8, %ymm9
|
||||
vaddpd -96(%rcx), %ymm9, %ymm10
|
||||
vaddpd -64(%rcx), %ymm10, %ymm11
|
||||
vaddpd -32(%rcx), %ymm11, %ymm3
|
||||
cmpq %rcx, %r15
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
17
examples/sum_reduction/sum_reduction.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vaddpd (%r13,%rax,8), %zmm4, %zmm4 #76.5
|
||||
vaddpd 64(%r13,%rax,8), %zmm3, %zmm3 #76.5
|
||||
vaddpd 128(%r13,%rax,8), %zmm2, %zmm2 #76.5
|
||||
vaddpd 192(%r13,%rax,8), %zmm1, %zmm1 #76.5
|
||||
addq $32, %rax #76.5
|
||||
cmpq %r14, %rax #76.5
|
||||
jb ..B1.38 # Prob 82% #76.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
57
examples/sum_reduction/sum_reduction.s.tx2.clang.s
Normal file
@@ -0,0 +1,57 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q4, q5, [x9, #-256]
|
||||
fadd v0.2d, v4.2d, v0.2d
|
||||
fadd v1.2d, v5.2d, v1.2d
|
||||
ldp q4, q5, [x9, #-192]
|
||||
ldp q16, q17, [x9, #-128]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q6, q7, [x9, #-224]
|
||||
fadd v2.2d, v6.2d, v2.2d
|
||||
fadd v3.2d, v7.2d, v3.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q6, q7, [x9, #-160]
|
||||
ldp q18, q19, [x9, #-96]
|
||||
ldp q16, q17, [x9]
|
||||
add x8, x8, #64 // =64
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
ldp q6, q7, [x9, #-32]
|
||||
ldp q18, q19, [x9, #32]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
ldp q4, q5, [x9, #-64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #64]
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v17.2d
|
||||
ldp q16, q17, [x9, #128]
|
||||
fadd v0.2d, v0.2d, v16.2d
|
||||
fadd v1.2d, v1.2d, v17.2d
|
||||
ldp q16, q17, [x9, #192]
|
||||
ldp q18, q19, [x9, #96]
|
||||
fadd v6.2d, v6.2d, v18.2d
|
||||
fadd v7.2d, v7.2d, v19.2d
|
||||
fadd v4.2d, v4.2d, v16.2d
|
||||
ldp q18, q19, [x9, #160]
|
||||
fadd v2.2d, v2.2d, v18.2d
|
||||
fadd v3.2d, v3.2d, v19.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
fadd v4.2d, v5.2d, v17.2d
|
||||
ldp q18, q19, [x9, #224]
|
||||
add x9, x9, #512 // =512
|
||||
fadd v1.2d, v1.2d, v4.2d
|
||||
fadd v4.2d, v6.2d, v18.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v4.2d, v7.2d, v19.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
adds x10, x10, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
47
examples/sum_reduction/sum_reduction.s.tx2.gcc.O3.s
Normal file
@@ -0,0 +1,47 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
ldr q4, [x17], 16
|
||||
ldr q5, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q3, [x16, -80]
|
||||
ldr q2, [x16, -64]
|
||||
ldr q0, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q7, [x16, -16]
|
||||
dup d16, v4.d[0]
|
||||
dup d6, v4.d[1]
|
||||
ldr q4, [x17, 16]
|
||||
dup d22, v5.d[0]
|
||||
dup d5, v5.d[1]
|
||||
dup d20, v3.d[0]
|
||||
dup d3, v3.d[1]
|
||||
dup d19, v2.d[0]
|
||||
dup d2, v2.d[1]
|
||||
dup d21, v4.d[0]
|
||||
dup d4, v4.d[1]
|
||||
fadd d10, d8, d16
|
||||
dup d18, v0.d[0]
|
||||
dup d0, v0.d[1]
|
||||
dup d8, v1.d[0]
|
||||
dup d1, v1.d[1]
|
||||
dup d17, v7.d[0]
|
||||
dup d7, v7.d[1]
|
||||
fadd d23, d6, d10
|
||||
fadd d24, d23, d22
|
||||
fadd d25, d5, d24
|
||||
fadd d26, d25, d21
|
||||
fadd d27, d4, d26
|
||||
fadd d28, d27, d20
|
||||
fadd d29, d3, d28
|
||||
fadd d30, d29, d19
|
||||
fadd d31, d2, d30
|
||||
fadd d16, d31, d18
|
||||
fadd d6, d0, d16
|
||||
fadd d22, d6, d8
|
||||
fadd d5, d1, d22
|
||||
fadd d20, d5, d17
|
||||
fadd d8, d7, d20
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
23
examples/sum_reduction/sum_reduction.s.tx2.gcc.s
Normal file
@@ -0,0 +1,23 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
mov x17, x16
|
||||
ldr q10, [x17], 16
|
||||
ldr q16, [x16, 16]
|
||||
add x16, x16, 128
|
||||
ldr q17, [x16, -80]
|
||||
ldr q18, [x16, -64]
|
||||
ldr q19, [x16, -48]
|
||||
ldr q20, [x16, -32]
|
||||
ldr q21, [x16, -16]
|
||||
fadd v22.2d, v1.2d, v10.2d
|
||||
ldr q23, [x17, 16]
|
||||
fadd v24.2d, v22.2d, v16.2d
|
||||
fadd v25.2d, v24.2d, v23.2d
|
||||
fadd v26.2d, v25.2d, v17.2d
|
||||
fadd v27.2d, v26.2d, v18.2d
|
||||
fadd v28.2d, v27.2d, v19.2d
|
||||
fadd v29.2d, v28.2d, v20.2d
|
||||
fadd v1.2d, v29.2d, v21.2d
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
38
examples/sum_reduction/sum_reduction.s.zen.gcc.O3.s
Normal file
@@ -0,0 +1,38 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovsd (%r10), %xmm8
|
||||
vmovsd 8(%r10), %xmm10
|
||||
subq $-128, %r10
|
||||
vmovsd -112(%r10), %xmm12
|
||||
vmovsd -104(%r10), %xmm14
|
||||
vmovsd -96(%r10), %xmm1
|
||||
vmovsd -88(%r10), %xmm2
|
||||
vmovsd -80(%r10), %xmm3
|
||||
vmovsd -72(%r10), %xmm6
|
||||
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vmovsd -64(%r10), %xmm8
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vmovsd -56(%r10), %xmm10
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vmovsd -48(%r10), %xmm12
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vmovsd -40(%r10), %xmm14
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vmovsd -32(%r10), %xmm1
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vmovsd -24(%r10), %xmm2
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
vmovsd -16(%r10), %xmm3
|
||||
vaddsd %xmm5, %xmm6, %xmm7
|
||||
vmovsd -8(%r10), %xmm6
|
||||
vaddsd %xmm8, %xmm7, %xmm9
|
||||
vaddsd %xmm9, %xmm10, %xmm11
|
||||
vaddsd %xmm12, %xmm11, %xmm13
|
||||
vaddsd %xmm13, %xmm14, %xmm15
|
||||
vaddsd %xmm1, %xmm15, %xmm4
|
||||
vaddsd %xmm4, %xmm2, %xmm0
|
||||
vaddsd %xmm3, %xmm0, %xmm5
|
||||
vaddsd %xmm5, %xmm6, %xmm7
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
14
examples/sum_reduction/sum_reduction.s.zen.gcc.s
Normal file
@@ -0,0 +1,14 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vaddpd (%r10), %xmm3, %xmm1
|
||||
subq $-128, %r10
|
||||
vaddpd -112(%r10), %xmm1, %xmm4
|
||||
vaddpd -96(%r10), %xmm4, %xmm5
|
||||
vaddpd -80(%r10), %xmm5, %xmm6
|
||||
vaddpd -64(%r10), %xmm6, %xmm8
|
||||
vaddpd -48(%r10), %xmm8, %xmm9
|
||||
vaddpd -32(%r10), %xmm9, %xmm10
|
||||
vaddpd -16(%r10), %xmm10, %xmm3
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
36
examples/triad/triad.s.csx.gcc.s
Normal file
36
examples/triad/triad.s.csx.gcc.s
Normal file
@@ -0,0 +1,36 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmovupd (%r14,%rsi), %ymm14
|
||||
vmovupd 32(%r14,%rsi), %ymm15
|
||||
vmovupd 64(%r14,%rsi), %ymm1
|
||||
vmovupd 96(%r14,%rsi), %ymm0
|
||||
vmovupd 128(%r14,%rsi), %ymm3
|
||||
vmovupd 160(%r14,%rsi), %ymm4
|
||||
vmovupd 192(%r14,%rsi), %ymm5
|
||||
vmovupd 224(%r14,%rsi), %ymm7
|
||||
vfmadd213pd 0(%r13,%rsi), %ymm6, %ymm14
|
||||
vfmadd213pd 32(%r13,%rsi), %ymm6, %ymm15
|
||||
vfmadd213pd 64(%r13,%rsi), %ymm6, %ymm1
|
||||
vfmadd213pd 96(%r13,%rsi), %ymm6, %ymm0
|
||||
vfmadd213pd 128(%r13,%rsi), %ymm6, %ymm3
|
||||
vfmadd213pd 160(%r13,%rsi), %ymm6, %ymm4
|
||||
vfmadd213pd 192(%r13,%rsi), %ymm6, %ymm5
|
||||
vfmadd213pd 224(%r13,%rsi), %ymm6, %ymm7
|
||||
vmovupd %ymm14, (%r12,%rsi)
|
||||
vmovupd %ymm15, 32(%r12,%rsi)
|
||||
vmovupd %ymm1, 64(%r12,%rsi)
|
||||
vmovupd %ymm0, 96(%r12,%rsi)
|
||||
vmovupd %ymm3, 128(%r12,%rsi)
|
||||
vmovupd %ymm4, 160(%r12,%rsi)
|
||||
vmovupd %ymm5, 192(%r12,%rsi)
|
||||
vmovupd %ymm7, 224(%r12,%rsi)
|
||||
addq $256, %rsi
|
||||
cmpq %rsi, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
16
examples/triad/triad.s.csx.icc.s
Normal file
16
examples/triad/triad.s.csx.icc.s
Normal file
@@ -0,0 +1,16 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.40: # Preds ..B1.40 ..B1.39
|
||||
# Execution count [2.22e+03]
|
||||
vmovups (%r13,%rax,8), %zmm1 #78.5
|
||||
vfmadd213pd (%rcx,%rax,8), %zmm2, %zmm1 #78.5
|
||||
vmovupd %zmm1, (%r14,%rax,8) #78.5
|
||||
addq $8, %rax #78.5
|
||||
cmpq %r12, %rax #78.5
|
||||
jb ..B1.40 # Prob 82% #78.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
118
examples/triad/triad.s.tx2.clang.s
Normal file
118
examples/triad/triad.s.tx2.clang.s
Normal file
@@ -0,0 +1,118 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_29: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q2, q3, [x9, #-256]
|
||||
ldp q0, q1, [x9, #-224]
|
||||
ldp q4, q5, [x10, #-256]
|
||||
ldp q6, q7, [x10, #-224]
|
||||
fmla v2.2d, v4.2d, v16.2d
|
||||
fmla v3.2d, v5.2d, v16.2d
|
||||
stp q2, q3, [x11, #-256]
|
||||
fmla v0.2d, v6.2d, v16.2d
|
||||
fmla v1.2d, v7.2d, v16.2d
|
||||
stp q0, q1, [x11, #-224]
|
||||
ldp q6, q7, [x9, #-192]
|
||||
ldp q16, q17, [x10, #-192]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v6.2d, v16.2d, v20.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q4, q5, [x9, #-160]
|
||||
ldp q18, q19, [x10, #-160]
|
||||
fmla v7.2d, v17.2d, v16.2d
|
||||
stp q6, q7, [x11, #-192]
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v4.2d, v18.2d, v16.2d
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v5.2d, v19.2d, v16.2d
|
||||
stp q4, q5, [x11, #-160]
|
||||
ldp q17, q19, [x9, #-128]
|
||||
ldp q20, q21, [x10, #-128]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v17.2d, v20.2d, v24.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q16, q18, [x9, #-96]
|
||||
ldp q22, q23, [x10, #-96]
|
||||
fmla v19.2d, v21.2d, v20.2d
|
||||
stp q17, q19, [x11, #-128]
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v16.2d, v22.2d, v20.2d
|
||||
ldr q20, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q24, q25, [x10, #-64]
|
||||
fmla v18.2d, v23.2d, v20.2d
|
||||
stp q16, q18, [x11, #-96]
|
||||
ldp q20, q22, [x9, #-64]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v20.2d, v24.2d, v28.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q21, q23, [x9, #-32]
|
||||
ldp q26, q27, [x10, #-32]
|
||||
fmla v22.2d, v25.2d, v24.2d
|
||||
stp q20, q22, [x11, #-64]
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v21.2d, v26.2d, v24.2d
|
||||
ldr q24, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q28, q29, [x10]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x10, #32]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v23.2d, v27.2d, v24.2d
|
||||
stp q21, q23, [x11, #-32]
|
||||
ldp q24, q25, [x9]
|
||||
fmla v24.2d, v28.2d, v8.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q26, q27, [x9, #32]
|
||||
ldp q8, q10, [x10, #64]
|
||||
ldp q11, q12, [x10, #96]
|
||||
fmla v25.2d, v29.2d, v28.2d
|
||||
stp q24, q25, [x11]
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v26.2d, v30.2d, v28.2d
|
||||
ldr q28, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q13, q14, [x10, #128]
|
||||
ldr q2, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q1, q3, [x10, #192]
|
||||
fmla v27.2d, v31.2d, v28.2d
|
||||
stp q26, q27, [x11, #32]
|
||||
ldp q28, q29, [x9, #64]
|
||||
fmla v28.2d, v8.2d, v9.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q30, q31, [x9, #96]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q6, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q5, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v29.2d, v10.2d, v8.2d
|
||||
stp q28, q29, [x11, #64]
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v30.2d, v11.2d, v8.2d
|
||||
ldr q8, [sp, #80] // 16-byte Folded Reload
|
||||
ldr q16, [sp, #80] // 16-byte Folded Reload
|
||||
add x8, x8, #64 // =64
|
||||
fmla v31.2d, v12.2d, v8.2d
|
||||
stp q30, q31, [x11, #96]
|
||||
ldp q8, q10, [x9, #128]
|
||||
fmla v8.2d, v13.2d, v9.2d
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
ldp q11, q12, [x9, #160]
|
||||
fmla v10.2d, v14.2d, v9.2d
|
||||
stp q8, q10, [x11, #128]
|
||||
ldp q13, q14, [x10, #160]
|
||||
fmla v12.2d, v14.2d, v2.2d
|
||||
ldp q2, q0, [x9, #192]
|
||||
ldr q9, [sp, #80] // 16-byte Folded Reload
|
||||
fmla v2.2d, v1.2d, v6.2d
|
||||
ldp q1, q4, [x9, #224]
|
||||
fmla v0.2d, v3.2d, v5.2d
|
||||
stp q2, q0, [x11, #192]
|
||||
ldp q3, q5, [x10, #224]
|
||||
fmla v11.2d, v13.2d, v9.2d
|
||||
stp q11, q12, [x11, #160]
|
||||
fmla v1.2d, v3.2d, v16.2d
|
||||
fmla v4.2d, v5.2d, v16.2d
|
||||
stp q1, q4, [x11, #224]
|
||||
add x11, x11, #512 // =512
|
||||
add x10, x10, #512 // =512
|
||||
add x9, x9, #512 // =512
|
||||
adds x12, x12, #8 // =8
|
||||
b.ne .LBB1_29
|
||||
// OSACA-END
|
||||
45
examples/triad/triad.s.tx2.gcc.s
Normal file
45
examples/triad/triad.s.tx2.gcc.s
Normal file
@@ -0,0 +1,45 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
add x0, x10, 16
|
||||
ldr q23, [x20, x10]
|
||||
ldr q24, [x21, x10]
|
||||
add x7, x10, 32
|
||||
ldr q25, [x20, x0]
|
||||
ldr q26, [x21, x0]
|
||||
add x6, x10, 48
|
||||
add x5, x10, 64
|
||||
ldr q27, [x20, x7]
|
||||
ldr q28, [x21, x7]
|
||||
add x4, x10, 80
|
||||
add x11, x10, 96
|
||||
ldr q29, [x20, x6]
|
||||
ldr q30, [x21, x6]
|
||||
add x2, x10, 112
|
||||
fmla v23.2d, v3.2d, v24.2d
|
||||
ldr q31, [x20, x5]
|
||||
ldr q4, [x21, x5]
|
||||
fmla v25.2d, v3.2d, v26.2d
|
||||
ldr q2, [x20, x4]
|
||||
ldr q5, [x21, x4]
|
||||
fmla v27.2d, v3.2d, v28.2d
|
||||
ldr q1, [x20, x11]
|
||||
ldr q6, [x21, x11]
|
||||
fmla v29.2d, v3.2d, v30.2d
|
||||
ldr q0, [x20, x2]
|
||||
ldr q7, [x21, x2]
|
||||
fmla v31.2d, v3.2d, v4.2d
|
||||
fmla v2.2d, v3.2d, v5.2d
|
||||
fmla v1.2d, v3.2d, v6.2d
|
||||
str q23, [x19, x10]
|
||||
add x10, x10, 128
|
||||
fmla v0.2d, v3.2d, v7.2d
|
||||
str q25, [x19, x0]
|
||||
str q27, [x19, x7]
|
||||
str q29, [x19, x6]
|
||||
str q31, [x19, x5]
|
||||
str q2, [x19, x4]
|
||||
str q1, [x19, x11]
|
||||
str q0, [x19, x2]
|
||||
cmp x24, x10
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
30
examples/triad/triad.s.zen.gcc.s
Normal file
30
examples/triad/triad.s.zen.gcc.s
Normal file
@@ -0,0 +1,30 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmovups 0(%r13,%rax), %xmm12
|
||||
vmovups 16(%r13,%rax), %xmm13
|
||||
vmovups 32(%r13,%rax), %xmm14
|
||||
vmovups 48(%r13,%rax), %xmm15
|
||||
vmovups 64(%r13,%rax), %xmm1
|
||||
vmovups 80(%r13,%rax), %xmm0
|
||||
vmovups 96(%r13,%rax), %xmm4
|
||||
vmovups 112(%r13,%rax), %xmm5
|
||||
vfmadd213pd (%r12,%rax), %xmm3, %xmm12
|
||||
vfmadd213pd 16(%r12,%rax), %xmm3, %xmm13
|
||||
vfmadd213pd 32(%r12,%rax), %xmm3, %xmm14
|
||||
vfmadd213pd 48(%r12,%rax), %xmm3, %xmm15
|
||||
vfmadd213pd 64(%r12,%rax), %xmm3, %xmm1
|
||||
vfmadd213pd 80(%r12,%rax), %xmm3, %xmm0
|
||||
vfmadd213pd 96(%r12,%rax), %xmm3, %xmm4
|
||||
vfmadd213pd 112(%r12,%rax), %xmm3, %xmm5
|
||||
vmovups %xmm12, 0(%rbp,%rax)
|
||||
vmovups %xmm13, 16(%rbp,%rax)
|
||||
vmovups %xmm14, 32(%rbp,%rax)
|
||||
vmovups %xmm15, 48(%rbp,%rax)
|
||||
vmovups %xmm1, 64(%rbp,%rax)
|
||||
vmovups %xmm0, 80(%rbp,%rax)
|
||||
vmovups %xmm4, 96(%rbp,%rax)
|
||||
vmovups %xmm5, 112(%rbp,%rax)
|
||||
subq $-128, %rax
|
||||
cmpq %rbx, %rax
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
28
examples/update/update.s.csx.gcc.s
Normal file
28
examples/update/update.s.csx.gcc.s
Normal file
@@ -0,0 +1,28 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.L19:
|
||||
vmulpd (%rcx), %ymm3, %ymm12
|
||||
vmulpd 32(%rcx), %ymm3, %ymm13
|
||||
vmulpd 64(%rcx), %ymm3, %ymm14
|
||||
vmulpd 96(%rcx), %ymm3, %ymm15
|
||||
vmulpd 128(%rcx), %ymm3, %ymm0
|
||||
vmulpd 160(%rcx), %ymm3, %ymm1
|
||||
vmulpd 192(%rcx), %ymm3, %ymm7
|
||||
vmulpd 224(%rcx), %ymm3, %ymm4
|
||||
vmovupd %ymm12, (%rcx)
|
||||
vmovupd %ymm13, 32(%rcx)
|
||||
vmovupd %ymm14, 64(%rcx)
|
||||
vmovupd %ymm15, 96(%rcx)
|
||||
vmovupd %ymm0, 128(%rcx)
|
||||
vmovupd %ymm1, 160(%rcx)
|
||||
vmovupd %ymm7, 192(%rcx)
|
||||
vmovupd %ymm4, 224(%rcx)
|
||||
addq $256, %rcx
|
||||
cmpq %r15, %rcx
|
||||
jne .L19
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
17
examples/update/update.s.csx.icc.s
Normal file
17
examples/update/update.s.csx.icc.s
Normal file
@@ -0,0 +1,17 @@
|
||||
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
..B1.38: # Preds ..B1.38 ..B1.37
|
||||
# Execution count [2.22e+03]
|
||||
vmulpd (%r13,%rax,8), %zmm3, %zmm1 #75.5
|
||||
vmulpd 64(%r13,%rax,8), %zmm3, %zmm2 #75.5
|
||||
vmovupd %zmm1, (%r13,%rax,8) #75.5
|
||||
vmovupd %zmm2, 64(%r13,%rax,8) #75.5
|
||||
addq $16, %rax #75.5
|
||||
cmpq %r14, %rax #75.5
|
||||
jb ..B1.38 # Prob 82% #75.5
|
||||
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
|
||||
15
examples/update/update.s.tx2.clang.s
Normal file
15
examples/update/update.s.tx2.clang.s
Normal file
@@ -0,0 +1,15 @@
|
||||
// OSACA-BEGIN
|
||||
.LBB1_32: // Parent Loop BB1_20 Depth=1
|
||||
// Parent Loop BB1_22 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
ldp q0, q1, [x8]
|
||||
ldp q2, q3, [x8, #-32]
|
||||
fmul v2.2d, v2.2d, v26.2d
|
||||
fmul v3.2d, v3.2d, v26.2d
|
||||
stp q2, q3, [x8, #-32]
|
||||
fmul v0.2d, v0.2d, v26.2d
|
||||
fmul v1.2d, v1.2d, v26.2d
|
||||
stp q0, q1, [x8], #64
|
||||
adds x9, x9, #1 // =1
|
||||
b.ne .LBB1_32
|
||||
// OSACA-END
|
||||
31
examples/update/update.s.tx2.gcc.s
Normal file
31
examples/update/update.s.tx2.gcc.s
Normal file
@@ -0,0 +1,31 @@
|
||||
// OSACA-BEGIN
|
||||
.L17:
|
||||
ldr q23, [x16]
|
||||
mov x17, x16
|
||||
add x16, x16, 128
|
||||
fmul v24.2d, v23.2d, v2.2d
|
||||
str q24, [x17], 16
|
||||
ldr q25, [x16, -112]
|
||||
fmul v26.2d, v25.2d, v2.2d
|
||||
str q26, [x16, -112]
|
||||
ldr q27, [x17, 16]
|
||||
fmul v28.2d, v27.2d, v2.2d
|
||||
str q28, [x17, 16]
|
||||
ldr q29, [x16, -80]
|
||||
ldr q30, [x16, -64]
|
||||
ldr q31, [x16, -48]
|
||||
ldr q1, [x16, -32]
|
||||
ldr q0, [x16, -16]
|
||||
fmul v5.2d, v29.2d, v2.2d
|
||||
fmul v4.2d, v30.2d, v2.2d
|
||||
fmul v3.2d, v31.2d, v2.2d
|
||||
fmul v6.2d, v1.2d, v2.2d
|
||||
fmul v7.2d, v0.2d, v2.2d
|
||||
str q5, [x16, -80]
|
||||
str q4, [x16, -64]
|
||||
str q3, [x16, -48]
|
||||
str q6, [x16, -32]
|
||||
str q7, [x16, -16]
|
||||
cmp x22, x16
|
||||
bne .L17
|
||||
// OSACA-END
|
||||
22
examples/update/update.s.zen.gcc.s
Normal file
22
examples/update/update.s.zen.gcc.s
Normal file
@@ -0,0 +1,22 @@
|
||||
# OSACA-BEGIN
|
||||
.L19:
|
||||
vmulpd (%r10), %xmm3, %xmm11
|
||||
subq $-128, %r10
|
||||
vmulpd -112(%r10), %xmm3, %xmm12
|
||||
vmulpd -96(%r10), %xmm3, %xmm13
|
||||
vmulpd -80(%r10), %xmm3, %xmm14
|
||||
vmulpd -64(%r10), %xmm3, %xmm15
|
||||
vmulpd -48(%r10), %xmm3, %xmm0
|
||||
vmovups %xmm11, -128(%r10)
|
||||
vmulpd -32(%r10), %xmm3, %xmm7
|
||||
vmovups %xmm12, -112(%r10)
|
||||
vmulpd -16(%r10), %xmm3, %xmm1
|
||||
vmovups %xmm13, -96(%r10)
|
||||
vmovups %xmm14, -80(%r10)
|
||||
vmovups %xmm15, -64(%r10)
|
||||
vmovups %xmm0, -48(%r10)
|
||||
vmovups %xmm7, -32(%r10)
|
||||
vmovups %xmm1, -16(%r10)
|
||||
cmpq %r10, %r14
|
||||
jne .L19
|
||||
# OSACA-END
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Open Source Architecture Code Analyzer"""
|
||||
name = 'osaca'
|
||||
__version__ = '0.3.1.dev1'
|
||||
__version__ = '0.3.11'
|
||||
|
||||
# To trigger travis deployment to pypi, do the following:
|
||||
# 1. Increment __version__
|
||||
|
||||
@@ -5,9 +5,9 @@ import sys
|
||||
from io import StringIO
|
||||
|
||||
from osaca.frontend import Frontend
|
||||
from osaca.parser import ParserAArch64v81, ParserX86ATT
|
||||
from osaca.parser import ParserAArch64, ParserX86ATT
|
||||
from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
|
||||
SemanticsAppender, reduce_to_section)
|
||||
ArchSemantics, reduce_to_section)
|
||||
|
||||
|
||||
# Stolen from https://stackoverflow.com/a/16571630
|
||||
@@ -26,10 +26,10 @@ class Capturing(list):
|
||||
class KerncraftAPI(object):
|
||||
def __init__(self, arch, code):
|
||||
self.machine_model = MachineModel(arch=arch)
|
||||
self.semantics = SemanticsAppender(self.machine_model)
|
||||
self.semantics = ArchSemantics(self.machine_model)
|
||||
isa = self.machine_model.get_ISA().lower()
|
||||
if isa == 'aarch64':
|
||||
self.parser = ParserAArch64v81()
|
||||
self.parser = ParserAArch64()
|
||||
elif isa == 'x86':
|
||||
self.parser = ParserX86ATT()
|
||||
|
||||
@@ -40,9 +40,7 @@ class KerncraftAPI(object):
|
||||
def create_output(self, verbose=False):
|
||||
kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
|
||||
frontend = Frontend(arch=self.machine_model.get_arch())
|
||||
with Capturing() as output:
|
||||
frontend.print_full_analysis(self.kernel, kernel_graph, verbose=verbose)
|
||||
return '\n'.join(output)
|
||||
return frontend.full_analysis(self.kernel, kernel_graph, verbose=verbose)
|
||||
|
||||
def get_unmatched_instruction_ratio(self):
|
||||
unmatched_counter = 0
|
||||
|
||||
31
osaca/data/_build_cache.py
Executable file
31
osaca/data/_build_cache.py
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
"""Instantiate a MachineModel for every YAML file shipped next to this script.

Prints 'Building cache: ' followed by one '.' per architecture file (``*.yml``) and one
'+' per ISA file (``isa/*.yml``). Presumably constructing a MachineModel populates
OSACA's on-disk cache as a side effect -- that is what the printed messages imply, the
caching itself is not visible here.

If the osaca package (or one of its dependencies) cannot be imported, a notice is
printed and the script exits without an error status.
"""
from glob import glob
import os.path
import sys

# Make the osaca package two directories above this script importable. The entry is
# anchored at the script's own location instead of the current working directory, so it
# agrees with the __file__-relative globbing below no matter where the script is run from.
sys.path[0:0] = [os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')]

try:
    from osaca.semantics.hw_model import MachineModel
except ModuleNotFoundError:
    print("Unable to import MachineModel, probably some dependency is not yet installed. SKIPPING. "
          "First run of OSACA may take a while to build caches, subsequent runs will be as fast as "
          "ever.")
    # Deliberately a clean exit: skipping the warm-up is not treated as a failure.
    sys.exit()


def _load_models(pattern, marker):
    """Build a MachineModel for each file matching *pattern* and print *marker* per file."""
    for f in glob(os.path.join(os.path.dirname(__file__), pattern)):
        MachineModel(path_to_yaml=f)
        print(marker, end='')
        sys.stdout.flush()


print('Building cache: ', end='')
sys.stdout.flush()

# Iterating architectures
_load_models('*.yml', '.')

# Iterating ISAs
_load_models('isa/*.yml', '+')

print()
|
||||
1040
osaca/data/a64fx.yml
Normal file
1040
osaca/data/a64fx.yml
Normal file
File diff suppressed because it is too large
Load Diff
44150
osaca/data/bdw.yml
Normal file
44150
osaca/data/bdw.yml
Normal file
File diff suppressed because it is too large
Load Diff
70792
osaca/data/csx.yml
70792
osaca/data/csx.yml
File diff suppressed because it is too large
Load Diff
793
osaca/data/generate_mov_entries.py
Executable file
793
osaca/data/generate_mov_entries.py
Executable file
@@ -0,0 +1,793 @@
|
||||
#!/usr/bin/env python3
|
||||
from collections import OrderedDict, defaultdict
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
class MOVEntryBuilder:
    """Render instruction-form entries for mov-like instructions as YAML text."""

    @staticmethod
    def compute_throughput(port_pressure):
        """
        Return the highest per-port occupancy resulting from *port_pressure*.

        :param port_pressure: iterable of ``[uops, ports]`` pairs. ``ports`` is iterated to
            get the individual ports (a string contributes one port per character) and
            the uops of the pair are spread evenly over them.
        :returns: maximum accumulated occupancy of any port as float, 0.0 if there is none
        """
        port_occupancy = defaultdict(Fraction)
        for uops, ports in port_pressure:
            for p in ports:
                port_occupancy[p] += Fraction(uops, len(ports))
        return float(max(port_occupancy.values(), default=0))

    @staticmethod
    def classify(operands_types):
        """
        Tell whether an operand list describes a load and/or a store.

        A ``'mem'`` operand in any position but the last counts as load, a ``'mem'`` in
        the last position counts as store. Only the exact string ``'mem'`` is recognized.

        :returns: tuple ``(load, store)`` of booleans
        :raises AssertionError: if the operands describe a load and a store at once
        """
        load = 'mem' in operands_types[:-1]
        store = 'mem' in operands_types[-1:]
        assert not (load and store), "Can not process a combined load-store instruction."
        return load, store

    def build_description(
        self, instruction_name, operand_types, port_pressure=None, latency=0, comment=None
    ):
        """
        Build the YAML text of one instruction form.

        :param instruction_name: value of the ``name`` key
        :param operand_types: operand classes in order; ``'imd'`` gives an immediate,
            anything starting with ``'mem'`` a memory operand (``'mem_simple'`` without
            index, ``'mem_complex'`` with a gpr index, otherwise wildcard index) and every
            other value is used as register name
        :param port_pressure: list of ``[uops, ports]`` pairs, defaults to an empty list
        :param latency: value of the ``latency`` key
        :param comment: optional text appended as YAML comment to the ``name`` line
        :returns: the entry as string, terminated by a newline
        """
        # None instead of a mutable [] default; an omitted value still renders as "[]".
        if port_pressure is None:
            port_pressure = []
        comment = " # " + comment if comment else ""

        # The keys of an entry are nested two spaces below the leading "- name:" item and
        # the keys of each operand another two spaces deeper, otherwise the result is
        # not a valid YAML mapping.
        parts = ['- name: {}{}\n  operands:\n'.format(instruction_name, comment)]
        for ot in operand_types:
            if ot == 'imd':
                parts.append('  - class: immediate\n    imd: int\n')
            elif ot.startswith('mem'):
                parts.append('  - class: memory\n' '    base: "*"\n' '    offset: "*"\n')
                if ot == 'mem_simple':
                    parts.append('    index: ~\n')
                elif ot == 'mem_complex':
                    parts.append('    index: gpr\n')
                else:
                    parts.append('    index: "*"\n')
                parts.append('    scale: "*"\n')
            else:
                parts.append('  - class: register\n    name: {}\n'.format(ot))

        parts.append(
            (
                '  latency: {latency}\n'
                '  port_pressure: {port_pressure!r}\n'
                '  throughput: {throughput}\n'
                '  uops: {uops}\n'
            ).format(
                latency=latency,
                port_pressure=port_pressure,
                throughput=self.compute_throughput(port_pressure),
                uops=sum(uops for uops, _ in port_pressure),
            )
        )
        return ''.join(parts)

    def parse_port_pressure(self, port_pressure_str):
        """
        Convert a port pressure string into a list of ``[uops, ports]`` pairs.

        Example:
            1*p45+2*p0 -> [[1, '45'], [2, '0']]

        An empty string results in an empty list.
        """
        port_pressure = []
        if port_pressure_str:
            for p in port_pressure_str.split('+'):
                cycles, ports = p.split('*p')
                port_pressure.append([int(cycles), ports])
        return port_pressure

    def process_item(self, instruction_form, resources):
        """
        Build the description of one ``(instruction_form, resources)`` table item.

        The item is split up as in this example before it is handed to
        :meth:`build_description`:
            ('mov xmm mem', ('1*p45+2*p0', 7)) -> ('mov', ['xmm', 'mem'], [[1, '45'], [2, '0']], 7)

        :param instruction_form: instruction name and operand types, separated by blanks
        :param resources: tuple of port pressure string and latency
        :returns: whatever :meth:`build_description` returns for the item
        """
        instr_elements = instruction_form.split(' ')
        latency = resources[1]
        port_pressure = self.parse_port_pressure(resources[0])
        instruction_name = instr_elements[0]
        operand_types = instr_elements[1:]
        return self.build_description(instruction_name, operand_types, port_pressure, latency)
|
||||
|
||||
|
||||
class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
    """Entry builder that adds the load/store resources to memory forms (for SNB and IVB)."""

    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """
        Build the YAML text of one instruction form, including its memory access cost.

        For a load ``[[1, '23'], [1, ['2D', '3D']]]`` is appended to the port pressure and
        the latency is raised by 4; for a store ``[[1, '23'], [1, '4']]`` is appended and
        the latency is kept. The entry is commented with "with load" or "with store".

        :param instruction_name: name of the instruction
        :param operand_types: operand classes in order, see :meth:`classify`
        :param port_pressure: ``[uops, ports]`` pairs of the instruction without the memory
            access, defaults to an empty list; the passed list is not modified
        :param latency: latency of the instruction without the memory access
        :returns: the entry as built by ``MOVEntryBuilder.build_description``
        """
        load, store = self.classify(operand_types)

        # Extend a private copy: an in-place "+=" on the argument would change the
        # caller's list and, for an omitted argument, pile up entries across calls.
        port_pressure = list(port_pressure) if port_pressure is not None else []

        comment = None
        if load:
            port_pressure += [[1, '23'], [1, ['2D', '3D']]]
            latency += 4
            comment = "with load"
        if store:
            port_pressure += [[1, '23'], [1, '4']]
            comment = "with store"

        return MOVEntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency, comment
        )
|
||||
|
||||
|
||||
class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
    """Entry builder that adds the load/store resources to memory forms.

    For HSW, BDW, SKX and CSX. Stores are emitted twice, once for simple and once for
    complex addressing, because only the simple variant may use port 7 ('237' vs. '23').
    """

    # Memory operand class, additional port pressure and comment of both store variants.
    _STORE_VARIANTS = (
        ('mem_simple', [[1, '237'], [1, '4']], "with store, simple AGU"),
        ('mem_complex', [[1, '23'], [1, '4']], "with store, complex AGU"),
    )

    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """
        Build the YAML text of one instruction form, including its memory access cost.

        * load: one entry with ``[[1, '23'], [1, ['2D', '3D']]]`` appended to the port
          pressure and the latency raised by 4
        * store: two entries joined by a newline, in which every ``'mem'`` operand is
          replaced by ``'mem_simple'`` and ``'mem_complex'`` respectively; latency is kept
        * register only: one entry with unchanged resources and no comment

        :param instruction_name: name of the instruction
        :param operand_types: operand classes in order, see :meth:`classify`
        :param port_pressure: ``[uops, ports]`` pairs of the instruction without the memory
            access, defaults to an empty list; the passed list is not modified
        :param latency: latency of the instruction without the memory access
        :returns: the entry or entries as built by ``MOVEntryBuilder.build_description``
        """
        load, store = self.classify(operand_types)

        # Never extend the argument in place: "+=" would change the caller's list and,
        # for an omitted argument, pile up entries across calls.
        port_pressure = list(port_pressure) if port_pressure is not None else []

        if load:
            return MOVEntryBuilder.build_description(
                self,
                instruction_name,
                operand_types,
                port_pressure + [[1, '23'], [1, ['2D', '3D']]],
                latency + 4,
                "with load",
            )

        if store:
            return '\n'.join(
                MOVEntryBuilder.build_description(
                    self,
                    instruction_name,
                    [mem_class if o == 'mem' else o for o in operand_types],
                    port_pressure + store_pressure,
                    latency,
                    comment,
                )
                for mem_class, store_pressure, comment in self._STORE_VARIANTS
            )

        # Register only:
        return MOVEntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency
        )
|
||||
|
||||
|
||||
# Module-level builder instances used by get_description():
# np7 renders the SNB/IVB tables, p7 renders the HSW/BDW/SKX/CSX tables.
np7 = MOVEntryBuilderIntelNoPort7AGU()
p7 = MOVEntryBuilderIntelWithPort7AGU()
|
||||
|
||||
# SNB
|
||||
# Base table of mov-like instruction forms. Each item is
#   ('<mnemonic> <operand types...>', ('<port pressure expression>', <latency>))
# and is consumed via process_item(*item). An empty port pressure with latency 0
# is used for forms whose only cost is the memory access added by the builder.
snb_mov_instructions = [
    # https://www.felixcloutier.com/x86/mov
    ('mov gpr gpr', ('1*p015', 1)),
    ('mov gpr mem', ('', 0)),
    ('mov mem gpr', ('', 0)),
    ('mov imd gpr', ('1*p015', 1)),
    ('mov imd mem', ('', 0)),
    ('movabs imd gpr', ('1*p015', 1)),  # AT&T version
    # https://www.felixcloutier.com/x86/movapd
    ('movapd xmm xmm', ('1*p5', 1)),
    ('movapd xmm mem', ('', 0)),
    ('movapd mem xmm', ('', 0)),
    ('vmovapd xmm xmm', ('1*p5', 1)),
    ('vmovapd xmm mem', ('', 0)),
    ('vmovapd mem xmm', ('', 0)),
    ('vmovapd ymm ymm', ('1*p5', 1)),
    ('vmovapd ymm mem', ('', 0)),
    ('vmovapd mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movaps
    ('movaps xmm xmm', ('1*p5', 1)),
    ('movaps xmm mem', ('', 0)),
    ('movaps mem xmm', ('', 0)),
    ('vmovaps xmm xmm', ('1*p5', 1)),
    # The four memory forms below were spelled 'movaps' (copy-paste slip), which
    # duplicated the legacy rows and left the VEX memory forms without an entry.
    ('vmovaps xmm mem', ('', 0)),
    ('vmovaps mem xmm', ('', 0)),
    ('vmovaps ymm ymm', ('1*p5', 1)),
    ('vmovaps ymm mem', ('', 0)),
    ('vmovaps mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movd:movq
    ('movd gpr mm', ('1*p5', 1)),
    ('movd mem mm', ('', 0)),
    ('movq gpr mm', ('1*p5', 1)),
    ('movq mem mm', ('', 0)),
    ('movd mm gpr', ('1*p0', 1)),
    ('movd mm mem', ('', 0)),
    ('movq mm gpr', ('1*p0', 1)),
    ('movq mm mem', ('', 0)),
    ('movd gpr xmm', ('1*p5', 1)),
    ('movd mem xmm', ('', 0)),
    ('movq gpr xmm', ('1*p5', 1)),
    ('movq mem xmm', ('', 0)),
    ('movd xmm gpr', ('1*p0', 1)),
    ('movd xmm mem', ('', 0)),
    ('movq xmm gpr', ('1*p0', 1)),
    ('movq xmm mem', ('', 0)),
    ('vmovd gpr xmm', ('1*p5', 1)),
    ('vmovd mem xmm', ('', 0)),
    ('vmovq gpr xmm', ('1*p5', 1)),
    ('vmovq mem xmm', ('', 0)),
    ('vmovd xmm gpr', ('1*p0', 1)),
    ('vmovd xmm mem', ('', 0)),
    ('vmovq xmm gpr', ('1*p0', 1)),
    ('vmovq xmm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movddup
    ('movddup xmm xmm', ('1*p5', 1)),
    ('movddup mem xmm', ('', 0)),
    ('vmovddup xmm xmm', ('1*p5', 1)),
    ('vmovddup mem xmm', ('', 0)),
    ('vmovddup ymm ymm', ('1*p5', 1)),
    ('vmovddup mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movdq2q
    ('movdq2q xmm mm', ('1*p015+1*p5', 1)),
    # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
    ('movdqa xmm xmm', ('1*p015', 1)),
    ('movdqa mem xmm', ('', 0)),
    ('movdqa xmm mem', ('', 0)),
    ('vmovdqa xmm xmm', ('1*p015', 1)),
    ('vmovdqa mem xmm', ('', 0)),
    ('vmovdqa xmm mem', ('', 0)),
    ('vmovdqa ymm ymm', ('1*p05', 1)),
    ('vmovdqa mem ymm', ('', 0)),
    ('vmovdqa ymm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
    ('movdqu xmm xmm', ('1*p015', 1)),
    ('movdqu mem xmm', ('', 0)),
    ('movdqu xmm mem', ('', 0)),
    ('vmovdqu xmm xmm', ('1*p015', 1)),
    ('vmovdqu mem xmm', ('', 0)),
    ('vmovdqu xmm mem', ('', 0)),
    ('vmovdqu ymm ymm', ('1*p05', 1)),
    ('vmovdqu mem ymm', ('', 0)),
    ('vmovdqu ymm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movhlps
    ('movhlps xmm xmm', ('1*p5', 1)),
    ('vmovhlps xmm xmm xmm', ('1*p5', 1)),
    # https://www.felixcloutier.com/x86/movhpd
    ('movhpd mem xmm', ('1*p5', 1)),
    ('vmovhpd mem xmm xmm', ('1*p5', 1)),
    ('movhpd xmm mem', ('', 0)),
    # NOTE(review): possibly meant as the store form 'vmovhpd xmm mem' -- confirm
    ('vmovhpd mem xmm', ('', 0)),
    # https://www.felixcloutier.com/x86/movhps
    ('movhps mem xmm', ('1*p5', 1)),
    ('vmovhps mem xmm xmm', ('1*p5', 1)),
    ('movhps xmm mem', ('', 0)),
    # NOTE(review): possibly meant as the store form 'vmovhps xmm mem' -- confirm
    ('vmovhps mem xmm', ('', 0)),
    # https://www.felixcloutier.com/x86/movlhps
    ('movlhps xmm xmm', ('1*p5', 1)),
    ('vmovlhps xmm xmm xmm', ('1*p5', 1)),
    # https://www.felixcloutier.com/x86/movlpd
    ('movlpd mem xmm', ('1*p5', 1)),
    ('vmovlpd mem xmm xmm', ('1*p5', 1)),
    ('movlpd xmm mem', ('', 0)),
    # NOTE(review): possibly meant as the store form 'vmovlpd xmm mem' -- confirm
    ('vmovlpd mem xmm', ('1*p5', 1)),
    # https://www.felixcloutier.com/x86/movlps
    ('movlps mem xmm', ('1*p5', 1)),
    ('vmovlps mem xmm xmm', ('1*p5', 1)),
    ('movlps xmm mem', ('', 0)),
    # NOTE(review): possibly meant as the store form 'vmovlps xmm mem' -- confirm
    ('vmovlps mem xmm', ('1*p5', 1)),
    # https://www.felixcloutier.com/x86/movmskpd
    ('movmskpd xmm gpr', ('1*p0', 2)),
    ('vmovmskpd xmm gpr', ('1*p0', 2)),
    ('vmovmskpd ymm gpr', ('1*p0', 2)),
    # https://www.felixcloutier.com/x86/movmskps
    ('movmskps xmm gpr', ('1*p0', 1)),
    ('vmovmskps xmm gpr', ('1*p0', 1)),
    ('vmovmskps ymm gpr', ('1*p0', 1)),
    # https://www.felixcloutier.com/x86/movntdq
    ('movntdq xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntdq xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntdq ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
    # https://www.felixcloutier.com/x86/movntdqa
    ('movntdqa mem xmm', ('', 0)),
    ('vmovntdqa mem xmm', ('', 0)),
    ('vmovntdqa mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movnti
    ('movnti gpr mem', ('', 0)),  # TODO NT-store: what latency to use?
    # https://www.felixcloutier.com/x86/movntpd
    ('movntpd xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntpd xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntpd ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
    # https://www.felixcloutier.com/x86/movntps
    ('movntps xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntps xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntps ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
    # https://www.felixcloutier.com/x86/movntq
    ('movntq mm mem', ('', 0)),  # TODO NT-store: what latency to use?
    # https://www.felixcloutier.com/x86/movq
    ('movq mm mm', ('', 0)),
    ('movq mem mm', ('', 0)),
    ('movq mm mem', ('', 0)),
    ('movq xmm xmm', ('1*p015', 1)),
    ('movq mem xmm', ('', 0)),
    ('movq xmm mem', ('', 0)),
    ('vmovq xmm xmm', ('1*p015', 1)),
    ('vmovq mem xmm', ('', 0)),
    ('vmovq xmm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movq2dq
    ('movq2dq mm xmm', ('1*p015', 1)),
    # https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
    # TODO combined load-store is currently not supported
    # ('movs mem mem', ()),
    # https://www.felixcloutier.com/x86/movsd
    ('movsd xmm xmm', ('1*p5', 1)),
    ('movsd mem xmm', ('', 0)),
    ('movsd xmm mem', ('', 0)),
    ('vmovsd xmm xmm xmm', ('1*p5', 1)),
    ('vmovsd mem xmm', ('', 0)),
    ('vmovsd xmm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movshdup
    ('movshdup xmm xmm', ('1*p5', 1)),
    ('movshdup mem xmm', ('', 0)),
    ('vmovshdup xmm xmm', ('1*p5', 1)),
    ('vmovshdup mem xmm', ('', 0)),
    ('vmovshdup ymm ymm', ('1*p5', 1)),
    ('vmovshdup mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movsldup
    ('movsldup xmm xmm', ('1*p5', 1)),
    ('movsldup mem xmm', ('', 0)),
    ('vmovsldup xmm xmm', ('1*p5', 1)),
    ('vmovsldup mem xmm', ('', 0)),
    ('vmovsldup ymm ymm', ('1*p5', 1)),
    ('vmovsldup mem ymm', ('', 0)),
    # https://www.felixcloutier.com/x86/movss
    ('movss xmm xmm', ('1*p5', 1)),
    ('movss mem xmm', ('', 0)),
    ('vmovss xmm xmm xmm', ('1*p5', 1)),
    ('vmovss mem xmm', ('', 0)),
    ('vmovss xmm xmm', ('1*p5', 1)),
    ('vmovss xmm mem', ('', 0)),
    # NOTE(review): repeats 'movss mem xmm' from above; possibly meant
    # 'movss xmm mem' -- confirm
    ('movss mem xmm', ('', 0)),
    # https://www.felixcloutier.com/x86/movsx:movsxd
    ('movsx gpr gpr', ('1*p015', 1)),
    ('movsx mem gpr', ('', 0)),
    ('movsxd gpr gpr', ('', 0)),
    ('movsxd mem gpr', ('', 0)),
    ('movsb gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movsb mem gpr', ('', 0)),  # AT&T version
    ('movsw gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movsw mem gpr', ('', 0)),  # AT&T version
    ('movsl gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movsl mem gpr', ('', 0)),  # AT&T version
    ('movsq gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movsq mem gpr', ('', 0)),  # AT&T version
    # https://www.felixcloutier.com/x86/movupd
    ('movupd xmm xmm', ('1*p5', 1)),
    ('movupd mem xmm', ('', 0)),
    ('movupd xmm mem', ('', 0)),
    ('vmovupd xmm xmm', ('1*p5', 1)),
    ('vmovupd mem xmm', ('', 0)),
    ('vmovupd xmm mem', ('', 0)),
    ('vmovupd ymm ymm', ('1*p5', 1)),
    ('vmovupd mem ymm', ('', 0)),
    ('vmovupd ymm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movups
    ('movups xmm xmm', ('1*p5', 1)),
    ('movups mem xmm', ('', 0)),
    ('movups xmm mem', ('', 0)),
    ('vmovups xmm xmm', ('1*p5', 1)),
    ('vmovups mem xmm', ('', 0)),
    ('vmovups xmm mem', ('', 0)),
    ('vmovups ymm ymm', ('1*p5', 1)),
    ('vmovups mem ymm', ('', 0)),
    ('vmovups ymm mem', ('', 0)),
    # https://www.felixcloutier.com/x86/movzx
    ('movzx gpr gpr', ('1*p015', 1)),
    ('movzx mem gpr', ('', 0)),
    ('movzb gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movzb mem gpr', ('', 0)),  # AT&T version
    ('movzw gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movzw mem gpr', ('', 0)),  # AT&T version
    ('movzl gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movzl mem gpr', ('', 0)),  # AT&T version
    ('movzq gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movzq mem gpr', ('', 0)),  # AT&T version
    # https://www.felixcloutier.com/x86/cmovcc
    ('cmova gpr gpr', ('1*p015+2*p05', 2)),
    ('cmova mem gpr', ('1*p015+2*p05', 2)),
    ('cmovae gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovae mem gpr', ('1*p015+2*p05', 2)),
    ('cmovb gpr gpr', ('1*p015+2*p05', 2)),
    ('cmovb mem gpr', ('1*p015+1*p05', 2)),
    ('cmovbe gpr gpr', ('1*p015+2*p05', 2)),
    ('cmovbe mem gpr', ('1*p015+2*p05', 2)),
    ('cmovc gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovc mem gpr', ('1*p015+1*p05', 2)),
    ('cmove gpr gpr', ('1*p015+1*p05', 2)),
    ('cmove mem gpr', ('1*p015+1*p05', 2)),
    ('cmovg gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovg mem gpr', ('1*p015+1*p05', 2)),
    ('cmovge gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovge mem gpr', ('1*p015+1*p05', 2)),
    ('cmovl gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovl mem gpr', ('1*p015+1*p05', 2)),
    ('cmovle gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovle mem gpr', ('1*p015+1*p05', 2)),
    ('cmovna gpr gpr', ('1*p015+2*p05', 2)),
    ('cmovna mem gpr', ('1*p015+2*p05', 2)),
    ('cmovnae gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnae mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnb gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnb mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnbe gpr gpr', ('1*p015+2*p05', 2)),
    ('cmovnbe mem gpr', ('1*p015+2*p05', 2)),
    ('cmovnb gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnb mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnc gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnc mem gpr', ('1*p015+1*p05', 2)),
    ('cmovne gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovne mem gpr', ('1*p015+1*p05', 2)),
    ('cmovng gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovng mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnge gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnge mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnl gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnl mem gpr', ('1*p015+1*p05', 2)),
    ('cmovno gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovno mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnp gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnp mem gpr', ('1*p015+1*p05', 2)),
    ('cmovns gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovns mem gpr', ('1*p015+1*p05', 2)),
    ('cmovnz gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovnz mem gpr', ('1*p015+1*p05', 2)),
    ('cmovo gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovo mem gpr', ('1*p015+1*p05', 2)),
    ('cmovp gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovp mem gpr', ('1*p015+1*p05', 2)),
    ('cmovpe gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovpe mem gpr', ('1*p015+1*p05', 2)),
    ('cmovpo gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovpo mem gpr', ('1*p015+1*p05', 2)),
    ('cmovs gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovs mem gpr', ('1*p015+1*p05', 2)),
    ('cmovz gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovz mem gpr', ('1*p015+1*p05', 2)),
    # https://www.felixcloutier.com/x86/pmovmskb
    ('pmovmskb mm gpr', ('1*p0', 2)),
    ('pmovmskb xmm gpr', ('1*p0', 2)),
    ('vpmovmskb xmm gpr', ('1*p0', 2)),
    # https://www.felixcloutier.com/x86/pmovsx
    ('pmovsxbw xmm xmm', ('1*p15', 1)),
    ('pmovsxbw mem xmm', ('1*p15', 1)),
    ('pmovsxbd xmm xmm', ('1*p15', 1)),
    ('pmovsxbd mem xmm', ('1*p15', 1)),
    ('pmovsxbq xmm xmm', ('1*p15', 1)),
    ('pmovsxbq mem xmm', ('1*p15', 1)),
    ('vpmovsxbw xmm xmm', ('1*p15', 1)),
    ('vpmovsxbw mem xmm', ('1*p15', 1)),
    ('vpmovsxbd xmm xmm', ('1*p15', 1)),
    ('vpmovsxbd mem xmm', ('1*p15', 1)),
    ('vpmovsxbq xmm xmm', ('1*p15', 1)),
    ('vpmovsxbq mem xmm', ('1*p15', 1)),
    ('vpmovsxbw ymm ymm', ('1*p15', 1)),
    ('vpmovsxbw mem ymm', ('1*p15', 1)),
    ('vpmovsxbd ymm ymm', ('1*p15', 1)),
    ('vpmovsxbd mem ymm', ('1*p15', 1)),
    ('vpmovsxbq ymm ymm', ('1*p15', 1)),
    ('vpmovsxbq mem ymm', ('1*p15', 1)),
    # https://www.felixcloutier.com/x86/pmovzx
    ('pmovzxbw xmm xmm', ('1*p15', 1)),
    ('pmovzxbw mem xmm', ('1*p15', 1)),
    ('vpmovzxbw xmm xmm', ('1*p15', 1)),
    ('vpmovzxbw mem xmm', ('1*p15', 1)),
    ('vpmovzxbw ymm ymm', ('1*p15', 1)),
    ('vpmovzxbw mem ymm', ('1*p15', 1)),
]
|
||||
|
||||
# IVB table: the SNB table followed by IVB-specific overrides. Passing the
# concatenated pairs through OrderedDict makes a later entry replace the value of
# an equal instruction form inherited from SNB, while the form keeps the position
# of its first occurrence. All overrides here are register-to-register forms set
# to no port pressure and latency 0 (presumably modelling move elimination --
# TODO confirm).
ivb_mov_instructions = list(
    OrderedDict(
        snb_mov_instructions
        + [
            # https://www.felixcloutier.com/x86/mov
            ('mov gpr gpr', ('', 0)),
            ('mov imd gpr', ('', 0)),
            # https://www.felixcloutier.com/x86/movapd
            ('movapd xmm xmm', ('', 0)),
            ('vmovapd xmm xmm', ('', 0)),
            ('vmovapd ymm ymm', ('', 0)),
            # https://www.felixcloutier.com/x86/movaps
            ('movaps xmm xmm', ('', 0)),
            ('vmovaps xmm xmm', ('', 0)),
            ('vmovaps ymm ymm', ('', 0)),
            # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
            ('movdqa xmm xmm', ('', 0)),
            ('vmovdqa xmm xmm', ('', 0)),
            ('vmovdqa ymm ymm', ('', 0)),
            # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
            ('movdqu xmm xmm', ('', 0)),
            ('vmovdqu xmm xmm', ('', 0)),
            ('vmovdqu ymm ymm', ('', 0)),
            # https://www.felixcloutier.com/x86/movupd
            ('movupd xmm xmm', ('', 0)),
            ('vmovupd xmm xmm', ('', 0)),
            ('vmovupd ymm ymm', ('', 0)),
            # https://www.felixcloutier.com/x86/movups
            ('movups xmm xmm', ('', 0)),
            ('vmovups xmm xmm', ('', 0)),
            ('vmovups ymm ymm', ('', 0)),
        ]
    ).items()
)
|
||||
|
||||
# HSW table: the IVB table followed by HSW-specific overrides and additions.
# Passing the concatenated pairs through OrderedDict makes a later entry replace
# the value of an equal instruction form inherited from IVB (keeping its original
# position); forms not seen before (e.g. movbe, 'vpmovmskb ymm gpr') are appended.
hsw_mov_instructions = list(
    OrderedDict(
        ivb_mov_instructions
        + [
            # https://www.felixcloutier.com/x86/mov
            ('mov imd gpr', ('1*p0156', 1)),
            ('mov gpr gpr', ('1*p0156', 1)),
            ('movabs imd gpr', ('1*p0156', 1)),  # AT&T version
            # https://www.felixcloutier.com/x86/movbe
            ('movbe gpr mem', ('1*p15', 6)),
            ('movbe mem gpr', ('1*p15', 6)),
            # https://www.felixcloutier.com/x86/movmskpd
            ('movmskpd xmm gpr', ('1*p0', 3)),
            ('vmovmskpd xmm gpr', ('1*p0', 3)),
            ('vmovmskpd ymm gpr', ('1*p0', 3)),
            # https://www.felixcloutier.com/x86/movmskps
            ('movmskps xmm gpr', ('1*p0', 3)),
            ('vmovmskps xmm gpr', ('1*p0', 3)),
            ('vmovmskps ymm gpr', ('1*p0', 3)),
            # https://www.felixcloutier.com/x86/movsx:movsxd
            ('movsx gpr gpr', ('1*p0156', 1)),
            ('movsb gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movsw gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movsl gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movsq gpr gpr', ('1*p0156', 1)),  # AT&T version
            # https://www.felixcloutier.com/x86/movzx
            ('movzx gpr gpr', ('1*p0156', 1)),
            ('movzb gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movzw gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movzl gpr gpr', ('1*p0156', 1)),  # AT&T version
            ('movzq gpr gpr', ('1*p0156', 1)),  # AT&T version
            # https://www.felixcloutier.com/x86/cmovcc
            ('cmova gpr gpr', ('1*p0156+2*p06', 2)),
            ('cmova mem gpr', ('1*p0156+2*p06', 2)),
            ('cmovae gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovae mem gpr', ('1*p0156+2*p06', 2)),
            ('cmovb gpr gpr', ('1*p0156+2*p06', 2)),
            ('cmovb mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovbe gpr gpr', ('1*p0156+2*p06', 2)),
            ('cmovbe mem gpr', ('1*p0156+2*p06', 2)),
            ('cmovc gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovc mem gpr', ('1*p0156+1*p06', 2)),
            ('cmove gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmove mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovg gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovg mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovge gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovge mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovl gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovl mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovle gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovle mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovna gpr gpr', ('1*p0156+2*p06', 2)),
            ('cmovna mem gpr', ('1*p0156+2*p06', 2)),
            ('cmovnae gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnae mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnb gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnb mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnbe gpr gpr', ('1*p0156+2*p06', 2)),
            ('cmovnbe mem gpr', ('1*p0156+2*p06', 2)),
            ('cmovnb gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnb mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnc gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnc mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovne gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovne mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovng gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovng mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnge gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnge mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnl gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnl mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovno gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovno mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnp gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnp mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovns gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovns mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovnz gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovnz mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovo gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovo mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovp gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovp mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovpe gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovpe mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovpo gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovpo mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovs gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovs mem gpr', ('1*p0156+1*p06', 2)),
            ('cmovz gpr gpr', ('1*p0156+1*p06', 2)),
            ('cmovz mem gpr', ('1*p0156+1*p06', 2)),
            # https://www.felixcloutier.com/x86/pmovmskb
            ('pmovmskb mm gpr', ('1*p0', 3)),
            ('pmovmskb xmm gpr', ('1*p0', 3)),
            ('vpmovmskb xmm gpr', ('1*p0', 3)),
            ('vpmovmskb ymm gpr', ('1*p0', 3)),
            # https://www.felixcloutier.com/x86/pmovsx
            ('pmovsxbw xmm xmm', ('1*p5', 1)),
            ('pmovsxbw mem xmm', ('1*p5', 1)),
            ('pmovsxbd xmm xmm', ('1*p5', 1)),
            ('pmovsxbd mem xmm', ('1*p5', 1)),
            ('pmovsxbq xmm xmm', ('1*p5', 1)),
            ('pmovsxbq mem xmm', ('1*p5', 1)),
            ('vpmovsxbw xmm xmm', ('1*p5', 1)),
            ('vpmovsxbw mem xmm', ('1*p5', 1)),
            ('vpmovsxbd xmm xmm', ('1*p5', 1)),
            ('vpmovsxbd mem xmm', ('1*p5', 1)),
            ('vpmovsxbq xmm xmm', ('1*p5', 1)),
            ('vpmovsxbq mem xmm', ('1*p5', 1)),
            ('vpmovsxbw ymm ymm', ('1*p5', 1)),
            ('vpmovsxbw mem ymm', ('1*p5', 1)),
            ('vpmovsxbd ymm ymm', ('1*p5', 1)),
            ('vpmovsxbd mem ymm', ('1*p5', 1)),
            ('vpmovsxbq ymm ymm', ('1*p5', 1)),
            ('vpmovsxbq mem ymm', ('1*p5', 1)),
            # https://www.felixcloutier.com/x86/pmovzx
            ('pmovzxbw xmm xmm', ('1*p5', 1)),
            ('pmovzxbw mem xmm', ('1*p5', 1)),
            ('vpmovzxbw xmm xmm', ('1*p5', 1)),
            ('vpmovzxbw mem xmm', ('1*p5', 1)),
            ('vpmovzxbw ymm ymm', ('1*p5', 1)),
            ('vpmovzxbw mem ymm', ('1*p5', 1)),
        ]
    ).items()
)
|
||||
|
||||
# BDW table: the HSW table with every cmovcc form overridden. Passing the
# concatenated pairs through OrderedDict makes the later (BDW) value replace the
# HSW value of the same instruction form while the form keeps its position.
bdw_mov_instructions = list(
    OrderedDict(
        hsw_mov_instructions
        + [
            # https://www.felixcloutier.com/x86/cmovcc
            ('cmova gpr gpr', ('2*p06', 1)),
            ('cmova mem gpr', ('2*p06', 1)),
            ('cmovae gpr gpr', ('1*p06', 1)),
            ('cmovae mem gpr', ('2*p06', 1)),
            ('cmovb gpr gpr', ('2*p06', 1)),
            ('cmovb mem gpr', ('1*p06', 1)),
            ('cmovbe gpr gpr', ('2*p06', 1)),
            ('cmovbe mem gpr', ('2*p06', 1)),
            ('cmovc gpr gpr', ('1*p06', 1)),
            ('cmovc mem gpr', ('1*p06', 1)),
            ('cmove gpr gpr', ('1*p06', 1)),
            ('cmove mem gpr', ('1*p06', 1)),
            ('cmovg gpr gpr', ('1*p06', 1)),
            ('cmovg mem gpr', ('1*p06', 1)),
            ('cmovge gpr gpr', ('1*p06', 1)),
            ('cmovge mem gpr', ('1*p06', 1)),
            ('cmovl gpr gpr', ('1*p06', 1)),
            ('cmovl mem gpr', ('1*p06', 1)),
            ('cmovle gpr gpr', ('1*p06', 1)),
            ('cmovle mem gpr', ('1*p06', 1)),
            ('cmovna gpr gpr', ('2*p06', 1)),
            ('cmovna mem gpr', ('2*p06', 1)),
            ('cmovnae gpr gpr', ('1*p06', 1)),
            ('cmovnae mem gpr', ('1*p06', 1)),
            ('cmovnb gpr gpr', ('1*p06', 1)),
            ('cmovnb mem gpr', ('1*p06', 1)),
            ('cmovnbe gpr gpr', ('2*p06', 1)),
            ('cmovnbe mem gpr', ('2*p06', 1)),
            ('cmovnb gpr gpr', ('1*p06', 1)),
            ('cmovnb mem gpr', ('1*p06', 1)),
            ('cmovnc gpr gpr', ('1*p06', 1)),
            ('cmovnc mem gpr', ('1*p06', 1)),
            ('cmovne gpr gpr', ('1*p06', 1)),
            ('cmovne mem gpr', ('1*p06', 1)),
            ('cmovng gpr gpr', ('1*p06', 1)),
            ('cmovng mem gpr', ('1*p06', 1)),
            ('cmovnge gpr gpr', ('1*p06', 1)),
            ('cmovnge mem gpr', ('1*p06', 1)),
            ('cmovnl gpr gpr', ('1*p06', 1)),
            ('cmovnl mem gpr', ('1*p06', 1)),
            ('cmovno gpr gpr', ('1*p06', 1)),
            ('cmovno mem gpr', ('1*p06', 1)),
            ('cmovnp gpr gpr', ('1*p06', 1)),
            ('cmovnp mem gpr', ('1*p06', 1)),
            ('cmovns gpr gpr', ('1*p06', 1)),
            ('cmovns mem gpr', ('1*p06', 1)),
            ('cmovnz gpr gpr', ('1*p06', 1)),
            ('cmovnz mem gpr', ('1*p06', 1)),
            ('cmovo gpr gpr', ('1*p06', 1)),
            ('cmovo mem gpr', ('1*p06', 1)),
            ('cmovp gpr gpr', ('1*p06', 1)),
            ('cmovp mem gpr', ('1*p06', 1)),
            ('cmovpe gpr gpr', ('1*p06', 1)),
            ('cmovpe mem gpr', ('1*p06', 1)),
            ('cmovpo gpr gpr', ('1*p06', 1)),
            ('cmovpo mem gpr', ('1*p06', 1)),
            ('cmovs gpr gpr', ('1*p06', 1)),
            ('cmovs mem gpr', ('1*p06', 1)),
            ('cmovz gpr gpr', ('1*p06', 1)),
            ('cmovz mem gpr', ('1*p06', 1)),
        ]
    ).items()
)
|
||||
|
||||
# SKX table: the BDW table followed by SKX-specific overrides and zmm additions.
# Passing the concatenated pairs through OrderedDict makes a later entry replace
# the value of an equal instruction form inherited from BDW (keeping its original
# position); new forms are appended. Masked variants are not covered yet (see the
# TODO markers below).
skx_mov_instructions = list(
    OrderedDict(
        bdw_mov_instructions
        + [
            # https://www.felixcloutier.com/x86/movapd
            # TODO with masking!
            # TODO the following may eliminate or be bound to 1*p0156:
            # ('movapd xmm xmm', ('1*p5', 1)),
            # ('vmovapd xmm xmm', ('1*p5', 1)),
            # ('vmovapd ymm ymm', ('1*p5', 1)),
            # https://www.felixcloutier.com/x86/movaps
            # TODO with masking!
            # TODO the following may eliminate or be bound to 1*p0156:
            # ('movaps xmm xmm', ('1*p5', 1)),
            # ('vmovaps xmm xmm', ('1*p5', 1)),
            # ('vmovaps ymm ymm', ('1*p5', 1)),
            # https://www.felixcloutier.com/x86/movbe
            ('movbe gpr mem', ('1*p15', 4)),
            ('movbe mem gpr', ('1*p15', 4)),
            # https://www.felixcloutier.com/x86/movddup
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movntdq
            ('vmovntdq zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
            # https://www.felixcloutier.com/x86/movntdqa
            ('vmovntdqa mem zmm', ('', 0)),
            # https://www.felixcloutier.com/x86/movntpd
            ('vmovntpd zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
            # https://www.felixcloutier.com/x86/movntps
            ('vmovntps zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
            # https://www.felixcloutier.com/x86/movq2dq
            ('movq2dq mm xmm', ('1*p0+1*p015', 1)),
            # https://www.felixcloutier.com/x86/movsd
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movshdup
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movsldup
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movss
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movupd
            # TODO with masking!
            # https://www.felixcloutier.com/x86/movups
            # TODO with masking!
            # https://www.felixcloutier.com/x86/pmovsx
            # TODO with masking!
            ('vpmovsxbw ymm zmm', ('1*p5', 3)),
            ('vpmovsxbw mem zmm', ('1*p5', 1)),
        ]
    ).items()
)
|
||||
|
||||
# CSX table: currently identical to SKX (the override list is empty). It is
# materialised as a list like the other tables so that it can be indexed and
# concatenated with further overrides, which an ``odict_items`` view cannot.
csx_mov_instructions = list(OrderedDict(skx_mov_instructions + []).items())
|
||||
|
||||
|
||||
def get_description(arch, rhs_comment=None):
    """
    Render all mov-like instruction entries of one architecture as a single string.

    Only the requested architecture is rendered; previously all six tables were
    rendered on every call and five results were thrown away.

    :param arch: one of 'snb', 'ivb', 'hsw', 'bdw', 'skx', 'csx'
    :param rhs_comment: if not None, every output line is left-aligned to the width
        of the longest line and suffixed with `` # <rhs_comment>``
    :returns: the entries joined by newlines; with ``rhs_comment`` every line,
        including the last one, ends with a newline
    :raises KeyError: if ``arch`` is not a known architecture
    """
    tables = {
        'snb': (np7, snb_mov_instructions),
        'ivb': (np7, ivb_mov_instructions),
        'hsw': (p7, hsw_mov_instructions),
        'bdw': (p7, bdw_mov_instructions),
        'skx': (p7, skx_mov_instructions),
        'csx': (p7, csx_mov_instructions),
    }

    # An unknown architecture raises KeyError here; the command-line entry point
    # relies on that exception type.
    builder, instructions = tables[arch]
    description = '\n'.join(builder.process_item(*item) for item in instructions)

    if rhs_comment is not None:
        lines = description.split('\n')
        width = max(len(line) for line in lines)
        # The comment is passed as a format argument, so braces in it are harmless.
        description = ''.join(
            '{:<{}} # {}\n'.format(line, width, rhs_comment) for line in lines
        )

    return description
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys

    # Diagnostics go to stderr so they cannot end up in redirected output, and a
    # wrong invocation is reported with a non-zero exit status.
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: {} (snb|ivb|hsw|bdw|skx|csx)\n".format(sys.argv[0]))
        sys.exit(1)

    try:
        # The full command line is attached to every line as a right-hand comment.
        print(get_description(sys.argv[1], rhs_comment=' '.join(sys.argv)))
    except KeyError:
        # NOTE(review): a KeyError raised while rendering an entry would also be
        # reported as an unknown architecture.
        sys.stderr.write("Unknown architecture.\n")
        sys.exit(1)
|
||||
15568
osaca/data/hsw.yml
Normal file
15568
osaca/data/hsw.yml
Normal file
File diff suppressed because it is too large
Load Diff
36318
osaca/data/icl.yml
Normal file
36318
osaca/data/icl.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,374 +1,162 @@
|
||||
osaca_version: 0.3.0
|
||||
osaca_version: 0.3.7
|
||||
isa: "AArch64"
|
||||
# Contains all operand-irregular instruction forms OSACA supports for AArch64.
|
||||
# Operand-regular for an AArch64 instruction form with N operands in the shape of
|
||||
# mnemonic op1 ... opN
|
||||
# means that op1 is the only destination operand and op2 to op(N) are source operands.
|
||||
instruction_forms:
|
||||
- name: "fmla"
|
||||
- name: fmla
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
prefix: "*"
|
||||
shape: "*"
|
||||
source: true
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
prefix: "*"
|
||||
shape: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
prefix: "*"
|
||||
shape: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- name: "fmla"
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
source: true
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
base: "*"
|
||||
offset: "*"
|
||||
index: "*"
|
||||
scale: "*"
|
||||
pre-indexed: "*"
|
||||
post-indexed: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
- name: [ldr, ldur]
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
base: "*"
|
||||
offset: "*"
|
||||
index: "*"
|
||||
scale: "*"
|
||||
pre-indexed: "*"
|
||||
post-indexed: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
- name: stp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: true
|
||||
destination: false
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
source: false
|
||||
destination: true
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: true
|
||||
post-indexed: false
|
||||
source: true
|
||||
destination: true
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
base: "*"
|
||||
offset: "*"
|
||||
index: "*"
|
||||
scale: "*"
|
||||
pre-indexed: "*"
|
||||
post-indexed: "*"
|
||||
source: false
|
||||
destination: true
|
||||
- name: "stp"
|
||||
destination: true
|
||||
- name: [str, stur]
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
base: "*"
|
||||
offset: "*"
|
||||
index: "*"
|
||||
scale: "*"
|
||||
pre-indexed: "*"
|
||||
post-indexed: "*"
|
||||
source: false
|
||||
destination: true
|
||||
- name: "stp"
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
- name: "stp"
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "immediate"
|
||||
imd: "int"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: True
|
||||
source: false
|
||||
destination: true
|
||||
- name: "stp"
|
||||
- name: cmn
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
- name: cmn
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
- class: "immediate"
|
||||
imd: "int"
|
||||
source: true
|
||||
destination: false
|
||||
- name: fcmp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
- class: "register"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- name: fcmp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
- class: "immediate"
|
||||
imd: "double"
|
||||
source: true
|
||||
destination: false
|
||||
- name: fcmp
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
prefix: "*"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: "x"
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "immediate"
|
||||
imd: "float"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
source: false
|
||||
destination: true
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
source: true
|
||||
destination: false
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: "x"
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
source: false
|
||||
destination: true
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
16787
osaca/data/ivb.yml
Normal file
16787
osaca/data/ivb.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,118 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import os.path
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from distutils.version import StrictVersion
|
||||
from itertools import groupby, product
|
||||
|
||||
from ruamel import yaml
|
||||
|
||||
from osaca.db_interface import add_entries_to_db
|
||||
from osaca.parser import ParserAArch64v81, ParserX86ATT
|
||||
from osaca.parser import get_parser
|
||||
from osaca.semantics import MachineModel
|
||||
|
||||
ARCH_DICT = {
|
||||
'vulcan': 'aarch64',
|
||||
'snb': 'x86',
|
||||
'ivb': 'x86',
|
||||
'hsw': 'x86',
|
||||
'bdw': 'x86',
|
||||
'skl': 'x86',
|
||||
'skx': 'x86',
|
||||
'csx': 'x86',
|
||||
}
|
||||
intel_archs = [
|
||||
'CON',
|
||||
'WOL',
|
||||
'NHM',
|
||||
'WSM',
|
||||
'SNB',
|
||||
'IVB',
|
||||
'HSW',
|
||||
'BDW',
|
||||
'SKL',
|
||||
'SKX',
|
||||
'KBL',
|
||||
'CFL',
|
||||
'CNL',
|
||||
'ICL',
|
||||
]
|
||||
amd_archs = ['ZEN1', 'ZEN+', 'ZEN2']
|
||||
|
||||
|
||||
def port_pressure_from_tag_attributes(attrib, arch, ports):
|
||||
# apply cycles for D ports
|
||||
data_port = re.compile(r'[0-9]D$')
|
||||
data_ports = [x[:-1] for x in filter(data_port.match, ports)]
|
||||
def port_pressure_from_tag_attributes(attrib):
|
||||
# '1*p015+1*p1+1*p23+1*p4+3*p5' ->
|
||||
# [[1, '015'], [1, '1'], [1, '23'], [1, '4'], [3, '5']]
|
||||
port_occupation = []
|
||||
for p in attrib['ports'].split('+'):
|
||||
cycles, ports = p.split('*')
|
||||
ports = ports.lstrip('p')
|
||||
ports = ports.lstrip('FP')
|
||||
port_occupation.append([int(cycles), ports])
|
||||
|
||||
# format attributes
|
||||
cycles = attrib['ports'].split('+')
|
||||
cycles = [c.split('*') for c in cycles]
|
||||
for i, c in enumerate(cycles):
|
||||
cycles[i][0] = int(c[0])
|
||||
if str(c[1]).startswith('p'):
|
||||
cycles[i][1] = [p for p in c[1][1:]]
|
||||
if data_ports and data_ports == cycles[i][1]:
|
||||
# uops for data ports
|
||||
cycles.append([c[0], [x + 'D' for x in data_ports]])
|
||||
cycles[i][0] = [
|
||||
cycles[i][0] / num for num in range(1, len(cycles[i][1]) + 1) for _ in range(num)
|
||||
]
|
||||
cycles = [list(product(c[0], c[1])) for c in cycles]
|
||||
all_options = []
|
||||
|
||||
# iterate over all combinations of all uop options
|
||||
for cycles_combs in cycles:
|
||||
options = []
|
||||
tmp_opt = []
|
||||
total = cycles_combs[0][0]
|
||||
# iterate over all combinations of each uop option
|
||||
for comb in cycles_combs:
|
||||
# add options until they reach the total num of uops
|
||||
tmp_opt.append(comb)
|
||||
if sum([c[0] for c in tmp_opt]) == total:
|
||||
# copy this option as one of several to the cycle option list
|
||||
options.append(tmp_opt.copy())
|
||||
tmp_opt = []
|
||||
if len(tmp_opt) != 0:
|
||||
raise ValueError('Cannot compute port pressure')
|
||||
options = [x for x, _ in groupby(options)]
|
||||
all_options.append(options)
|
||||
all_options = list(product(*all_options))
|
||||
|
||||
# find best scheduling
|
||||
port_pressure = {}
|
||||
for p in ports:
|
||||
port_pressure[p] = 0.0
|
||||
first = calculate_port_pressure(all_options[0])
|
||||
for key in first:
|
||||
port_pressure[key] = first[key]
|
||||
for option in all_options[1:]:
|
||||
tmp = calculate_port_pressure(option)
|
||||
if (max(list(tmp.values())) <= max(list(port_pressure.values()))) and (
|
||||
len(tmp) > len([x for x in port_pressure.values() if x != 0.0])
|
||||
):
|
||||
for k in port_pressure:
|
||||
port_pressure[k] = tmp[k] if k in tmp else 0.0
|
||||
|
||||
# check if calculation equals given throughput
|
||||
if abs(max(list(port_pressure.values())) - float(attrib['TP_ports'])) > 0.01:
|
||||
print('Contradicting TP value compared to port_pressure. Ignore port pressure.')
|
||||
for p in port_pressure:
|
||||
port_pressure[p] = 0.0
|
||||
return port_pressure
|
||||
|
||||
# Also consider DIV pipeline
|
||||
# Also consider div on DIV pipeline
|
||||
if 'div_cycles' in attrib:
|
||||
div_port = re.compile(r'[0-9]DV$')
|
||||
div_ports = [x for x in filter(div_port.match, ports)]
|
||||
for dp in div_ports:
|
||||
port_pressure[dp] += int(attrib['div_cycles']) / len(div_ports)
|
||||
return port_pressure
|
||||
port_occupation.append([int(attrib['div_cycles']), ['DIV']])
|
||||
|
||||
return port_occupation
|
||||
|
||||
|
||||
def calculate_port_pressure(pp_option):
    """Sum the cycles assigned to each port over all entries of one option.

    :param pp_option: iterable of options, each an iterable of ``(cycles, port)`` pairs
    :returns: dict mapping each port that occurs to its accumulated cycles
    """
    ports = {}
    for option in pp_option:
        for cycles, port in option:
            # Ports seen for the first time start from zero.
            ports[port] = ports.get(port, 0) + cycles
    return ports
|
||||
|
||||
|
||||
def extract_paramters(instruction_tag, arch):
|
||||
isa = ARCH_DICT[arch.lower()]
|
||||
parser = ParserX86ATT()
|
||||
if isa == 'aarch64':
|
||||
parser = ParserAArch64v81()
|
||||
elif isa == 'x86':
|
||||
parser = ParserX86ATT()
|
||||
def extract_paramters(instruction_tag, parser, isa):
|
||||
# Extract parameter components
|
||||
parameters = [] # used to store string representations
|
||||
parameter_tags = sorted(instruction_tag.findall("operand"), key=lambda p: int(p.attrib['idx']))
|
||||
@@ -129,10 +61,10 @@ def extract_paramters(instruction_tag, arch):
|
||||
parameters.append(parameter)
|
||||
elif p_type == 'mem':
|
||||
parameter['class'] = 'memory'
|
||||
parameter['base'] = 'gpr'
|
||||
parameter['offset'] = None
|
||||
parameter['index'] = None
|
||||
parameter['scale'] = 1
|
||||
parameter['base'] = "*"
|
||||
parameter['offset'] = "*"
|
||||
parameter['index'] = "*"
|
||||
parameter['scale'] = "*"
|
||||
parameters.append(parameter)
|
||||
elif p_type == 'reg':
|
||||
parameter['class'] = 'register'
|
||||
@@ -161,32 +93,39 @@ def extract_paramters(instruction_tag, arch):
|
||||
parameter['class'] = 'identifier'
|
||||
parameters.append(parameter)
|
||||
elif p_type == 'agen':
|
||||
# FIXME actually only address generation
|
||||
parameter['class'] = 'memory'
|
||||
parameter['base'] = 'gpr'
|
||||
parameter['offset'] = None
|
||||
parameter['index'] = None
|
||||
parameter['scale'] = 1
|
||||
parameters.append(parameter)
|
||||
parameter['base'] = "*"
|
||||
parameter['offset'] = "*"
|
||||
parameter['index'] = "*"
|
||||
parameter['scale'] = "*"
|
||||
parameters.append(parameter)
|
||||
else:
|
||||
raise ValueError("Unknown paramter type {}".format(parameter_tag.attrib))
|
||||
return parameters
|
||||
|
||||
|
||||
def extract_model(tree, arch):
|
||||
mm = MachineModel(arch.lower())
|
||||
ports = mm._data['ports']
|
||||
model_data = []
|
||||
def extract_model(tree, arch, skip_mem=True):
|
||||
try:
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
except Exception:
|
||||
print("Skipping...", file=sys.stderr)
|
||||
return None
|
||||
mm = MachineModel(isa=isa)
|
||||
parser = get_parser(isa)
|
||||
|
||||
for instruction_tag in tree.findall('.//instruction'):
|
||||
ignore = False
|
||||
|
||||
mnemonic = instruction_tag.attrib['asm']
|
||||
iform = instruction_tag.attrib['iform']
|
||||
# skip any mnemonic which contain spaces (e.g., "REX CRC32")
|
||||
if ' ' in mnemonic:
|
||||
continue
|
||||
|
||||
# Extract parameter components
|
||||
try:
|
||||
parameters = extract_paramters(instruction_tag, arch)
|
||||
if ARCH_DICT[arch.lower()] == 'x86':
|
||||
parameters = extract_paramters(instruction_tag, parser, isa)
|
||||
if isa == 'x86':
|
||||
parameters.reverse()
|
||||
except ValueError as e:
|
||||
print(e, file=sys.stderr)
|
||||
@@ -196,6 +135,26 @@ def extract_model(tree, arch):
|
||||
arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
|
||||
if arch_tag is None:
|
||||
continue
|
||||
# skip any instructions without port utilization
|
||||
if not any(['ports' in x.attrib for x in arch_tag.findall('measurement')]):
|
||||
print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
|
||||
continue
|
||||
# skip if computed and measured TP don't match
|
||||
if not [x.attrib['TP_ports'] == x.attrib['TP'] for x in arch_tag.findall('measurement')][
|
||||
0
|
||||
]:
|
||||
print(
|
||||
"Calculated TP from port utilization doesn't match TP, skip: ",
|
||||
iform,
|
||||
file=sys.stderr,
|
||||
)
|
||||
continue
|
||||
# skip if instruction contains memory operand
|
||||
if skip_mem and any(
|
||||
[x.attrib['type'] == 'mem' for x in instruction_tag.findall('operand')]
|
||||
):
|
||||
print("Contains memory operand, skip: ", iform, file=sys.stderr)
|
||||
continue
|
||||
# We collect all measurement and IACA information and compare them later
|
||||
for measurement_tag in arch_tag.iter('measurement'):
|
||||
if 'TP_ports' in measurement_tag.attrib:
|
||||
@@ -208,9 +167,7 @@ def extract_model(tree, arch):
|
||||
int(measurement_tag.attrib['uops']) if 'uops' in measurement_tag.attrib else None
|
||||
)
|
||||
if 'ports' in measurement_tag.attrib:
|
||||
port_pressure.append(
|
||||
port_pressure_from_tag_attributes(measurement_tag.attrib, arch, ports)
|
||||
)
|
||||
port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
|
||||
latencies = [
|
||||
int(l_tag.attrib['cycles'])
|
||||
for l_tag in measurement_tag.iter('latency')
|
||||
@@ -223,46 +180,74 @@ def extract_model(tree, arch):
|
||||
if 'max_cycles' in l_tag.attrib
|
||||
]
|
||||
if latencies[1:] != latencies[:-1]:
|
||||
print("Contradicting latencies found:", mnemonic, file=sys.stderr)
|
||||
ignore = True
|
||||
elif latencies:
|
||||
latency = latencies[0]
|
||||
print(
|
||||
"Contradicting latencies found, using smallest:",
|
||||
iform,
|
||||
latencies,
|
||||
file=sys.stderr,
|
||||
)
|
||||
if latencies:
|
||||
latency = min(latencies)
|
||||
if ignore:
|
||||
continue
|
||||
|
||||
# Ordered by IACA version (newest last)
|
||||
for iaca_tag in sorted(
|
||||
arch_tag.iter('IACA'), key=lambda i: StrictVersion(i.attrib['version'])
|
||||
):
|
||||
if 'ports' in iaca_tag.attrib:
|
||||
port_pressure.append(
|
||||
port_pressure_from_tag_attributes(iaca_tag.attrib, arch, ports)
|
||||
)
|
||||
if ignore:
|
||||
continue
|
||||
port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))
|
||||
|
||||
# Check if all are equal
|
||||
if port_pressure:
|
||||
if port_pressure[1:] != port_pressure[:-1]:
|
||||
print(
|
||||
"Contradicting port occupancies, using latest IACA:", mnemonic, file=sys.stderr
|
||||
)
|
||||
print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr)
|
||||
port_pressure = port_pressure[-1]
|
||||
throughput = max(list(port_pressure.values()) + [0.0])
|
||||
else:
|
||||
# print("No data available for this architecture:", mnemonic, file=sys.stderr)
|
||||
continue
|
||||
# ---------------------------------------------
|
||||
model_data.append(
|
||||
{
|
||||
'name': mnemonic,
|
||||
'operands': parameters,
|
||||
'uops': uops,
|
||||
'throughput': throughput,
|
||||
'latency': latency,
|
||||
'port_pressure': port_pressure,
|
||||
}
|
||||
)
|
||||
|
||||
return model_data
|
||||
# Adding Intel's 2D and 3D pipelines on Intel µarchs, without Ice Lake:
|
||||
if arch.upper() in intel_archs and not arch.upper() in ['ICL']:
|
||||
if any([p['class'] == 'memory' for p in parameters]):
|
||||
# We have a memory parameter, if ports 2 & 3 are present, also add 2D & 3D
|
||||
# TODO remove port7 on 'hsw' onward and split entries depending on addressing mode
|
||||
port_23 = False
|
||||
port_4 = False
|
||||
for i, pp in enumerate(port_pressure):
|
||||
if '2' in pp[1] and '3' in pp[1]:
|
||||
port_23 = True
|
||||
if '4' in pp[1]:
|
||||
port_4 = True
|
||||
# Add (X, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
|
||||
# X = 2 on SNB and IVB IFF used in combination with ymm register, otherwise X = 1
|
||||
if arch.upper() in ['SNB', 'IVB'] and \
|
||||
any([p['class'] == 'register' and p['name'] == 'ymm' for p in parameters]):
|
||||
data_port_throughput = 2
|
||||
else:
|
||||
data_port_throughput = 1
|
||||
if port_23 and not port_4:
|
||||
port_pressure.append((data_port_throughput, ['2D', '3D']))
|
||||
|
||||
# Add missing ports:
|
||||
for ports in [pp[1] for pp in port_pressure]:
|
||||
for p in ports:
|
||||
mm.add_port(p)
|
||||
|
||||
throughput = max(mm.average_port_pressure(port_pressure))
|
||||
|
||||
mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)
|
||||
# TODO eliminate entries which could be covered by automatic load / store expansion
|
||||
return mm
|
||||
|
||||
|
||||
def rhs_comment(uncommented_string, comment):
    """Append the same right-hand ``# comment`` to every line of a string.

    Each line is left-aligned to the length of the longest line, followed by
    `` # <comment>`` and a newline, so the comments form one column.

    :param uncommented_string: text to annotate; it is split on ``'\\n'``
    :param comment: text placed after the ``#`` on every line
    :returns: the annotated text; every line, including the last, ends with a newline
    """
    lines = uncommented_string.split('\n')
    max_length = max(len(line) for line in lines)
    return ''.join("{} # {}\n".format(line.ljust(max_length), comment) for line in lines)
|
||||
|
||||
|
||||
def architectures(tree):
|
||||
@@ -278,16 +263,37 @@ def main():
|
||||
help='architecture to extract, use IACA abbreviations (e.g., SNB). '
|
||||
'if not given, all will be extracted and saved to file in CWD.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--mem',
|
||||
dest='skip_mem',
|
||||
action='store_false',
|
||||
help='add instruction forms including memory addressing operands, which are '
|
||||
'skipped by default'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
basename = os.path.basename(__file__)
|
||||
|
||||
tree = ET.parse(args.xml)
|
||||
print('# Available architectures:', ', '.join(architectures(tree)))
|
||||
if args.arch:
|
||||
model_data = extract_model(tree, args.arch)
|
||||
print(yaml.dump(model_data, allow_unicode=True))
|
||||
print('# Chosen architecture: {}'.format(args.arch))
|
||||
model = extract_model(tree, args.arch, args.skip_mem)
|
||||
if model is not None:
|
||||
print(
|
||||
rhs_comment(
|
||||
model.dump(), "uops.info import"
|
||||
)
|
||||
)
|
||||
else:
|
||||
for arch in architectures(tree):
|
||||
model_data = extract_model(tree, arch)
|
||||
add_entries_to_db(arch, model_data)
|
||||
print(arch, end='')
|
||||
model = extract_model(tree, arch.lower(), args.skip_mem)
|
||||
if model:
|
||||
model_string = rhs_comment(model.dump(), basename + " " + arch)
|
||||
|
||||
with open('{}.yml'.format(arch.lower()), 'w') as f:
|
||||
f.write(model_string)
|
||||
print('.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
771
osaca/data/n1.yml
Normal file
771
osaca/data/n1.yml
Normal file
@@ -0,0 +1,771 @@
|
||||
osaca_version: 0.3.4
|
||||
micro_architecture: Arm Neoverse N1
|
||||
arch_code: n1
|
||||
isa: AArch64
|
||||
ROB_size: 128 # wikichip
|
||||
retired_uOps_per_cycle: 8 # wikichip
|
||||
scheduler_size: 120 # wikichip
|
||||
hidden_loads: false
|
||||
load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 5.0, q: 6.0, v: 5.0, z: 4.0}
|
||||
load_throughput:
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '67'], [1, '123']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '67'], [1, '123']]}
|
||||
load_throughput_default: [[1, '67']]
|
||||
store_throughput: []
|
||||
store_throughput_default: [[1, '56'], [1, '67']]
|
||||
ports: ['0', '1', '2', '3', '4', '4DV', '5', '6', '7']
|
||||
port_model_scheme: |
|
||||
+----------------------------------------------------------------------------+
|
||||
| 120 entries |
|
||||
+----------------------------------------------------------------------------+
|
||||
0 |BR 1 |IS0 2 |IS1 3 |IM0 4 |FP0 5 |FP1 6 |LDST 7 |LDST
|
||||
\/ \/ \/ \/ \/ \/ \/ \/
|
||||
+------+ +-----+ +-----+ +-----+ +--------+ +--------+ +-------+ +-------+
|
||||
|Branch| | INT | | INT | | INT | | FP ALU | | FP ALU | | AGU | | AGU |
|
||||
+------+ | ALU | | ALU | | ALU | +--------+ +--------+ +-------+ +-------+
|
||||
+-----+ +-----+ +-----+ +--------+ +--------+ +-------+ +-------+
|
||||
+-----+ +-----+ | FP MUL | | FP MUL | |LD DATA| |LD DATA|
|
||||
| ST | | INT | +--------+ +--------+ +-------+ +-------+
|
||||
| INT | | MUL | +--------+ +---------+
|
||||
+-----+ +-----+ | FP DIV | |SIMD SHFT|
|
||||
+-----+ +--------+ +---------+
|
||||
| INT | +--------+ +--------+
|
||||
| DIV | | FMA | | FMA |
|
||||
+-----+ +--------+ +--------+
|
||||
+-----+ +--------+ +--------+
|
||||
|SHIFT| | ST SIMD| | ST SIMD|
|
||||
+-----+ | DATA | | DATA |
|
||||
+-----+ +--------+ +--------+
|
||||
| ST |
|
||||
| INT |
|
||||
+-----+
|
||||
instruction_forms:
|
||||
- name: add
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '123']]
|
||||
- name: add
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '123']]
|
||||
- name: adds
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '132']]
|
||||
- name: b.ne
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 1.0
|
||||
latency: 0.0
|
||||
port_pressure: [[1, '0']]
|
||||
- name: b.gt
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 1.0
|
||||
latency: 0.0
|
||||
port_pressure: [[1, '0']]
|
||||
- name: bne
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 1.0
|
||||
latency: 0.0
|
||||
port_pressure: [[1, '0']]
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '123']]
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.3333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '123']]
|
||||
- name: dup
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fdiv
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: 128
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: 128
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: 128
|
||||
throughput: 6.0
|
||||
latency: 8.0 # 1*p4+6*p4DV
|
||||
port_pressure: [[1, '4'], [6, [4DV]]]
|
||||
- name: fdiv
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: 128
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: 128
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: 128
|
||||
throughput: 10.0
|
||||
latency: 12.0 # 1*p4+10*p4DV
|
||||
port_pressure: [[1, '4'], [10, [4DV]]]
|
||||
- name: fmla
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fmla
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fmov
|
||||
operands:
|
||||
- {class: register, prefix: s}
|
||||
- {class: immediate, imd: double}
|
||||
latency: ~ # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
throughput: 0.5
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 3.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 3.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
throughput: 0.5
|
||||
latency: 3.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: frecpe
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
throughput: 2.0
|
||||
latency: 4.0 # 2*p4
|
||||
port_pressure: [[2, '4']]
|
||||
- name: frecpe
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 1.0
|
||||
latency: 3.0 # 1*p4
|
||||
port_pressure: [[1, '4']]
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 5.0 # 2*p67, from n1 opt guide
|
||||
port_pressure: [[2, '67']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 5.0 # 2*p67+1*p123, from n1 opt guide
|
||||
port_pressure: [[2, '67'], [1, '123']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 7.0 # 2*p67, from n1 opt guide
|
||||
port_pressure: [[2, '67']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 7.0 # 2*p67+1*p123, from n1 opt guide
|
||||
port_pressure: [[2, '67'], [1, '123']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 7.0 # 2*p67
|
||||
port_pressure: [[2, '67']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: true
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 7.0 # 2*p67+1*p123
|
||||
port_pressure: [[2, '67'], [1, '123']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 5.0 # 2*p67+1*p123
|
||||
port_pressure: [[2, '67'], [1, '123']]
|
||||
- name: ldur # JL: assumed from n1 opt guide
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p67
|
||||
port_pressure: [[1, '67']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p67
|
||||
port_pressure: [[1, '67']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p67
|
||||
port_pressure: [[1, '67']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p67
|
||||
port_pressure: [[1, '67']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p67
|
||||
port_pressure: [[1, '67']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: mov
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.25
|
||||
latency: 1.0 # 1*p3456
|
||||
port_pressure: [[1, '3456']]
|
||||
- name: mov
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: b
|
||||
width: '*'
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: b
|
||||
width: '*'
|
||||
throughput: 0.5
|
||||
latency: 2.0 # 1*p45
|
||||
port_pressure: [[1, '45']]
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 2*p45+1*p67
|
||||
port_pressure: [[2, '45'], [1, '67']]
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 2*p45+2*p67+1*123
|
||||
port_pressure: [[2, '45'], [2, '67'], [1, '123']]
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 2*p45+2*p67
|
||||
port_pressure: [[2, '45'], [2, '67']]
|
||||
- name: stur # JL: assumed from n1 opt guide
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 0 # 1*p67+1*p23
|
||||
port_pressure: [[1, '67'], [1, '23']]
|
||||
- name: stur # JL: assumed from n1 opt guide
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 2*p67+1*p45
|
||||
port_pressure: [[2, '67'], [1, '45']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 0 # 1*p67+1*p23
|
||||
port_pressure: [[1, '67'], [1, '23']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 0 # 1*p67+1*p45
|
||||
port_pressure: [[1, '67'], [1, '45']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 0.5
|
||||
latency: 0 # 1*p67+1*p45+1*p123
|
||||
port_pressure: [[1, '67'], [1, '45'], [1, '123']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 2*p67+1*p45
|
||||
port_pressure: [[2, '67'], [1, '45']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p67+1*p45+1*123
|
||||
port_pressure: [[1, '67'], [1, '45'], [1, '123']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p67+1*p23+1*p123
|
||||
port_pressure: [[1, '67'], [1, '23'], [1, '123']]
|
||||
- name: sub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p123
|
||||
port_pressure: [[1, '123']]
|
||||
120521
osaca/data/skx.yml
Normal file
120521
osaca/data/skx.yml
Normal file
File diff suppressed because it is too large
Load Diff
16369
osaca/data/snb.yml
Normal file
16369
osaca/data/snb.yml
Normal file
File diff suppressed because it is too large
Load Diff
757
osaca/data/tx2.yml
Normal file
757
osaca/data/tx2.yml
Normal file
@@ -0,0 +1,757 @@
|
||||
osaca_version: 0.3.4
|
||||
micro_architecture: Thunder X2
|
||||
arch_code: tx2
|
||||
isa: AArch64
|
||||
ROB_size: 180
|
||||
retired_uOps_per_cycle: 4
|
||||
scheduler_size: 60
|
||||
hidden_loads: false
|
||||
load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 4.0, q: 4.0, v: 4.0}
|
||||
load_throughput:
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '34']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '34']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '34']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '34']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [[1, '34'], [1, '012']]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [[1, '34'], [1, '012']]}
|
||||
load_throughput_default: [[1, '34']]
|
||||
store_throughput: []
|
||||
store_throughput_default: [[1, '34'], [1, '5']]
|
||||
ports: ['0', 0DV, '1', 1DV, '2', '3', '4', '5']
|
||||
port_model_scheme: |
|
||||
+-----------------------------------------------------------+
|
||||
| 60 entry unified scheduler |
|
||||
+-----------------------------------------------------------+
|
||||
0 | 1 | 2 | 3 | 4 | 5 |
|
||||
\/ \/ \/ \/ \/ \/
|
||||
+------+ +------+ +------+ +------+ +------+ +------+
|
||||
| ALU | | ALU | | ALU/ | | LD | | LD | | ST |
|
||||
+------+ +------+ | BR | +------+ +------+ +------+
|
||||
+------+ +------+ +------+ +------+ +------+
|
||||
| FP/ | | FP/ | | AGU | | AGU |
|
||||
| NEON | | NEON | +------+ +------+
|
||||
+------+ +------+
|
||||
+------+
|
||||
| INT |
|
||||
| MUL/ |
|
||||
| DIV |
|
||||
+------+
|
||||
+------+
|
||||
|CRYPTO|
|
||||
+------+
|
||||
instruction_forms:
|
||||
- name: add
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
- name: add
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
- name: adds
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
- name: b.ne
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: b.gt
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: bne
|
||||
operands:
|
||||
- class: identifier
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
- name: cmp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
- name: dup
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fadd
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fdiv
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 8.5
|
||||
latency: 16.0 # 1*p01+17*p0DV1DV
|
||||
port_pressure: [[1, '01'], [17.0, [0DV, 1DV]]]
|
||||
- name: fdiv
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 12.0
|
||||
latency: 23.0 # 1*p01+24*p0DV1DV
|
||||
port_pressure: [[1, '01'], [24.0, [0DV, 1DV]]]
|
||||
- name: fmla
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fmla
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fmov
|
||||
operands:
|
||||
- {class: register, prefix: s}
|
||||
- {class: immediate, imd: double}
|
||||
latency: ~ # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
throughput: 0.5
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fmul
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: frecpe
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: frecpe
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: s
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: fsub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: d
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34
|
||||
port_pressure: [[2.0, '34']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34+1*p012
|
||||
port_pressure: [[2.0, '34'], [1, '012']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34
|
||||
port_pressure: [[2.0, '34']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34+1*p012
|
||||
port_pressure: [[2.0, '34'], [1, '012']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34
|
||||
port_pressure: [[2.0, '34']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34
|
||||
port_pressure: [[2.0, '34']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: true
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34+1*p012
|
||||
port_pressure: [[2.0, '34'], [1, '012']]
|
||||
- name: ldp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 2*p34+1*p012
|
||||
port_pressure: [[2.0, '34'], [1, '012']]
|
||||
- name: ldur # JL: assumed from ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0 # 1*p34
|
||||
port_pressure: [[1.0, '34']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0 # 1*p34
|
||||
port_pressure: [[1.0, '34']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0 # 1*p34
|
||||
port_pressure: [[1.0, '34']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0 # 1*p34
|
||||
port_pressure: [[1.0, '34']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0 # 1*p34
|
||||
port_pressure: [[1.0, '34']]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
throughput: 0.0
|
||||
latency: 0.0
|
||||
port_pressure: []
|
||||
- name: mov
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: register
|
||||
prefix: x
|
||||
throughput: 0.5
|
||||
latency: 1.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: mov
|
||||
operands:
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: b
|
||||
- class: register
|
||||
prefix: v
|
||||
shape: b
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 1*p01
|
||||
port_pressure: [[1, '01']]
|
||||
- name: prfm
|
||||
operands:
|
||||
- class: prfop
|
||||
type: pld
|
||||
target: l1
|
||||
policy: keep
|
||||
- class: memory
|
||||
base: x
|
||||
offset: imd
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: ~
|
||||
latency: ~
|
||||
port_pressure: []
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: 0 # 2*p34+2*p5
|
||||
port_pressure: [[2.0, '34'], [2.0, '5']]
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 2.0
|
||||
latency: 0 # 2*p34+2*p5+1*012
|
||||
port_pressure: [[2.0, '34'], [2.0, '5'], [1, '012']]
|
||||
- name: stp
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: 0 # 2*p34+2*p5
|
||||
port_pressure: [[2.0, '34'], [2.0, '5']]
|
||||
- name: stur # JL: assumed from str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 1*p34+1*p5
|
||||
port_pressure: [[1.0, '34'], [1.0, '5']]
|
||||
- name: stur # JL: assumed from str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 1*p34+1*p5
|
||||
port_pressure: [[1.0, '34'], [1.0, '5']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5
|
||||
port_pressure: [[1.0, '34'], [1.0, '5']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5
|
||||
port_pressure: [[1.0, '34'], [1.0, '5']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: d
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5+1*p012
|
||||
port_pressure: [[1.0, '34'], [1.0, '5'], [1, '012']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5
|
||||
port_pressure: [[1.0, '34'], [1.0, '5']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: q
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5+1*p012
|
||||
port_pressure: [[1.0, '34'], [1.0, '5'], [1, '012']]
|
||||
- name: str
|
||||
operands:
|
||||
- class: register
|
||||
prefix: x
|
||||
- class: memory
|
||||
base: x
|
||||
offset: '*'
|
||||
index: '*'
|
||||
scale: '*'
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 0 # 1*p34+1*p5+1*p012
|
||||
port_pressure: [[1.0, '34'], [1.0, '5'], [1, '012']]
|
||||
- name: sub
|
||||
operands:
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: register
|
||||
prefix: w
|
||||
- class: immediate
|
||||
imd: int
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 1*p012
|
||||
port_pressure: [[1, '012']]
|
||||
@@ -1,698 +0,0 @@
|
||||
osaca_version: 0.3.0
|
||||
micro_architecture: "Cavium Vulcan"
|
||||
arch_code: "Vulcan"
|
||||
isa: "AArch64"
|
||||
ROB_size: 180
|
||||
retired_uOps_per_cycle: 4
|
||||
scheduler_size: 60
|
||||
hidden_loads: false
|
||||
load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 4.0, q: 4.0, v: 4.0}
|
||||
load_throughput:
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [0,0,0,0,0,0.5,0.5,0]}
|
||||
ports: ["0", "0DV", "1", "1DV", "2", "3", "4", "5"]
|
||||
port_model_scheme: |
|
||||
┌-----------------------------------------------------------┐
|
||||
| 60 entry unified scheduler |
|
||||
└-----------------------------------------------------------┘
|
||||
0 | 1 | 2 | 3 | 4 | 5 |
|
||||
▼ ▼ ▼ ▼ ▼ ▼
|
||||
┌------┐ ┌------┐ ┌------┐ ┌------┐ ┌------┐ ┌------┐
|
||||
| ALU | | ALU | | ALU/ | | LD | | LD | | ST |
|
||||
└------┘ └------┘ | BR | └------┘ └------┘ └------┘
|
||||
┌------┐ ┌------┐ └------┘ ┌------┐ ┌------┐
|
||||
| FP/ | | FP/ | | AGU | | AGU |
|
||||
| NEON | | NEON | └------┘ └------┘
|
||||
└------┘ └------┘
|
||||
┌------┐
|
||||
| INT |
|
||||
| MUL/ |
|
||||
| DIV |
|
||||
└------┘
|
||||
┌------┐
|
||||
|CRYPTO|
|
||||
└------┘
|
||||
instruction_forms:
|
||||
- name: "add"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.33333333, 0.0, 0.33333333, 0.0, 0.33333333, 0.0, 0.0, 0.0]
|
||||
- name: "add"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "immediate"
|
||||
imd: "int"
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.33333333, 0.0, 0.33333333, 0.0, 0.33333333, 0.0, 0.0, 0.0]
|
||||
- name: "adds"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "immediate"
|
||||
imd: "int"
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.33333333, 0.0, 0.33333333, 0.0, 0.33333333, 0.0, 0.0, 0.0]
|
||||
- name: "b.ne"
|
||||
operands:
|
||||
- class: 'identifier'
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0, 0, 0, 0, 0, 0, 0, 0]
|
||||
- name: "b.gt"
|
||||
operands:
|
||||
- class: 'identifier'
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0, 0, 0, 0, 0, 0, 0, 0]
|
||||
- name: "bne"
|
||||
operands:
|
||||
- class: 'identifier'
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0, 0, 0, 0, 0, 0, 0, 0]
|
||||
- name: "cmp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "w"
|
||||
- class: "immediate"
|
||||
imd: "int"
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.33333333, 0.0, 0.33333333, 0.0, 0.33333333, 0.0, 0.0, 0.0]
|
||||
- name: "cmp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
throughput: 0.33333333
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.33333333, 0.0, 0.33333333, 0.0, 0.33333333, 0.0, 0.0, 0.0]
|
||||
- name: "fadd"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fadd"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fadd"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fdiv"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
throughput: 8.5
|
||||
latency: 16.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [1.0, 8.5, 1.0, 8.5, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fdiv"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
throughput: 12.0
|
||||
latency: 23.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [1.0, 12.5, 1.0, 12.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fmla"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fmla"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- latency: ~
|
||||
name: "fmov"
|
||||
operands:
|
||||
- {class: "register", prefix: "s"}
|
||||
- {class: "immediate", imd: "double"}
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
throughput: 0.5
|
||||
- name: "fmul"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fmul"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fmul"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fsub"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "s"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "fsub"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "d"
|
||||
throughput: 0.5
|
||||
latency: 6.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: true
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "ldp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0]
|
||||
- name: ldr
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: "x"
|
||||
scale: 8
|
||||
post-indexed: false
|
||||
pre-indexed: false
|
||||
throughput: 0.5
|
||||
latency: 4.0
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0]
|
||||
- name: "ldr"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "ldr"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "ldr"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
throughput: 0.0
|
||||
latency: 0.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "mov"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
throughput: 0.5
|
||||
latency: 1.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "mov"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "b"
|
||||
- class: "register"
|
||||
prefix: "v"
|
||||
shape: "b"
|
||||
throughput: 0.5
|
||||
latency: 5.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
- name: "prfm"
|
||||
operands:
|
||||
- class: "prfop"
|
||||
type: "pld"
|
||||
target: "l1"
|
||||
policy: "keep"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: ~
|
||||
latency: ~
|
||||
port_pressure: ~
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0]
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0]
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 2.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0]
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0]
|
||||
- name: "stp"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 2.0
|
||||
latency: ~ # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: "imd"
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "d"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: "x"
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "q"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: true
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
- name: "str"
|
||||
operands:
|
||||
- class: "register"
|
||||
prefix: "x"
|
||||
- class: "memory"
|
||||
base: "x"
|
||||
offset: ~
|
||||
index: "x"
|
||||
scale: 1
|
||||
pre-indexed: false
|
||||
post-indexed: false
|
||||
throughput: 1.0
|
||||
latency: 4.0 # 0 0DV 1 1DV 2 3 4 5
|
||||
port_pressure: [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0]
|
||||
1169
osaca/data/zen1.yml
1169
osaca/data/zen1.yml
File diff suppressed because it is too large
Load Diff
8736
osaca/data/zen2.yml
Normal file
8736
osaca/data/zen2.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -2,94 +2,31 @@
|
||||
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from collections import OrderedDict
|
||||
|
||||
import ruamel.yaml
|
||||
|
||||
from osaca.semantics import MachineModel
|
||||
|
||||
|
||||
def add_entry_to_db(arch: str, entry):
|
||||
"""Adds entry to the user database in ~/.osaca/data
|
||||
|
||||
Args:
|
||||
arch: string representation of the architecture as abbreviation.
|
||||
Database for this architecture must already exist.
|
||||
entry: DB entry which will be added. Should consist at best out of
|
||||
'name', 'operand(s)' ('register', 'memory', 'immediate', 'identifier', ...),
|
||||
'throughput', 'latency', 'port_pressure'.
|
||||
def sanity_check(arch: str, verbose=False, internet_check=False, output_file=sys.stdout):
|
||||
"""
|
||||
# load yaml
|
||||
arch = arch.lower()
|
||||
filepath = os.path.join(os.path.expanduser('~/.osaca/data/' + arch + '.yml'))
|
||||
assert os.path.exists(filepath)
|
||||
yaml = _create_yaml_object()
|
||||
with open(filepath, 'r') as f:
|
||||
data = yaml.load(f)
|
||||
# check parameter of entry
|
||||
if 'name' not in entry:
|
||||
raise ValueError('No name for instruction specified. No import possible')
|
||||
if 'operands' not in entry:
|
||||
entry['operands'] = []
|
||||
if 'throughput' not in entry:
|
||||
entry['throughput'] = None
|
||||
if 'latency' not in entry:
|
||||
entry['latency'] = None
|
||||
if 'port_pressure' not in entry:
|
||||
entry['port_pressure'] = None
|
||||
if 'uops' not in entry:
|
||||
entry['uops'] = None
|
||||
data['instruction_forms'].append(entry)
|
||||
# __dump_data_to_yaml(filepath, data)
|
||||
with open(filepath, 'w') as f:
|
||||
yaml.dump(data, f)
|
||||
Checks the database for missing TP/LT values, instructions might missing int the ISA DB and
|
||||
duplicate instructions.
|
||||
|
||||
:param arch: micro-arch key to define DB to check
|
||||
:type arch: str
|
||||
:param verbose: verbose output flag, defaults to `False`
|
||||
:type verbose: bool, optional
|
||||
:param internet_check: indicates if OSACA should try to look up the src/dst distribution in the internet, defaults to False
|
||||
:type internet_check: boolean, optional
|
||||
:param output_file: output stream specifying where to write output, defaults to :class:`sys.stdout`
|
||||
:type output_file: stream, optional
|
||||
|
||||
def add_entries_to_db(arch: str, entries: list) -> None:
|
||||
"""Adds entries to the user database in ~/.osaca/data
|
||||
|
||||
Args:
|
||||
arch: string representation of the architecture as abbreviation.
|
||||
Database for this architecture must already exist.
|
||||
entries: :class:`list` of DB entries which will be added. Should consist at best out of
|
||||
'name', 'operand(s)' ('register', 'memory', 'immediate', 'identifier', ...),
|
||||
'throughput', 'latency', 'port_pressure', 'uops'.
|
||||
"""
|
||||
# load yaml
|
||||
arch = arch.lower()
|
||||
filepath = os.path.join(os.path.expanduser('~/.osaca/data/' + arch + '.yml'))
|
||||
assert os.path.exists(filepath)
|
||||
yaml = _create_yaml_object()
|
||||
with open(filepath, 'r') as f:
|
||||
data = yaml.load(f)
|
||||
# check parameter of entry and append it to list
|
||||
for entry in entries:
|
||||
if 'name' not in entry:
|
||||
print(
|
||||
'No name for instruction \n\t{}\nspecified. No import possible'.format(entry),
|
||||
file=sys.stderr,
|
||||
)
|
||||
# remove entry from list
|
||||
entries.remove(entry)
|
||||
continue
|
||||
if 'operands' not in entry:
|
||||
entry['operands'] = []
|
||||
if 'throughput' not in entry:
|
||||
entry['throughput'] = None
|
||||
if 'latency' not in entry:
|
||||
entry['latency'] = None
|
||||
if 'port_pressure' not in entry:
|
||||
entry['port_pressure'] = None
|
||||
if 'uops' not in entry:
|
||||
entry['uops'] = None
|
||||
data['instruction_forms'].append(entry)
|
||||
# __dump_data_to_yaml(filepath, data)
|
||||
with open(filepath, 'w') as f:
|
||||
yaml.dump(data, f)
|
||||
|
||||
|
||||
def sanity_check(arch: str, verbose=False):
|
||||
# load arch machine model
|
||||
arch_mm = MachineModel(arch=arch)
|
||||
data = arch_mm['instruction_forms']
|
||||
@@ -105,11 +42,11 @@ def sanity_check(arch: str, verbose=False):
|
||||
missing_port_pressure,
|
||||
suspicious_instructions,
|
||||
duplicate_instr_arch,
|
||||
) = _check_sanity_arch_db(arch_mm, isa_mm)
|
||||
) = _check_sanity_arch_db(arch_mm, isa_mm, internet_check=internet_check)
|
||||
# check ISA DB entries
|
||||
duplicate_instr_isa, only_in_isa = _check_sanity_isa_db(arch_mm, isa_mm)
|
||||
|
||||
_print_sanity_report(
|
||||
report = _get_sanity_report(
|
||||
num_of_instr,
|
||||
missing_throughput,
|
||||
missing_latency,
|
||||
@@ -119,10 +56,24 @@ def sanity_check(arch: str, verbose=False):
|
||||
duplicate_instr_isa,
|
||||
only_in_isa,
|
||||
verbose=verbose,
|
||||
colors=True if output_file == sys.stdout else False,
|
||||
)
|
||||
print(report, file=output_file)
|
||||
|
||||
|
||||
def import_benchmark_output(arch, bench_type, filepath):
|
||||
def import_benchmark_output(arch, bench_type, filepath, output=sys.stdout):
|
||||
"""
|
||||
Import benchmark results from micro-benchmarks.
|
||||
|
||||
:param arch: target architecture key
|
||||
:type arch: str
|
||||
:param bench_type: key for defining type of benchmark output
|
||||
:type bench_type: str
|
||||
:param filepath: filepath to the output file
|
||||
:type filepath: str
|
||||
:param output: output stream to dump, defaults to sys.stdout
|
||||
:type output: stream
|
||||
"""
|
||||
supported_bench_outputs = ['ibench', 'asmbench']
|
||||
assert os.path.exists(filepath)
|
||||
if bench_type not in supported_bench_outputs:
|
||||
@@ -130,12 +81,18 @@ def import_benchmark_output(arch, bench_type, filepath):
|
||||
with open(filepath, 'r') as f:
|
||||
input_data = f.readlines()
|
||||
db_entries = None
|
||||
mm = MachineModel(arch)
|
||||
if bench_type == 'ibench':
|
||||
db_entries = _get_ibench_output(input_data)
|
||||
db_entries = _get_ibench_output(input_data, mm.get_ISA())
|
||||
elif bench_type == 'asmbench':
|
||||
raise NotImplementedError
|
||||
db_entries = _get_asmbench_output(input_data, mm.get_ISA())
|
||||
# write entries to DB
|
||||
add_entries_to_db(arch, list(db_entries.values()))
|
||||
for entry in db_entries:
|
||||
mm.set_instruction_entry(db_entries[entry])
|
||||
if output is None:
|
||||
print(mm.dump())
|
||||
else:
|
||||
mm.dump(stream=output)
|
||||
|
||||
|
||||
##################
|
||||
@@ -143,7 +100,58 @@ def import_benchmark_output(arch, bench_type, filepath):
|
||||
##################
|
||||
|
||||
|
||||
def _get_ibench_output(input_data):
|
||||
def _get_asmbench_output(input_data, isa):
|
||||
"""
|
||||
Parse asmbench output in the format
|
||||
|
||||
1 MNEMONIC[-OP1[_OP2][...]]
|
||||
2 Latency: X cycles
|
||||
3 Throughput: Y cycles
|
||||
4
|
||||
|
||||
and creates per 4 lines in the input_data one entry in the database.
|
||||
|
||||
:param str input_data: content of asmbench output file
|
||||
:param str isa: ISA of target architecture (x86, AArch64, ...)
|
||||
: return: dictionary with all new db_entries
|
||||
"""
|
||||
db_entries = {}
|
||||
for i in range(0, len(input_data), 4):
|
||||
if input_data[i + 3].strip() != '':
|
||||
print('asmbench output not in the correct format! Format must be: ', file=sys.stderr)
|
||||
print(
|
||||
'-------------\nMNEMONIC[-OP1[_OP2][...]]\nLatency: X cycles\n'
|
||||
'Throughput: Y cycles\n\n-------------',
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(
|
||||
'Entry {} and all further entries won\'t be added.'.format((i / 4) + 1),
|
||||
file=sys.stderr,
|
||||
)
|
||||
break
|
||||
else:
|
||||
i_form = input_data[i].strip()
|
||||
mnemonic = i_form.split('-')[0]
|
||||
operands = i_form.split('-')[1].split('_')
|
||||
operands = [_create_db_operand(op, isa) for op in operands]
|
||||
entry = {
|
||||
'name': mnemonic,
|
||||
'operands': operands,
|
||||
'throughput': _validate_measurement(float(input_data[i + 2].split()[1]), 'tp'),
|
||||
'latency': _validate_measurement(float(input_data[i + 1].split()[1]), 'lt'),
|
||||
'port_pressure': None,
|
||||
}
|
||||
if not entry['throughput'] or not entry['latency']:
|
||||
warnings.warn(
|
||||
'Your measurement for {} looks suspicious'.format(i_form)
|
||||
+ ' and was not added. Please inspect your benchmark.'
|
||||
)
|
||||
db_entries[i_form] = entry
|
||||
return db_entries
|
||||
|
||||
|
||||
def _get_ibench_output(input_data, isa):
|
||||
"""Parse the standard output of ibench and add instructions to DB."""
|
||||
db_entries = {}
|
||||
for line in input_data:
|
||||
if 'Using frequency' in line or len(line) == 0:
|
||||
@@ -156,7 +164,7 @@ def _get_ibench_output(input_data):
|
||||
else:
|
||||
mnemonic = instruction.split('-')[0]
|
||||
operands = instruction.split('-')[1].split('_')
|
||||
operands = [_create_db_operand(op) for op in operands]
|
||||
operands = [_create_db_operand(op, isa) for op in operands]
|
||||
entry = {
|
||||
'name': mnemonic,
|
||||
'operands': operands,
|
||||
@@ -165,24 +173,29 @@ def _get_ibench_output(input_data):
|
||||
'port_pressure': None,
|
||||
}
|
||||
if 'TP' in instruction:
|
||||
entry['throughput'] = _validate_measurement(float(line.split()[1]), True)
|
||||
entry['throughput'] = _validate_measurement(float(line.split()[1]), 'tp')
|
||||
if not entry['throughput']:
|
||||
warnings.warn(
|
||||
'Your THROUGHPUT measurement for {} looks suspicious'.format(key)
|
||||
+ ' and was not added. Please inspect your benchmark.'
|
||||
)
|
||||
elif 'LT' in instruction:
|
||||
entry['latency'] = _validate_measurement(float(line.split()[1]), False)
|
||||
entry['latency'] = _validate_measurement(float(line.split()[1]), 'lt')
|
||||
if not entry['latency']:
|
||||
warnings.warn(
|
||||
'Your LATENCY measurement for {} looks suspicious'.format(key)
|
||||
+ ' and was not added. Please inspect your benchmark.'
|
||||
)
|
||||
db_entries[key] = entry
|
||||
return db_entries
|
||||
|
||||
|
||||
def _validate_measurement(self, measurement, is_tp):
|
||||
if not is_tp:
|
||||
def _validate_measurement(measurement, mode):
|
||||
"""
|
||||
Check if latency has a maximum deviation of 5% and throughput is a reciprocal of
|
||||
an integer number.
|
||||
"""
|
||||
if mode == 'lt':
|
||||
if (
|
||||
math.floor(measurement) * 1.05 >= measurement
|
||||
or math.ceil(measurement) * 0.95 <= measurement
|
||||
@@ -190,7 +203,7 @@ def _validate_measurement(self, measurement, is_tp):
|
||||
# Value is probably correct, so round it to the estimated value
|
||||
return float(round(measurement))
|
||||
# Check reciprocal only if it is a throughput value
|
||||
else:
|
||||
elif mode == 'tp':
|
||||
reciprocals = [1 / x for x in range(1, 11)]
|
||||
for reci in reciprocals:
|
||||
if reci * 0.95 <= measurement <= reci * 1.05:
|
||||
@@ -201,24 +214,30 @@ def _validate_measurement(self, measurement, is_tp):
|
||||
return None
|
||||
|
||||
|
||||
def _create_db_operand(self, operand):
|
||||
if self.isa == 'aarch64':
|
||||
return self._create_db_operand_aarch64(operand)
|
||||
elif self.isa == 'x86':
|
||||
return self._create_db_operand_x86(operand)
|
||||
def _create_db_operand(operand, isa):
    """
    Get DB operand by input string and ISA.

    :param operand: operand code, passed on unchanged to the ISA-specific helper
    :type operand: str
    :param isa: name of the ISA, 'aarch64' or 'x86' (compared case-insensitively)
    :type isa: str
    :returns: the DB operand created by the ISA-specific helper
    :raises ValueError: if `isa` is none of the supported ISAs
    """
    isa_key = str(isa).lower()
    if isa_key == 'aarch64':
        return _create_db_operand_aarch64(operand)
    if isa_key == 'x86':
        return _create_db_operand_x86(operand)
    # fail loudly instead of silently handing back None as an operand
    raise ValueError('ISA {} is not supported'.format(isa))
|
||||
|
||||
|
||||
def _create_db_operand_aarch64(self, operand):
|
||||
def _create_db_operand_aarch64(operand):
|
||||
"""Get DB operand for AArch64 by operand string."""
|
||||
if operand == 'i':
|
||||
return {'class': 'immediate', 'imd': 'int'}
|
||||
elif operand in 'wxbhsdq':
|
||||
return {'class': 'register', 'prefix': operand}
|
||||
elif operand.startswith('v'):
|
||||
return {'class': 'register', 'prefix': 'v', 'shape': operand[1:2]}
|
||||
return {
|
||||
'class': 'register',
|
||||
'prefix': 'v',
|
||||
'shape': operand[1:2] if operand[1:2] != '' else 'd',
|
||||
}
|
||||
elif operand.startswith('m'):
|
||||
return {
|
||||
'class': 'memory',
|
||||
'base': 'gpr' if 'b' in operand else None,
|
||||
'base': 'x' if 'b' in operand else None,
|
||||
'offset': 'imd' if 'o' in operand else None,
|
||||
'index': 'gpr' if 'i' in operand else None,
|
||||
'scale': 8 if 's' in operand else 1,
|
||||
@@ -229,7 +248,8 @@ def _create_db_operand_aarch64(self, operand):
|
||||
raise ValueError('Parameter {} is not a valid operand code'.format(operand))
|
||||
|
||||
|
||||
def _create_db_operand_x86(self, operand):
|
||||
def _create_db_operand_x86(operand):
|
||||
"""Get DB operand for x86 by operand string."""
|
||||
if operand == 'r':
|
||||
return {'class': 'register', 'name': 'gpr'}
|
||||
elif operand in 'xyz':
|
||||
@@ -253,14 +273,109 @@ def _create_db_operand_x86(self, operand):
|
||||
########################
|
||||
|
||||
|
||||
def _check_sanity_arch_db(arch_mm, isa_mm):
|
||||
def _scrape_from_felixcloutier(mnemonic):
    """
    Scrape src/dst information from felixcloutier website and return information for user.

    Exits the program with status 1 if BeautifulSoup is not installed.

    :param mnemonic: instruction mnemonic to look up
    :type mnemonic: str
    :returns: tuple `(suspicious, info)`; `suspicious` is `False` only if operand information
        was found and no operand is marked as both read and written, `info` holds the
        space-separated read/write markers (empty string if nothing was found)
    """
    import requests

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        print(
            'Module BeautifulSoup not installed. Fetching instruction form information '
            'online requires BeautifulSoup.\nUse \'pip install bs4\' for installation.',
            file=sys.stderr,
        )
        sys.exit(1)

    index_url = 'https://www.felixcloutier.com/x86/index.html'
    base_url = 'https://www.felixcloutier.com/x86/'
    # seconds; without a timeout a stalled connection would block the sanity check forever
    timeout = 10

    def fetch(url):
        """Return the response for `url` or `None` if the request itself failed."""
        try:
            return requests.get(url=url, timeout=timeout)
        except requests.exceptions.RequestException:
            return None

    def operands_from_page(html):
        """Return the src/dst markers of the operand encoding table in `html`, if any."""
        operand_enc = BeautifulSoup(html, 'html.parser').find(
            'h2', attrs={'id': 'instruction-operand-encoding'}
        )
        table = operand_enc.find_next_sibling() if operand_enc else None
        return _get_src_dst_from_table(table) if table is not None else []

    operands = []
    response = fetch(base_url + mnemonic.lower())
    if response is not None and response.status_code == 200:
        # Found result
        operands = operands_from_page(response.text)
    elif response is not None and response.status_code == 404:
        # Check for alternative href in the index page
        index_response = fetch(index_url)
        index_html = index_response.text if index_response is not None else ''
        alternatives = [
            ref
            for ref in BeautifulSoup(index_html, 'html.parser').find_all('a')
            if ref.text == mnemonic.upper()
        ]
        if alternatives:
            # alternative(s) found, take first one; the first two characters of the href
            # are dropped (presumably a leading './' -- verify against the index page)
            alt_response = fetch(base_url + alternatives[0].attrs['href'][2:])
            if alt_response is not None:
                operands = operands_from_page(alt_response.text)
    # unsuspicious only if src/dst information was found and no operand is read AND written
    suspicious = not operands or any('r' in x and 'w' in x for x in operands)
    return (suspicious, ' '.join(operands))
|
||||
|
||||
|
||||
def _get_src_dst_from_table(table, num_operands=2):
    """
    Extract the read/write markers of the operands from a bs4 table object.

    The first table row is taken as header, all following rows as instruction forms. The
    first row having exactly `num_operands` operand columns is evaluated.

    :param table: bs4 table element, may be `None`
    :param num_operands: number of operands the evaluated row must have, defaults to 2
    :type num_operands: int, optional
    :returns: list with one marker per operand (text in parentheses without whitespace, or
        ``'(r)'`` if the cell has no parentheses), reversed for AT&T syntax; empty list if
        the table is missing, has no rows or no row with `num_operands` operands
    """

    def cell_text(cell):
        # `.string` is None for cells containing nested markup, use the full text then
        text = cell.string
        return text if text is not None else cell.get_text()

    # the table may be missing or may not contain any row at all
    header_row = table.find('tr') if table is not None else None
    if header_row is None:
        return []
    # Parse table
    header = [''.join(cell_text(cell).lower().split()) for cell in header_row.find_all('td')]
    rows = []
    for table_row in table.find_all('tr')[1:]:
        texts = [cell_text(cell) for cell in table_row.find_all('td')]
        # cells containing 'NA' are left out; zip() ignores cells without a header column
        rows.append({title: text for title, text in zip(header, texts) if text != 'NA'})
    # Get only the first instruction form with the requested number of operands
    for row in rows:
        if _get_number_of_operands(row) != num_operands:
            continue
        reads_writes = []
        for i in range(1, num_operands + 1):
            match = re.search(r'(\([^\(\)]+\))', row.get('operand{}'.format(i), ''))
            # no parentheses (probably immediate operand), assume READ
            reads_writes.append(''.join(match.group(0).split()) if match else '(r)')
        # reverse reads_writes for AT&T syntax
        reads_writes.reverse()
        return reads_writes
    return []
|
||||
|
||||
|
||||
def _get_number_of_operands(data_dict_row):
    """
    Return the number of `Operand [X]` attributes in row.

    :param data_dict_row: table row; its keys are compared lower-cased and with all
        whitespace removed
    :type data_dict_row: dict
    :returns: how many of the keys ``operand1`` to ``operand4`` are present
    :rtype: int
    """
    # normalize the keys once instead of once per checked operand index
    keys = {''.join(key.split()).lower() for key in data_dict_row}
    return sum('operand{}'.format(i) in keys for i in range(1, 5))
|
||||
|
||||
|
||||
def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
|
||||
"""Do sanity check for ArchDB by given ISA."""
|
||||
# prefixes of instruction forms which we assume to have non-default operands
|
||||
suspicious_prefixes_x86 = ['vfm', 'fm']
|
||||
suspicious_prefixes_arm = ['fml', 'ldp', 'stp', 'str']
|
||||
# already known to be default-operand instruction forms with 2 operands
|
||||
if arch_mm.get_ISA().lower() == 'aarch64':
|
||||
suspicious_prefixes = suspicious_prefixes_arm
|
||||
if arch_mm.get_ISA().lower() == 'x86':
|
||||
suspicious_prefixes = suspicious_prefixes_x86
|
||||
port_num = len(arch_mm['ports'])
|
||||
|
||||
# returned lists
|
||||
missing_throughput = []
|
||||
@@ -268,6 +383,7 @@ def _check_sanity_arch_db(arch_mm, isa_mm):
|
||||
missing_port_pressure = []
|
||||
suspicious_instructions = []
|
||||
duplicate_instr_arch = []
|
||||
duplicate_strings = []
|
||||
|
||||
for instr_form in arch_mm['instruction_forms']:
|
||||
# check value in DB entry
|
||||
@@ -277,25 +393,39 @@ def _check_sanity_arch_db(arch_mm, isa_mm):
|
||||
missing_latency.append(instr_form)
|
||||
if instr_form['port_pressure'] is None:
|
||||
missing_port_pressure.append(instr_form)
|
||||
elif len(instr_form['port_pressure']) != port_num:
|
||||
warnings.warn(
|
||||
'Invalid number of ports:\n {}'.format(_get_full_instruction_name(instr_form))
|
||||
)
|
||||
# check entry against ISA DB
|
||||
for prefix in suspicious_prefixes:
|
||||
if instr_form['name'].startswith(prefix):
|
||||
if instr_form['name'].lower().startswith(prefix):
|
||||
# check if instruction in ISA DB
|
||||
if isa_mm.get_instruction(instr_form['name'], instr_form['operands']) is None:
|
||||
# if not, mark them as suspicious and print it on the screen
|
||||
suspicious_instructions.append(instr_form)
|
||||
# instr forms with less than 3 operands might need an ISA DB entry due to src_reg operands
|
||||
if (
|
||||
len(instr_form['operands']) < 3
|
||||
and len(instr_form['operands']) > 1
|
||||
and 'mov' not in instr_form['name'].lower()
|
||||
and not instr_form['name'].lower().startswith('j')
|
||||
and instr_form not in suspicious_instructions
|
||||
and isa_mm.get_instruction(instr_form['name'], instr_form['operands']) is None
|
||||
):
|
||||
# validate with data from internet if connected flag is set
|
||||
if internet_check:
|
||||
is_susp, info_string = _scrape_from_felixcloutier(instr_form['name'])
|
||||
if is_susp:
|
||||
instr_form['note'] = info_string
|
||||
suspicious_instructions.append(instr_form)
|
||||
else:
|
||||
suspicious_instructions.append(instr_form)
|
||||
# check for duplicates in DB
|
||||
if arch_mm._check_for_duplicate(instr_form['name'], instr_form['operands']):
|
||||
duplicate_instr_arch.append(instr_form)
|
||||
# every entry exists twice --> uniquify
|
||||
tmp_list = []
|
||||
for i in range(0, len(duplicate_instr_arch)):
|
||||
for _ in range(0, len(duplicate_instr_arch)):
|
||||
tmp = duplicate_instr_arch.pop()
|
||||
if tmp not in duplicate_instr_arch:
|
||||
if _get_full_instruction_name(tmp).lower() not in duplicate_strings:
|
||||
duplicate_strings.append(_get_full_instruction_name(tmp).lower())
|
||||
tmp_list.append(tmp)
|
||||
duplicate_instr_arch = tmp_list
|
||||
return (
|
||||
@@ -308,6 +438,7 @@ def _check_sanity_arch_db(arch_mm, isa_mm):
|
||||
|
||||
|
||||
def _check_sanity_isa_db(arch_mm, isa_mm):
|
||||
"""Do sanity check for an ISA DB."""
|
||||
# returned lists
|
||||
duplicate_instr_isa = []
|
||||
only_in_isa = []
|
||||
@@ -330,88 +461,83 @@ def _check_sanity_isa_db(arch_mm, isa_mm):
|
||||
return duplicate_instr_isa, only_in_isa
|
||||
|
||||
|
||||
def _print_sanity_report(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa, verbose=False
|
||||
def _get_sanity_report(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa, verbose=False, colors=False
|
||||
):
|
||||
"""Get sanity summary report."""
|
||||
s = ''
|
||||
# non-verbose summary
|
||||
print('SUMMARY\n----------------------')
|
||||
print(
|
||||
'{}% ({}/{}) of instruction forms have no throughput value.'.format(
|
||||
round(100 * len(m_tp) / total), len(m_tp), total
|
||||
)
|
||||
s += 'SUMMARY\n----------------------\n'
|
||||
s += '{}% ({}/{}) of instruction forms have no throughput value.\n'.format(
|
||||
round(100 * len(m_tp) / total), len(m_tp), total
|
||||
)
|
||||
print(
|
||||
'{}% ({}/{}) of instruction forms have no latency value.'.format(
|
||||
round(100 * len(m_l) / total), len(m_l), total
|
||||
)
|
||||
s += '{}% ({}/{}) of instruction forms have no latency value.\n'.format(
|
||||
round(100 * len(m_l) / total), len(m_l), total
|
||||
)
|
||||
print(
|
||||
'{}% ({}/{}) of instruction forms have no port pressure assignment.'.format(
|
||||
round(100 * len(m_pp) / total), len(m_pp), total
|
||||
)
|
||||
s += '{}% ({}/{}) of instruction forms have no port pressure assignment.\n'.format(
|
||||
round(100 * len(m_pp) / total), len(m_pp), total
|
||||
)
|
||||
print(
|
||||
'{}% ({}/{}) of instruction forms might miss an ISA DB entry.'.format(
|
||||
round(100 * len(suspic_instr) / total), len(suspic_instr), total
|
||||
)
|
||||
s += '{}% ({}/{}) of instruction forms might miss an ISA DB entry.\n'.format(
|
||||
round(100 * len(suspic_instr) / total), len(suspic_instr), total
|
||||
)
|
||||
print('{} duplicate instruction forms in uarch DB.'.format(len(dup_arch)))
|
||||
print('{} duplicate instruction forms in ISA DB.'.format(len(dup_isa)))
|
||||
print(
|
||||
s += '{} duplicate instruction forms in uarch DB.\n'.format(len(dup_arch))
|
||||
s += '{} duplicate instruction forms in ISA DB.\n'.format(len(dup_isa))
|
||||
s += (
|
||||
'{} instruction forms in ISA DB are not referenced by instruction '.format(len(only_isa))
|
||||
+ 'forms in uarch DB.'
|
||||
+ 'forms in uarch DB.\n'
|
||||
)
|
||||
print('----------------------\n')
|
||||
s += '----------------------\n'
|
||||
# verbose version
|
||||
if verbose:
|
||||
_print_sanity_report_verbose(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa
|
||||
s += _get_sanity_report_verbose(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa, colors=colors
|
||||
)
|
||||
return s
|
||||
|
||||
|
||||
def _print_sanity_report_verbose(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa
|
||||
def _get_sanity_report_verbose(
|
||||
total, m_tp, m_l, m_pp, suspic_instr, dup_arch, dup_isa, only_isa, colors=False
|
||||
):
|
||||
BRIGHT_CYAN = '\033[1;36;1m'
|
||||
BRIGHT_BLUE = '\033[1;34;1m'
|
||||
BRIGHT_RED = '\033[1;31;1m'
|
||||
BRIGHT_MAGENTA = '\033[1;35;1m'
|
||||
BRIGHT_YELLOW = '\033[1;33;1m'
|
||||
CYAN = '\033[36m'
|
||||
YELLOW = '\033[33m'
|
||||
WHITE = '\033[0m'
|
||||
"""Get the verbose part of the sanity report with all missing instruction forms."""
|
||||
BRIGHT_CYAN = '\033[1;36;1m' if colors else ''
|
||||
BRIGHT_BLUE = '\033[1;34;1m' if colors else ''
|
||||
BRIGHT_RED = '\033[1;31;1m' if colors else ''
|
||||
BRIGHT_MAGENTA = '\033[1;35;1m' if colors else ''
|
||||
BRIGHT_YELLOW = '\033[1;33;1m' if colors else ''
|
||||
CYAN = '\033[36m' if colors else ''
|
||||
YELLOW = '\033[33m' if colors else ''
|
||||
WHITE = '\033[0m' if colors else ''
|
||||
|
||||
print('Instruction forms without throughput value:\n' if len(m_tp) != 0 else '', end='')
|
||||
for instr_form in m_tp:
|
||||
print('{}{}{}'.format(BRIGHT_BLUE, _get_full_instruction_name(instr_form), WHITE))
|
||||
print('Instruction forms without latency value:\n' if len(m_l) != 0 else '', end='')
|
||||
for instr_form in m_l:
|
||||
print('{}{}{}'.format(BRIGHT_RED, _get_full_instruction_name(instr_form), WHITE))
|
||||
print(
|
||||
'Instruction forms without port pressure assignment:\n' if len(m_pp) != 0 else '', end=''
|
||||
s = ''
|
||||
s += 'Instruction forms without throughput value:\n' if len(m_tp) != 0 else ''
|
||||
for instr_form in sorted(m_tp, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(BRIGHT_BLUE, _get_full_instruction_name(instr_form), WHITE)
|
||||
s += 'Instruction forms without latency value:\n' if len(m_l) != 0 else ''
|
||||
for instr_form in sorted(m_l, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(BRIGHT_RED, _get_full_instruction_name(instr_form), WHITE)
|
||||
s += 'Instruction forms without port pressure assignment:\n' if len(m_pp) != 0 else ''
|
||||
for instr_form in sorted(m_pp, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(BRIGHT_MAGENTA, _get_full_instruction_name(instr_form), WHITE)
|
||||
s += 'Instruction forms which might miss an ISA DB entry:\n' if len(suspic_instr) != 0 else ''
|
||||
for instr_form in sorted(suspic_instr, key=lambda i: i['name']):
|
||||
s += '{}{}{}{}\n'.format(
|
||||
BRIGHT_CYAN,
|
||||
_get_full_instruction_name(instr_form),
|
||||
' -- ' + instr_form['note'] if 'note' in instr_form else '',
|
||||
WHITE,
|
||||
)
|
||||
s += 'Duplicate instruction forms in uarch DB:\n' if len(dup_arch) != 0 else ''
|
||||
for instr_form in sorted(dup_arch, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(YELLOW, _get_full_instruction_name(instr_form), WHITE)
|
||||
s += 'Duplicate instruction forms in ISA DB:\n' if len(dup_isa) != 0 else ''
|
||||
for instr_form in sorted(dup_isa, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(BRIGHT_YELLOW, _get_full_instruction_name(instr_form), WHITE)
|
||||
s += (
|
||||
'Instruction forms existing in ISA DB but not in uarch DB:\n' if len(only_isa) != 0 else ''
|
||||
)
|
||||
for instr_form in m_pp:
|
||||
print('{}{}{}'.format(BRIGHT_MAGENTA, _get_full_instruction_name(instr_form), WHITE))
|
||||
print(
|
||||
'Instruction forms which might miss an ISA DB entry:\n' if len(suspic_instr) != 0 else '',
|
||||
end='',
|
||||
)
|
||||
for instr_form in suspic_instr:
|
||||
print('{}{}{}'.format(BRIGHT_CYAN, _get_full_instruction_name(instr_form), WHITE))
|
||||
print('Duplicate instruction forms in uarch DB:\n' if len(dup_arch) != 0 else '', end='')
|
||||
for instr_form in dup_arch:
|
||||
print('{}{}{}'.format(YELLOW, _get_full_instruction_name(instr_form), WHITE))
|
||||
print('Duplicate instruction forms in ISA DB:\n' if len(dup_isa) != 0 else '', end='')
|
||||
for instr_form in dup_isa:
|
||||
print('{}{}{}'.format(BRIGHT_YELLOW, _get_full_instruction_name(instr_form), WHITE))
|
||||
print(
|
||||
'Instruction forms existing in ISA DB but not in uarch DB:\n'
|
||||
if len(only_isa) != 0
|
||||
else '',
|
||||
end='',
|
||||
)
|
||||
for instr_form in only_isa:
|
||||
print('{}{}{}'.format(CYAN, _get_full_instruction_name(instr_form), WHITE))
|
||||
for instr_form in sorted(only_isa, key=lambda i: i['name']):
|
||||
s += '{}{}{}\n'.format(CYAN, _get_full_instruction_name(instr_form), WHITE)
|
||||
return s
|
||||
|
||||
|
||||
###################
|
||||
@@ -420,6 +546,7 @@ def _print_sanity_report_verbose(
|
||||
|
||||
|
||||
def _get_full_instruction_name(instruction_form):
|
||||
"""Get full instruction form name/identifier string out of given instruction form."""
|
||||
operands = []
|
||||
for op in instruction_form['operands']:
|
||||
op_attrs = [
|
||||
@@ -431,16 +558,19 @@ def _get_full_instruction_name(instruction_form):
|
||||
|
||||
|
||||
def __represent_none(self, data):
    """Represent `None` as YAML null scalar ``~``; `data` itself is not inspected."""
    null_tag = u'tag:yaml.org,2002:null'
    return self.represent_scalar(null_tag, u'~')
|
||||
|
||||
|
||||
def _create_yaml_object():
    """Return a new ruamel YAML object with `__represent_none` registered for `None`."""
    none_type = type(None)
    yaml_handle = ruamel.yaml.YAML()
    yaml_handle.representer.add_representer(none_type, __represent_none)
    return yaml_handle
|
||||
|
||||
|
||||
def __dump_data_to_yaml(filepath, data):
|
||||
"""Dump data to YAML file at given filepath."""
|
||||
# first add 'normal' meta data in the right order (no ordered dict yet)
|
||||
meta_data = dict(data)
|
||||
del meta_data['instruction_forms']
|
||||
|
||||
@@ -1,17 +1,25 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
"""
|
||||
Frontend interface for OSACA. Does everything necessary for analysis report generation.
|
||||
"""
|
||||
import re
|
||||
from datetime import datetime as dt
|
||||
|
||||
from ruamel import yaml
|
||||
|
||||
from osaca import utils
|
||||
from osaca.semantics import INSTR_FLAGS, KernelDG, SemanticsAppender
|
||||
from osaca.semantics import INSTR_FLAGS, ArchSemantics, KernelDG, MachineModel
|
||||
|
||||
|
||||
class Frontend(object):
|
||||
def __init__(self, filename='', arch=None, path_to_yaml=None):
|
||||
"""
|
||||
Constructor method.
|
||||
|
||||
:param filename: path to the analyzed kernel file for documentation, defaults to ''
|
||||
:type filename: str, optional
|
||||
:param arch: micro-arch code for getting the machine model, defaults to None
|
||||
:type arch: str, optional
|
||||
:param path_to_yaml: path to the YAML file for getting the machine model, defaults to None
|
||||
:type path_to_yaml: str, optional
|
||||
"""
|
||||
self._filename = filename
|
||||
if not arch and not path_to_yaml:
|
||||
raise ValueError('Either arch or path_to_yaml required.')
|
||||
@@ -20,16 +28,32 @@ class Frontend(object):
|
||||
self._arch = arch
|
||||
if arch:
|
||||
self._arch = arch.lower()
|
||||
with open(utils.find_file(self._arch+'.yml'), 'r') as f:
|
||||
self._data = yaml.load(f, Loader=yaml.Loader)
|
||||
self._machine_model = MachineModel(arch=arch, lazy=True)
|
||||
elif path_to_yaml:
|
||||
with open(path_to_yaml, 'r') as f:
|
||||
self._data = yaml.load(f, Loader=yaml.Loader)
|
||||
self._machine_model = MachineModel(path_to_yaml=path_to_yaml, lazy=True)
|
||||
self._arch = self._machine_model.get_arch()
|
||||
|
||||
def _is_comment(self, instruction_form):
    """
    Tell whether an instruction form consists of a comment only.

    :param instruction_form: instruction form to check
    :type instruction_form: `dict`
    :returns: `True` if a comment but no instruction is set, `False` otherwise
    """
    if instruction_form['comment'] is None:
        return False
    return instruction_form['instruction'] is None
|
||||
|
||||
def print_throughput_analysis(self, kernel, show_lineno=False, show_cmnts=True):
|
||||
def throughput_analysis(self, kernel, show_lineno=False, show_cmnts=True):
|
||||
"""
|
||||
Build throughput analysis only.
|
||||
|
||||
:param kernel: Kernel to build throughput analysis for.
|
||||
:type kernel: list
|
||||
:param show_lineno: flag for showing the line number of instructions, defaults to `False`
|
||||
:type show_lineno: bool, optional
|
||||
:param show_cmnts: flag for showing comment-only lines in kernel, defaults to `True`
|
||||
:type show_cmnts: bool, optional
|
||||
"""
|
||||
lineno_filler = ' ' if show_lineno else ''
|
||||
port_len = self._get_max_port_len(kernel)
|
||||
separator = '-' * sum([x + 3 for x in port_len]) + '-'
|
||||
@@ -39,81 +63,42 @@ class Frontend(object):
|
||||
headline = 'Port pressure in cycles'
|
||||
headline_str = '{{:^{}}}'.format(len(separator))
|
||||
|
||||
print('\n\nThroughput Analysis Report\n' + '--------------------------')
|
||||
print(headline_str.format(headline))
|
||||
print(lineno_filler + self._get_port_number_line(port_len))
|
||||
print(separator)
|
||||
s = '\n\nThroughput Analysis Report\n--------------------------\n'
|
||||
s += headline_str.format(headline) + '\n'
|
||||
s += lineno_filler + self._get_port_number_line(port_len) + '\n'
|
||||
s += separator + '\n'
|
||||
for instruction_form in kernel:
|
||||
line = '{:4d} {} {} {}'.format(
|
||||
instruction_form['line_number'],
|
||||
self._get_port_pressure(instruction_form['port_pressure'], port_len, sep_list),
|
||||
self._get_port_pressure(
|
||||
instruction_form['port_pressure'], port_len, separator=sep_list
|
||||
),
|
||||
self._get_flag_symbols(instruction_form['flags'])
|
||||
if instruction_form['instruction'] is not None
|
||||
else ' ',
|
||||
instruction_form['line'].strip(),
|
||||
instruction_form['line'].strip().replace('\t', ' '),
|
||||
)
|
||||
line = line if show_lineno else col_sep + col_sep.join(line.split(col_sep)[1:])
|
||||
if show_cmnts is False and self._is_comment(instruction_form):
|
||||
continue
|
||||
print(line)
|
||||
print()
|
||||
tp_sum = SemanticsAppender.get_throughput_sum(kernel)
|
||||
print(lineno_filler + self._get_port_pressure(tp_sum, port_len, ' '))
|
||||
s += line + '\n'
|
||||
s += '\n'
|
||||
tp_sum = ArchSemantics.get_throughput_sum(kernel)
|
||||
s += lineno_filler + self._get_port_pressure(tp_sum, port_len, separator=' ') + '\n'
|
||||
return s
|
||||
|
||||
def _get_separator_list(self, separator, separator_2=' '):
|
||||
separator_list = []
|
||||
for i in range(len(self._data['ports']) - 1):
|
||||
match_1 = re.search(r'\d+', self._data['ports'][i])
|
||||
match_2 = re.search(r'\d+', self._data['ports'][i + 1])
|
||||
if match_1 is not None and match_2 is not None and match_1.group() == match_2.group():
|
||||
separator_list.append(separator_2)
|
||||
else:
|
||||
separator_list.append(separator)
|
||||
separator_list.append(separator)
|
||||
return separator_list
|
||||
def latency_analysis(self, cp_kernel, separator='|'):
|
||||
"""
|
||||
Build a list-based CP analysis report.
|
||||
|
||||
def _get_flag_symbols(self, flag_obj):
|
||||
string_result = ''
|
||||
string_result += '*' if INSTR_FLAGS.NOT_BOUND in flag_obj else ''
|
||||
string_result += 'X' if INSTR_FLAGS.TP_UNKWN in flag_obj else ''
|
||||
string_result += 'P' if INSTR_FLAGS.HIDDEN_LD in flag_obj else ''
|
||||
# TODO add other flags
|
||||
string_result += ' ' if len(string_result) == 0 else ''
|
||||
return string_result
|
||||
|
||||
def _get_port_pressure(self, ports, port_len, separator='|'):
|
||||
if not isinstance(separator, list):
|
||||
separator = [separator for x in ports]
|
||||
string_result = '{} '.format(separator[-1])
|
||||
for i in range(len(ports)):
|
||||
if float(ports[i]) == 0.0:
|
||||
string_result += port_len[i] * ' ' + ' {} '.format(separator[i])
|
||||
continue
|
||||
left_len = len(str(float(ports[i])).split('.')[0])
|
||||
substr = '{:' + str(left_len) + '.' + str(max(port_len[i] - left_len - 1, 0)) + 'f}'
|
||||
string_result += substr.format(ports[i]) + ' {} '.format(separator[i])
|
||||
return string_result[:-1]
|
||||
|
||||
def _get_max_port_len(self, kernel):
|
||||
port_len = [4 for x in self._data['ports']]
|
||||
for instruction_form in kernel:
|
||||
for i, port in enumerate(instruction_form['port_pressure']):
|
||||
if len('{:.2f}'.format(port)) > port_len[i]:
|
||||
port_len[i] = len('{:.2f}'.format(port))
|
||||
return port_len
|
||||
|
||||
def _get_port_number_line(self, port_len, separator='|'):
|
||||
string_result = separator
|
||||
separator_list = self._get_separator_list(separator, '-')
|
||||
for i, length in enumerate(port_len):
|
||||
substr = '{:^' + str(length + 2) + 's}'
|
||||
string_result += substr.format(self._data['ports'][i]) + separator_list[i]
|
||||
return string_result
|
||||
|
||||
def print_latency_analysis(self, cp_kernel, separator='|'):
|
||||
print('\n\nLatency Analysis Report\n' + '-----------------------')
|
||||
:param cp_kernel: loop kernel containing the CP information for each instruction form
|
||||
:type cp_kernel: list
|
||||
:separator: separator symbol for the columns, defaults to '|'
|
||||
:type separator: str, optional
|
||||
"""
|
||||
s = '\n\nLatency Analysis Report\n-----------------------\n'
|
||||
for instruction_form in cp_kernel:
|
||||
print(
|
||||
s += (
|
||||
'{:4d} {} {:4.1f} {}{}{} {}'.format(
|
||||
instruction_form['line_number'],
|
||||
separator,
|
||||
@@ -123,40 +108,287 @@ class Frontend(object):
|
||||
separator,
|
||||
instruction_form['line'],
|
||||
)
|
||||
)
|
||||
print(
|
||||
) + '\n'
|
||||
s += (
|
||||
'\n{:4} {} {:4.1f}'.format(
|
||||
' ' * max([len(str(instr_form['line_number'])) for instr_form in cp_kernel]),
|
||||
' ' * len(separator),
|
||||
sum([instr_form['latency_cp'] for instr_form in cp_kernel]),
|
||||
)
|
||||
)
|
||||
) + '\n'
|
||||
return s
|
||||
|
||||
def print_loopcarried_dependencies(self, dep_dict, separator='|'):
|
||||
print(
|
||||
def loopcarried_dependencies(self, dep_dict, separator='|'):
|
||||
"""
|
||||
Build a list-based LCD analysis report.
|
||||
|
||||
:param dep_dict: dictionary with first instruction in LCD as key and the deps as value
|
||||
:type dep_dict: dict
|
||||
:separator: separator symbol for the columns, defaults to '|'
|
||||
:type separator: str, optional
|
||||
"""
|
||||
s = (
|
||||
'\n\nLoop-Carried Dependencies Analysis Report\n'
|
||||
+ '-----------------------------------------'
|
||||
+ '-----------------------------------------\n'
|
||||
)
|
||||
# TODO find a way to overcome padding for different tab-lengths
|
||||
for dep in dep_dict:
|
||||
print(
|
||||
'{:4d} {} {:4.1f} {} {:36}{} {}'.format(
|
||||
dep,
|
||||
separator,
|
||||
sum(
|
||||
[
|
||||
instr_form['latency_lcd']
|
||||
for instr_form in dep_dict[dep]['dependencies']
|
||||
]
|
||||
),
|
||||
separator,
|
||||
dep_dict[dep]['root']['line'],
|
||||
separator,
|
||||
[node['line_number'] for node in dep_dict[dep]['dependencies']],
|
||||
)
|
||||
s += '{:4d} {} {:4.1f} {} {:36}{} {}\n'.format(
|
||||
dep,
|
||||
separator,
|
||||
sum([instr_form['latency_lcd'] for instr_form in dep_dict[dep]['dependencies']]),
|
||||
separator,
|
||||
dep_dict[dep]['root']['line'].strip(),
|
||||
separator,
|
||||
[node['line_number'] for node in dep_dict[dep]['dependencies']],
|
||||
)
|
||||
return s
|
||||
|
||||
def _print_header_report(self):
|
||||
def full_analysis(
    self,
    kernel,
    kernel_dg: 'KernelDG',
    ignore_unknown=False,
    arch_warning=False,
    length_warning=False,
    verbose=False,
):
    """
    Build the full analysis report including header, user warnings, the symbol map, the
    combined TP/CP/LCD view and the list based LCD view.

    :param kernel: kernel to report on
    :type kernel: list
    :param kernel_dg: directed graph containing CP and LCD
    :type kernel_dg: :class:`~osaca.semantics.KernelDG`
    :param ignore_unknown: flag for ignore warning if performance data is missing, defaults to
        `False`
    :type ignore_unknown: boolean, optional
    :param arch_warning: flag for additional user warning to specify micro-arch
    :type arch_warning: boolean, optional
    :param length_warning: flag for additional user warning to specify kernel length with
        --lines
    :type length_warning: boolean, optional
    :param verbose: flag for verbosity level, defaults to False (not evaluated by this method)
    :type verbose: boolean, optional
    :returns: concatenation of all report parts
    """
    # determine the LCDs once, they feed both the combined view and the list based view
    lcd = kernel_dg.get_loopcarried_dependencies()
    return (
        self._header_report()
        + self._user_warnings(arch_warning, length_warning)
        + self._symbol_map()
        + self.combined_view(kernel, kernel_dg.get_critical_path(), lcd, ignore_unknown)
        + self.loopcarried_dependencies(lcd)
    )
|
||||
|
||||
def combined_view(
    self, kernel, cp_kernel: KernelDG, dep_dict, ignore_unknown=False, show_cmnts=True
):
    """
    Build combined view of kernel including port pressure (TP), a CP column and a
    LCD column.

    :param kernel: kernel to report on
    :type kernel: list
    :param cp_kernel: instruction forms on the critical path; iterated here and read for
        their 'line_number' and 'latency_cp' entries.
        NOTE(review): the `KernelDG` annotation does not match this usage -- confirm
    :param dep_dict: dictionary with first instruction in LCD as key and the deps as value
    :type dep_dict: dict
    :param ignore_unknown: flag for showing result despite of missing instructions, defaults to
        `False`
    :type ignore_unknown: bool, optional
    :param show_cmnts: flag for showing comment-only lines in kernel, defaults to `True`
    :type show_cmnts: bool, optional
    :returns: the report as string; it ends with the sum line or, if instruction forms with
        unknown throughput exist and `ignore_unknown` is not set, with the missing
        instruction warning
    """
    s = '\n\nCombined Analysis Report\n------------------------\n'
    lineno_filler = ' '
    port_len = self._get_max_port_len(kernel)
    # Separator for ports
    separator = '-' * sum([x + 3 for x in port_len]) + '-'
    # ... for line numbers (width of the last line number; needs a non-empty kernel)
    separator += '--' + len(str(kernel[-1]['line_number'])) * '-'
    col_sep = '|'
    # for LCD/CP column
    separator += '-' * (2 * 6 + len(col_sep)) + '-' * len(col_sep)
    sep_list = self._get_separator_list(col_sep)
    headline = 'Port pressure in cycles'
    # format string centering the headline over the full separator width
    headline_str = '{{:^{}}}'.format(len(separator))
    # Prepare CP/LCD variable
    cp_lines = [x['line_number'] for x in cp_kernel]
    # summed up LCD latency per dependency chain
    sums = {}
    for dep in dep_dict:
        sums[dep] = sum(
            [instr_form['latency_lcd'] for instr_form in dep_dict[dep]['dependencies']]
        )
    lcd_sum = max(sums.values()) if len(sums) > 0 else 0.0
    lcd_lines = []
    if len(dep_dict) > 0:
        # first chain reaching the maximum latency is the one shown in the LCD column
        longest_lcd = [line_no for line_no in sums if sums[line_no] == lcd_sum][0]
        lcd_lines = [d['line_number'] for d in dep_dict[longest_lcd]['dependencies']]

    s += headline_str.format(headline) + '\n'
    s += (
        (
            lineno_filler
            + self._get_port_number_line(port_len, separator=col_sep)
            + '{}{:^6}{}{:^6}{}'.format(col_sep, 'CP', col_sep, 'LCD', col_sep)
        )
        + '\n'
        + separator
        + '\n'
    )
    for instruction_form in kernel:
        if show_cmnts is False and self._is_comment(instruction_form):
            continue
        line_number = instruction_form['line_number']
        # unique ports taken from the second element of each 'port_uops' entry
        used_ports = [list(uops[1]) for uops in instruction_form['port_uops']]
        used_ports = list(set([p for uops_ports in used_ports for p in uops_ports]))
        s += '{:4d} {}{} {} {}\n'.format(
            line_number,
            self._get_port_pressure(
                instruction_form['port_pressure'], port_len, used_ports, sep_list
            ),
            # `longest_lcd` is only evaluated for lines in `lcd_lines`, i.e., if it was set
            self._get_lcd_cp_ports(
                instruction_form['line_number'],
                cp_kernel if line_number in cp_lines else None,
                dep_dict[longest_lcd] if line_number in lcd_lines else None,
            ),
            self._get_flag_symbols(instruction_form['flags'])
            if instruction_form['instruction'] is not None
            else ' ',
            instruction_form['line'].strip().replace('\t', ' '),
        )
    s += '\n'
    # check for unknown instructions and throw warning if called without --ignore-unknown
    if not ignore_unknown and INSTR_FLAGS.TP_UNKWN in [
        flag for instr in kernel for flag in instr['flags']
    ]:
        num_missing = len(
            [instr['flags'] for instr in kernel if INSTR_FLAGS.TP_UNKWN in instr['flags']]
        )
        s += self._missing_instruction_error(num_missing)
    else:
        # lcd_sum already calculated before
        tp_sum = ArchSemantics.get_throughput_sum(kernel)
        cp_sum = sum([x['latency_cp'] for x in cp_kernel])
        s += (
            lineno_filler
            + self._get_port_pressure(tp_sum, port_len, separator=' ')
            + ' {:^6} {:^6}\n'.format(cp_sum, lcd_sum)
        )
    return s
|
||||
|
||||
####################
|
||||
# HELPER FUNCTIONS
|
||||
####################
|
||||
|
||||
def _missing_instruction_error(self, amount):
|
||||
"""Returns the warning for if any instruction form in the analysis is missing."""
|
||||
s = (
|
||||
'------------------ WARNING: The performance data for {} instructions is missing.'
|
||||
'------------------\n'
|
||||
' No final analysis is given. If you want to ignore this\n'
|
||||
' warning and run the analysis anyway, start osaca with\n'
|
||||
' --ignore-unknown flag.\n'
|
||||
'--------------------------------------------------------------------------------'
|
||||
'----------------{}\n'
|
||||
).format(amount, '-' * len(str(amount)))
|
||||
return s
|
||||
|
||||
def _user_warnings(self, arch_warning, length_warning):
|
||||
"""Returns warning texts for giving the user more insight in what he is doing."""
|
||||
arch_text = (
|
||||
'WARNING: No micro-architecture was specified and a default uarch was used.\n'
|
||||
' Specify the uarch with --arch. See --help for more information.\n'
|
||||
)
|
||||
length_text = (
|
||||
'WARNING: You are analyzing a large amount of instruction forms. Analyses '
|
||||
'across loops/block boundaries often do not make much sense.\n'
|
||||
' Specify the kernel length with --length. See --help for more '
|
||||
'information.\n'
|
||||
' If this is intentional, you can safely ignore this message.\n'
|
||||
)
|
||||
|
||||
warnings = ''
|
||||
warnings += arch_text if arch_warning else ''
|
||||
warnings += length_text if length_warning else ''
|
||||
warnings += '\n'
|
||||
return warnings
|
||||
|
||||
|
||||
def _get_separator_list(self, separator, separator_2=' '):
|
||||
"""Creates column view for seperators in the TP/combined view."""
|
||||
separator_list = []
|
||||
for i in range(len(self._machine_model.get_ports()) - 1):
|
||||
match_1 = re.search(r'\d+', self._machine_model.get_ports()[i])
|
||||
match_2 = re.search(r'\d+', self._machine_model.get_ports()[i + 1])
|
||||
if match_1 is not None and match_2 is not None and match_1.group() == match_2.group():
|
||||
separator_list.append(separator_2)
|
||||
else:
|
||||
separator_list.append(separator)
|
||||
separator_list.append(separator)
|
||||
return separator_list
|
||||
|
||||
def _get_flag_symbols(self, flag_obj):
    """
    Translate the flags of an instruction form into their one-letter symbols.

    :param flag_obj: container of ``INSTR_FLAGS`` members set for the instruction
    :returns: str -- concatenated symbols, or a single space if none applies
    """
    # order matters: symbols are emitted in exactly this sequence
    symbol_table = (
        (INSTR_FLAGS.NOT_BOUND, '*'),
        (INSTR_FLAGS.TP_UNKWN, 'X'),
        (INSTR_FLAGS.HIDDEN_LD, 'P'),
    )
    # TODO add other flags
    symbols = ''.join(symbol for flag, symbol in symbol_table if flag in flag_obj)
    if not symbols:
        return ' '
    return symbols
|
||||
|
||||
def _get_port_pressure(self, ports, port_len, used_ports=[], separator='|'):
|
||||
"""Returns line of port pressure for an instruction."""
|
||||
if not isinstance(separator, list):
|
||||
separator = [separator for x in ports]
|
||||
string_result = '{} '.format(separator[-1])
|
||||
for i in range(len(ports)):
|
||||
if float(ports[i]) == 0.0 and self._machine_model.get_ports()[i] not in used_ports:
|
||||
string_result += port_len[i] * ' ' + ' {} '.format(separator[i])
|
||||
continue
|
||||
left_len = len(str(float(ports[i])).split('.')[0])
|
||||
substr = '{:' + str(left_len) + '.' + str(max(port_len[i] - left_len - 1, 0)) + 'f}'
|
||||
substr = substr.format(ports[i])
|
||||
string_result += (
|
||||
substr + ' {} '.format(separator[i])
|
||||
if '.' in substr
|
||||
else '{:.1f}{} '.format(ports[i], separator[i])
|
||||
)
|
||||
return string_result[:-1]
|
||||
|
||||
def _get_node_by_lineno(self, lineno, kernel):
|
||||
"""Returns instruction form from kernel by its line number."""
|
||||
nodes = [instr for instr in kernel if instr['line_number'] == lineno]
|
||||
return nodes[0] if len(nodes) > 0 else None
|
||||
|
||||
def _get_lcd_cp_ports(self, line_number, cp_dg, dependency, separator='|'):
|
||||
"""Returns the CP and LCD line for one instruction."""
|
||||
lat_cp = lat_lcd = ''
|
||||
if cp_dg:
|
||||
lat_cp = float(self._get_node_by_lineno(line_number, cp_dg)['latency_cp'])
|
||||
if dependency:
|
||||
lat_lcd = float(
|
||||
self._get_node_by_lineno(line_number, dependency['dependencies'])['latency_lcd']
|
||||
)
|
||||
return '{} {:>4} {} {:>4} {}'.format(separator, lat_cp, separator, lat_lcd, separator)
|
||||
|
||||
def _get_max_port_len(self, kernel):
|
||||
"""Returns the maximal length needed to print all throughputs of the kernel."""
|
||||
port_len = [4 for x in self._machine_model.get_ports()]
|
||||
for instruction_form in kernel:
|
||||
for i, port in enumerate(instruction_form['port_pressure']):
|
||||
if len('{:.2f}'.format(port)) > port_len[i]:
|
||||
port_len[i] = len('{:.2f}'.format(port))
|
||||
return port_len
|
||||
|
||||
def _get_port_number_line(self, port_len, separator='|'):
|
||||
"""Returns column view of port identificators of machine_model."""
|
||||
string_result = separator
|
||||
separator_list = self._get_separator_list(separator, '-')
|
||||
for i, length in enumerate(port_len):
|
||||
substr = '{:^' + str(length + 2) + 's}'
|
||||
string_result += substr.format(self._machine_model.get_ports()[i]) + separator_list[i]
|
||||
return string_result
|
||||
|
||||
def _header_report(self):
|
||||
"""Prints header information"""
|
||||
version = 'v0.3'
|
||||
adjust = 20
|
||||
header = ''
|
||||
@@ -166,9 +398,10 @@ class Frontend(object):
|
||||
header += 'Timestamp:'.ljust(adjust) + '{}\n'.format(
|
||||
dt.utcnow().strftime('%Y-%m-%d %H:%M:%S')
|
||||
)
|
||||
print(header)
|
||||
return header + '\n'
|
||||
|
||||
def _print_symbol_map(self):
|
||||
def _symbol_map(self):
|
||||
"""Prints instruction flag map."""
|
||||
symbol_dict = {
|
||||
INSTR_FLAGS.NOT_BOUND: 'Instruction micro-ops not bound to a port',
|
||||
INSTR_FLAGS.TP_UNKWN: 'No throughput/latency information for this instruction in '
|
||||
@@ -180,14 +413,7 @@ class Frontend(object):
|
||||
for flag in sorted(symbol_dict.keys()):
|
||||
symbol_map += ' {} - {}\n'.format(self._get_flag_symbols([flag]), symbol_dict[flag])
|
||||
|
||||
print(symbol_map, end='')
|
||||
return symbol_map
|
||||
|
||||
def _print_port_binding_summary(self):
|
||||
def _port_binding_summary(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def print_full_analysis(self, kernel, kernel_dg: KernelDG, verbose=False):
|
||||
self._print_header_report()
|
||||
self._print_symbol_map()
|
||||
self.print_throughput_analysis(kernel, show_lineno=True)
|
||||
self.print_latency_analysis(kernel_dg.get_critical_path())
|
||||
self.print_loopcarried_dependencies(kernel_dg.get_loopcarried_dependencies())
|
||||
|
||||
297
osaca/osaca.py
297
osaca/osaca.py
@@ -1,28 +1,42 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""CLI for OSACA"""
|
||||
import argparse
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from filecmp import dircmp
|
||||
from subprocess import call
|
||||
import traceback
|
||||
|
||||
from osaca.db_interface import sanity_check, import_benchmark_output
|
||||
from osaca.db_interface import import_benchmark_output, sanity_check
|
||||
from osaca.frontend import Frontend
|
||||
from osaca.parser import BaseParser, ParserAArch64v81, ParserX86ATT
|
||||
from osaca.semantics import (KernelDG, MachineModel, SemanticsAppender,
|
||||
reduce_to_section)
|
||||
from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
|
||||
from osaca.semantics import (INSTR_FLAGS, ArchSemantics, KernelDG,
|
||||
MachineModel, reduce_to_section)
|
||||
|
||||
MODULE_DATA_DIR = os.path.join(
|
||||
os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
|
||||
)
|
||||
LOCAL_OSACA_DIR = os.path.join(os.path.expanduser('~') + '/.osaca/')
|
||||
DATA_DIR = os.path.join(LOCAL_OSACA_DIR, 'data/')
|
||||
|
||||
SUPPORTED_ARCHS = [
|
||||
'SNB',
|
||||
'IVB',
|
||||
'HSW',
|
||||
'BDW',
|
||||
'SKX',
|
||||
'CSX',
|
||||
'ICL',
|
||||
'ZEN1',
|
||||
'ZEN2',
|
||||
'TX2',
|
||||
'N1',
|
||||
'A64FX',
|
||||
]
|
||||
DEFAULT_ARCHS = {
|
||||
'aarch64': 'A64FX',
|
||||
'x86': 'SKX',
|
||||
}
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
def __read(*names, **kwargs):
|
||||
"""Reads in file"""
|
||||
with io.open(
|
||||
os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")
|
||||
) as fp:
|
||||
@@ -31,6 +45,7 @@ def __read(*names, **kwargs):
|
||||
|
||||
# Stolen from pip
|
||||
def __find_version(*file_paths):
|
||||
"""Searches for a version attribute in the given file(s)"""
|
||||
version_file = __read(*file_paths)
|
||||
version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
|
||||
if version_match:
|
||||
@@ -39,17 +54,29 @@ def __find_version(*file_paths):
|
||||
|
||||
|
||||
def get_version():
    """
    Look up the OSACA version string declared in the package's ``__init__.py``.

    :returns: str -- the version string.
    """
    version = __find_version('__init__.py')
    return version
|
||||
|
||||
|
||||
def create_parser():
|
||||
"""Return argparse parser."""
|
||||
def create_parser(parser=None):
|
||||
"""
|
||||
Return argparse parser.
|
||||
|
||||
:param parser: Existing parser object to add the arguments, defaults to `None`
|
||||
:type parser: :class:`~Argparse.ArgumentParser`
|
||||
:returns: The newly created :class:`~Argparse.ArgumentParser` object.
|
||||
"""
|
||||
# Create parser
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Analyzes a marked innermost loop snippet for a given architecture type.',
|
||||
epilog='For help, examples, documentation and bug reports go to:\nhttps://github.com'
|
||||
'/RRZE-HPC/OSACA/ | License: AGPLv3',
|
||||
)
|
||||
if not parser:
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Analyzes a marked innermost loop snippet for a given architecture type.',
|
||||
epilog='For help, examples, documentation and bug reports go to:\nhttps://github.com'
|
||||
'/RRZE-HPC/OSACA/ | License: AGPLv3',
|
||||
)
|
||||
|
||||
# Add arguments
|
||||
parser.add_argument(
|
||||
@@ -58,7 +85,21 @@ def create_parser():
|
||||
parser.add_argument(
|
||||
'--arch',
|
||||
type=str,
|
||||
help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ZEN1, VULCAN).',
|
||||
help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, '
|
||||
'A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--fixed',
|
||||
action='store_true',
|
||||
help='Run the throughput analysis with fixed probabilities for all suitable ports per '
|
||||
'instruction. Otherwise, OSACA will print the optimal port utilization for the kernel.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--lines',
|
||||
type=str,
|
||||
help='Define lines that should be included in the analysis. This option overwrites any'
|
||||
' range defined by markers in the assembly. Add either single lines or ranges defined by'
|
||||
' "-" or ":", each entry separated by commas, e.g.: --lines 1,2,8-18,20:24',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--db-check',
|
||||
@@ -67,6 +108,13 @@ def create_parser():
|
||||
help='Run a sanity check on the by "--arch" specified database. The output depends '
|
||||
'on the verbosity level.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--online',
|
||||
dest='internet_check',
|
||||
action='store_true',
|
||||
help='Run sanity check with online DB validation (currently felixcloutier) to see the '
|
||||
'src/dst distribution of the operands. Can be only used in combination with --db-check.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--import',
|
||||
metavar='MICROBENCH',
|
||||
@@ -74,8 +122,8 @@ def create_parser():
|
||||
type=str,
|
||||
default=argparse.SUPPRESS,
|
||||
help='Import a given microbenchmark output file into the corresponding architecture '
|
||||
'instruction database. Define the type of microbenchmark either as "ibench", '
|
||||
'"asmbench" or "uopsinfo".',
|
||||
'instruction database. Define the type of microbenchmark either as "ibench" or '
|
||||
'"asmbench".',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--insert-marker',
|
||||
@@ -93,9 +141,21 @@ def create_parser():
|
||||
help='Output path for .dot file export. If "." is given, the file will be stored as '
|
||||
'"./osaca_dg.dot"',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--ignore-unknown',
|
||||
dest='ignore_unknown',
|
||||
action='store_true',
|
||||
help='Ignore if instructions cannot be found in the data file and print analysis anyway.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose', '-v', action='count', default=0, help='Increases verbosity level.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--out', '-o',
|
||||
default=sys.stdout,
|
||||
type=argparse.FileType('w'),
|
||||
help='Write analysis to this file (default to stdout).'
|
||||
)
|
||||
parser.add_argument(
|
||||
'file', type=argparse.FileType('r'), help='Path to object (ASM or instruction file).'
|
||||
)
|
||||
@@ -104,11 +164,20 @@ def create_parser():
|
||||
|
||||
|
||||
def check_arguments(args, parser):
|
||||
"""Check arguments passed by user that are not checked by argparse itself."""
|
||||
supported_archs = ['SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'CSX', 'ZEN1', 'VULCAN']
|
||||
supported_import_files = ['ibench', 'asmbench', 'uopsinfo']
|
||||
"""
|
||||
Check arguments passed by user that are not checked by argparse itself.
|
||||
|
||||
if 'arch' in args and args.arch.upper() not in supported_archs:
|
||||
:param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
|
||||
:param parser: :class:`~argparse.ArgumentParser` object
|
||||
"""
|
||||
supported_import_files = ['ibench', 'asmbench']
|
||||
|
||||
if args.arch is None and (args.check_db or 'import_data' in args):
|
||||
parser.error(
|
||||
'DB check and data import cannot work with a default microarchitecture. '
|
||||
'Please see --help for all valid architecture codes.'
|
||||
)
|
||||
elif args.arch is not None and args.arch.upper() not in SUPPORTED_ARCHS:
|
||||
parser.error(
|
||||
'Microarchitecture not supported. Please see --help for all valid architecture codes.'
|
||||
)
|
||||
@@ -117,45 +186,44 @@ def check_arguments(args, parser):
|
||||
'Microbenchmark not supported for data import. Please see --help for all valid '
|
||||
'microbenchmark codes.'
|
||||
)
|
||||
if args.internet_check and not args.check_db:
|
||||
parser.error('--online requires --check-db')
|
||||
|
||||
|
||||
def check_user_dir():
|
||||
# Check if data files are already in usr dir, otherwise create them
|
||||
if not os.path.isdir(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
for f in os.listdir(MODULE_DATA_DIR):
|
||||
if not os.path.exists(os.path.join(DATA_DIR, f)):
|
||||
call(['cp', '-r', os.path.join(MODULE_DATA_DIR, f), DATA_DIR])
|
||||
else:
|
||||
# Compare and warn if files in DATA_DIR are different
|
||||
dir_comp = dircmp(DATA_DIR, MODULE_DATA_DIR)
|
||||
if dir_comp.left_list != dir_comp.same_files:
|
||||
print(
|
||||
"WARNING: Files in {} differs from {}. Check or delete {}.".format(
|
||||
MODULE_DATA_DIR, DATA_DIR, DATA_DIR
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
def import_data(benchmark_type, arch, filepath, output_file=sys.stdout):
|
||||
"""
|
||||
Imports benchmark results from micro-benchmarks.
|
||||
|
||||
|
||||
def import_data(benchmark_type, arch, filepath):
|
||||
:param benchmark_type: key for defining type of benchmark output
|
||||
:type benchmark_type: str
|
||||
:param arch: target architecture to put the data into the right database
|
||||
:type arch: str
|
||||
:param filepath: filepath of the output file"
|
||||
:type filepath: str
|
||||
:param output_file: output stream specifying where to write output, defaults to :class:`sys.stdout`
|
||||
:type output_file: stream, optional
|
||||
"""
|
||||
if benchmark_type.lower() == 'ibench':
|
||||
import_benchmark_output(arch, 'ibench', filepath)
|
||||
import_benchmark_output(arch, 'ibench', filepath, output=output_file)
|
||||
elif benchmark_type.lower() == 'asmbench':
|
||||
import_benchmark_output(arch, 'asmbench', filepath, output=output_file)
|
||||
else:
|
||||
raise NotImplementedError('This benchmark input variant is not implemented yet.')
|
||||
raise NotImplementedError('This benchmark input variant is not supported.')
|
||||
|
||||
|
||||
def insert_byte_marker(args):
|
||||
if MachineModel.get_isa_for_arch(args.arch) != 'x86':
|
||||
print('Marker insertion for non-x86 is not yet supported by Kerncraft.', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
"""
|
||||
Inserts byte markers into an assembly file using kerncraft.
|
||||
|
||||
:param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
|
||||
"""
|
||||
try:
|
||||
from kerncraft import iaca
|
||||
from kerncraft.incore_model import asm_instrumentation
|
||||
except ImportError:
|
||||
print(
|
||||
"Module kerncraft not installed. Use 'pip install --user "
|
||||
"kerncraft' for installation.\nFor more information see "
|
||||
"https://github.com/RRZE-HPC/kerncraft",
|
||||
'Module kerncraft not installed. Use \'pip install --user '
|
||||
'kerncraft\' for installation.\nFor more information see '
|
||||
'https://github.com/RRZE-HPC/kerncraft',
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
@@ -163,11 +231,12 @@ def insert_byte_marker(args):
|
||||
assembly = args.file.read()
|
||||
unmarked_assembly = io.StringIO(assembly)
|
||||
marked_assembly = io.StringIO()
|
||||
iaca.iaca_instrumentation(
|
||||
asm_instrumentation(
|
||||
input_file=unmarked_assembly,
|
||||
output_file=marked_assembly,
|
||||
block_selection='manual',
|
||||
pointer_increment='auto_with_manual_fallback',
|
||||
isa=MachineModel.get_isa_for_arch(args.arch),
|
||||
)
|
||||
|
||||
marked_assembly.seek(0)
|
||||
@@ -176,22 +245,56 @@ def insert_byte_marker(args):
|
||||
f.write(assembly)
|
||||
|
||||
|
||||
def inspect(args):
|
||||
arch = args.arch
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
verbose = args.verbose
|
||||
def inspect(args, output_file=sys.stdout):
|
||||
"""
|
||||
Does the actual throughput and critical path analysis of OSACA and prints it to the
|
||||
terminal.
|
||||
|
||||
:param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
|
||||
:param output_file: Define the stream for output, defaults to :class:`sys.stdout`
|
||||
:type output_file: stream, optional
|
||||
"""
|
||||
# Read file
|
||||
code = args.file.read()
|
||||
|
||||
# Detect ISA if necessary
|
||||
arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
|
||||
print_arch_warning = False if args.arch else True
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
verbose = args.verbose
|
||||
ignore_unknown = args.ignore_unknown
|
||||
|
||||
# Parse file
|
||||
parser = get_asm_parser(arch)
|
||||
parsed_code = parser.parse_file(code)
|
||||
try:
|
||||
parsed_code = parser.parse_file(code)
|
||||
except:
|
||||
# probably the wrong parser based on heuristic
|
||||
if args.arch is None:
|
||||
# change ISA and try again
|
||||
arch = DEFAULT_ARCHS['x86'] if BaseParser.detect_ISA(code) == 'aarch64' else DEFAULT_ARCHS['aarch64']
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
parser = get_asm_parser(arch)
|
||||
parsed_code = parser.parse_file(code)
|
||||
else:
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Reduce to marked kernel and add semantics
|
||||
kernel = reduce_to_section(parsed_code, isa)
|
||||
# Reduce to marked kernel or chosen section and add semantics
|
||||
if args.lines:
|
||||
line_range = get_line_range(args.lines)
|
||||
kernel = [line for line in parsed_code if line['line_number'] in line_range]
|
||||
print_length_warning = False
|
||||
else:
|
||||
kernel = reduce_to_section(parsed_code, isa)
|
||||
# Print warning if kernel has no markers and is larger than threshold (100)
|
||||
print_length_warning = True if len(kernel) == len(parsed_code) and len(kernel) > 100 else False
|
||||
machine_model = MachineModel(arch=arch)
|
||||
semantics = SemanticsAppender(machine_model)
|
||||
semantics = ArchSemantics(machine_model)
|
||||
semantics.add_semantics(kernel)
|
||||
# Do optimal schedule for kernel throughput if wished
|
||||
if not args.fixed:
|
||||
semantics.assign_optimal_throughput(kernel)
|
||||
|
||||
# Create DiGrahps
|
||||
kernel_graph = KernelDG(kernel, parser, machine_model)
|
||||
@@ -199,44 +302,90 @@ def inspect(args):
|
||||
kernel_graph.export_graph(args.dotpath if args.dotpath != '.' else None)
|
||||
# Print analysis
|
||||
frontend = Frontend(args.file.name, arch=arch)
|
||||
frontend.print_full_analysis(kernel, kernel_graph, verbose=verbose)
|
||||
print(
|
||||
frontend.full_analysis(
|
||||
kernel,
|
||||
kernel_graph,
|
||||
ignore_unknown=ignore_unknown,
|
||||
arch_warning=print_arch_warning,
|
||||
length_warning=print_length_warning,
|
||||
verbose=verbose
|
||||
),
|
||||
file=output_file,
|
||||
)
|
||||
|
||||
|
||||
def run(args, output_file=sys.stdout):
|
||||
"""
|
||||
Main entry point for OSACAs workflow. Decides whether to run an analysis or other things.
|
||||
|
||||
:param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
|
||||
:param output_file: Define the stream for output, defaults to :class:`sys.stdout`
|
||||
:type output_file: stream, optional
|
||||
"""
|
||||
if args.check_db:
|
||||
# Sanity check on DB
|
||||
verbose = True if args.verbose > 0 else False
|
||||
sanity_check(args.arch, verbose=verbose)
|
||||
if 'import_data' in args:
|
||||
sanity_check(
|
||||
args.arch, verbose=verbose, internet_check=args.internet_check, output_file=output_file
|
||||
)
|
||||
elif 'import_data' in args:
|
||||
# Import microbench output file into DB
|
||||
import_data(args.import_data, args.arch, args.file)
|
||||
if args.insert_marker:
|
||||
import_data(args.import_data, args.arch, args.file.name, output_file=output_file)
|
||||
elif args.insert_marker:
|
||||
# Try to add IACA marker
|
||||
insert_byte_marker(args)
|
||||
else:
|
||||
# Analyze kernel
|
||||
inspect(args)
|
||||
inspect(args, output_file=output_file)
|
||||
|
||||
|
||||
# ---------------------------------------------------
|
||||
def get_asm_parser(arch) -> BaseParser:
|
||||
"""
|
||||
Helper function to create the right parser for a specific architecture.
|
||||
|
||||
:param arch: architecture code
|
||||
:type arch: str
|
||||
:returns: :class:`~osaca.parser.BaseParser` object
|
||||
"""
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
if isa == 'x86':
|
||||
return ParserX86ATT()
|
||||
elif isa == 'aarch64':
|
||||
return ParserAArch64v81()
|
||||
return ParserAArch64()
|
||||
|
||||
|
||||
# ---------------------------------------------------
|
||||
def get_unmatched_instruction_ratio(kernel):
    """
    Return ratio of unmatched from total instructions in kernel.

    An instruction counts as unmatched if both its throughput and its latency are
    flagged as unknown.

    :param kernel: list of instruction forms, each providing a ``'flags'`` container
    :returns: float -- share of unmatched instructions; ``0.0`` for an empty kernel
    """
    if not kernel:
        # nothing to match, and avoids dividing by zero below
        return 0.0
    unmatched_counter = sum(
        1
        for instruction in kernel
        if INSTR_FLAGS.TP_UNKWN in instruction['flags']
        and INSTR_FLAGS.LT_UNKWN in instruction['flags']
    )
    return unmatched_counter / len(kernel)
|
||||
|
||||
def get_line_range(line_str):
    """
    Expand a ``--lines`` specification into the list of selected line numbers.

    Entries are separated by commas. An entry is either a single line number or an
    inclusive range written as ``start-end`` or ``start:end``,
    e.g. ``'1,2,8-18,20:24'``. Empty entries (stray or trailing commas) are ignored.

    :param line_str: line specification as given on the command line
    :type line_str: str
    :returns: list -- line numbers in the order they were specified
    :raises ValueError: if an entry is not a number or a range of numbers
    """
    lines_int = []
    for entry in line_str.replace(':', '-').split(','):
        entry = entry.strip()
        if not entry:
            # tolerate input such as '1,2,' instead of failing on int('')
            continue
        if '-' in entry:
            bounds = entry.split('-')
            start, end = int(bounds[0]), int(bounds[1])
            lines_int.extend(range(start, end + 1))
        else:
            lines_int.append(int(entry))
    return lines_int
|
||||
|
||||
def main():
|
||||
"""Initialize and run command line interface."""
|
||||
parser = create_parser()
|
||||
args = parser.parse_args()
|
||||
check_arguments(args, parser)
|
||||
check_user_dir()
|
||||
run(args)
|
||||
run(args, output_file=args.out)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -6,6 +6,14 @@ Only the parser below will be exported, so please add new parsers to __all__.
|
||||
from .attr_dict import AttrDict
|
||||
from .base_parser import BaseParser
|
||||
from .parser_x86att import ParserX86ATT
|
||||
from .parser_AArch64v81 import ParserAArch64v81
|
||||
from .parser_AArch64 import ParserAArch64
|
||||
|
||||
__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64v81']
|
||||
__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64', 'get_parser']
|
||||
|
||||
def get_parser(isa):
    """
    Create the assembly parser for an instruction set architecture.

    :param isa: ISA name, compared case-insensitively (``'x86'`` or ``'aarch64'``)
    :type isa: str
    :returns: a new parser object for ``isa``
    :raises ValueError: if ``isa`` is neither ``'x86'`` nor ``'aarch64'``
    """
    normalized = isa.lower()
    if normalized == 'x86':
        return ParserX86ATT()
    if normalized == 'aarch64':
        return ParserAArch64()
    raise ValueError("Unknown ISA {!r}.".format(isa))
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Attribute Dictionary to access dictionary entries as attributes."""
|
||||
|
||||
|
||||
class AttrDict(dict):
|
||||
@@ -8,14 +9,19 @@ class AttrDict(dict):
|
||||
|
||||
@staticmethod
|
||||
def convert_dict(dictionary):
|
||||
"""
|
||||
Convert given dictionary to `AttrDict`.
|
||||
|
||||
:param dictionary: `dict` to be converted
|
||||
:type dictionary: `dict`
|
||||
:returns: `AttrDict` representation of ``dictionary``
|
||||
"""
|
||||
if isinstance(dictionary, type(list())):
|
||||
return [AttrDict.convert_dict(x) for x in dictionary]
|
||||
if isinstance(dictionary, type(dict())):
|
||||
for key in list(dictionary.keys()):
|
||||
entry = dictionary[key]
|
||||
if isinstance(entry, type(dict())) or isinstance(
|
||||
entry, type(AttrDict())
|
||||
):
|
||||
if isinstance(entry, type(dict())) or isinstance(entry, type(AttrDict())):
|
||||
dictionary[key] = AttrDict.convert_dict(dictionary[key])
|
||||
if isinstance(entry, type(list())):
|
||||
dictionary[key] = [AttrDict.convert_dict(x) for x in entry]
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Parser superclass of specific parsers."""
|
||||
import operator
|
||||
import re
|
||||
|
||||
class BaseParser(object):
|
||||
# Identifiers for operand types
|
||||
@@ -7,23 +9,42 @@ class BaseParser(object):
|
||||
DIRECTIVE_ID = 'directive'
|
||||
IMMEDIATE_ID = 'immediate'
|
||||
LABEL_ID = 'label'
|
||||
IDENTIFIER_ID = 'identifier'
|
||||
MEMORY_ID = 'memory'
|
||||
REGISTER_ID = 'register'
|
||||
SEGMENT_EXT_ID = 'segment_extension'
|
||||
INSTRUCTION_ID = 'instruction'
|
||||
OPERANDS_ID = 'operands'
|
||||
|
||||
def __init__(self):
|
||||
self.construct_parser()
|
||||
|
||||
@staticmethod
|
||||
def detect_ISA(file_content):
    """
    Detect the ISA of the assembly based on the used registers and return the ISA code.

    The register-like tokens typical for each ISA are counted and the ISA with the
    most hits wins; on a tie ``'x86'`` is returned.

    :param file_content: assembly code
    :type file_content: str
    :returns: str -- ``'x86'`` or ``'aarch64'``
    """
    heuristics = {
        # 1) xmm, ymm, zmm vector registers and (e|r)ax, bx, cx, dx in AT&T syntax.
        #    The general-purpose names end after the 'x', so require a word boundary
        #    there (a mandatory trailing digit would never match a real register).
        'x86': [r'%[xyz]mm[0-9]', r'%[er][abcd]x\b'],
        # 2) v and z vector registers and x/w general-purpose registers.
        #    The leading word boundary keeps hex literals such as 0x10 from being
        #    counted as register x1.
        'aarch64': [r'[vz][0-9][0-9]?\.[0-9][0-9]?[bhsd]', r'\b[wx][0-9]'],
    }
    matches = {
        isa: sum(len(re.findall(pattern, file_content)) for pattern in patterns)
        for isa, patterns in heuristics.items()
    }
    return max(matches.items(), key=operator.itemgetter(1))[0]
|
||||
|
||||
def parse_file(self, file_content, start_line=0):
|
||||
'''
|
||||
"""
|
||||
Parse assembly file. This includes *not* extracting of the marked kernel and
|
||||
the parsing of the instruction forms.
|
||||
|
||||
:param str file_content: assembly code
|
||||
:param int start_line: offset, if first line in file_content is meant to be not 1
|
||||
:return: list of instruction forms
|
||||
'''
|
||||
"""
|
||||
# Create instruction form list
|
||||
asm_instructions = []
|
||||
lines = file_content.split('\n')
|
||||
|
||||
@@ -6,33 +6,36 @@ import pyparsing as pp
|
||||
from osaca.parser import AttrDict, BaseParser
|
||||
|
||||
|
||||
class ParserAArch64v81(BaseParser):
|
||||
class ParserAArch64(BaseParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.isa = 'aarch64'
|
||||
|
||||
def construct_parser(self):
|
||||
"""Create parser for ARM AArch64 ISA."""
|
||||
# Comment
|
||||
symbol_comment = '//'
|
||||
self.comment = pp.Literal(symbol_comment) + pp.Group(
|
||||
pp.ZeroOrMore(pp.Word(pp.printables))
|
||||
).setResultsName(self.COMMENT_ID)
|
||||
# Define ARM assembly identifier
|
||||
decimal_number = pp.Combine(
|
||||
pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
|
||||
).setResultsName('value')
|
||||
hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
|
||||
relocation = pp.Combine(pp.Literal(':') + pp.Word(pp.alphanums + '_') + pp.Literal(':'))
|
||||
first = pp.Word(pp.alphas + '_.', exact=1)
|
||||
rest = pp.Word(pp.alphanums + '_.')
|
||||
identifier = pp.Group(
|
||||
pp.Optional(relocation).setResultsName('relocation')
|
||||
+ pp.Combine(first + pp.Optional(rest)).setResultsName('name')
|
||||
).setResultsName('identifier')
|
||||
+ pp.Optional(pp.Suppress(pp.Literal('+')) + (hex_number | decimal_number).setResultsName('offset'))
|
||||
).setResultsName(self.IDENTIFIER_ID)
|
||||
# Label
|
||||
self.label = pp.Group(
|
||||
identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
|
||||
).setResultsName(self.LABEL_ID)
|
||||
# Directive
|
||||
decimal_number = pp.Combine(
|
||||
pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
|
||||
).setResultsName('value')
|
||||
hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
|
||||
directive_option = pp.Combine(
|
||||
pp.Word(pp.alphas + '#@.%', exact=1)
|
||||
+ pp.Optional(pp.Word(pp.printables + ' ', excludeChars=','))
|
||||
@@ -44,9 +47,18 @@ class ParserAArch64v81(BaseParser):
|
||||
self.directive = pp.Group(
|
||||
pp.Literal('.')
|
||||
+ pp.Word(pp.alphanums + '_').setResultsName('name')
|
||||
+ commaSeparatedList.setResultsName('parameters')
|
||||
+ (pp.OneOrMore(directive_parameter) ^ commaSeparatedList).setResultsName('parameters')
|
||||
+ pp.Optional(self.comment)
|
||||
).setResultsName(self.DIRECTIVE_ID)
|
||||
# LLVM-MCA markers
|
||||
self.llvm_markers = pp.Group(
|
||||
pp.Literal('#')
|
||||
+ pp.Combine(
|
||||
pp.CaselessLiteral('LLVM-MCA-')
|
||||
+ (pp.CaselessLiteral('BEGIN') | pp.CaselessLiteral('END'))
|
||||
)
|
||||
+ pp.Optional(self.comment)
|
||||
).setResultsName(self.COMMENT_ID)
|
||||
|
||||
##############################
|
||||
# Instructions
|
||||
@@ -80,31 +92,49 @@ class ParserAArch64v81(BaseParser):
|
||||
^ pp.CaselessLiteral('ror')
|
||||
^ pp.CaselessLiteral('sxtw')
|
||||
^ pp.CaselessLiteral('uxtw')
|
||||
^ pp.CaselessLiteral('mul vl')
|
||||
)
|
||||
arith_immediate = pp.Group(
|
||||
immediate.setResultsName('base_immediate')
|
||||
+ pp.Suppress(pp.Literal(','))
|
||||
+ shift_op.setResultsName('shift_op')
|
||||
+ immediate.setResultsName('shift')
|
||||
+ pp.Optional(immediate).setResultsName('shift')
|
||||
).setResultsName(self.IMMEDIATE_ID)
|
||||
# Register:
|
||||
# scalar: [XWBHSDQ][0-9]{1,2} | vector: V[0-9]{1,2}\.[12468]{1,2}[BHSD]()?
|
||||
# define SP and ZR register aliases as regex, due to pyparsing does not support
|
||||
# scalar: [XWBHSDQ][0-9]{1,2} | vector: [VZ][0-9]{1,2}(\.[12468]{1,2}[BHSD])?
|
||||
# | predicate: P[0-9]{1,2}(/[ZM])?
|
||||
# ignore vector len control ZCR_EL[123] for now
|
||||
# define SP, ZR register aliases as regex, due to pyparsing does not support
|
||||
# proper lookahead
|
||||
alias_r31_sp = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))')
|
||||
alias_r31_zr = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))')
|
||||
scalar = pp.Word(pp.alphas, exact=1).setResultsName('prefix') + pp.Word(
|
||||
scalar = pp.Word('xwbhsdqXWBHSDQ', exact=1).setResultsName('prefix') + pp.Word(
|
||||
pp.nums
|
||||
).setResultsName('name')
|
||||
index = pp.Literal('[') + pp.Word(pp.nums).setResultsName('index') + pp.Literal(']')
|
||||
vector = (
|
||||
pp.CaselessLiteral('v').setResultsName('prefix')
|
||||
pp.oneOf('v z', caseless=True).setResultsName('prefix')
|
||||
+ pp.Word(pp.nums).setResultsName('name')
|
||||
+ pp.Literal('.')
|
||||
+ pp.Optional(pp.Word('12468')).setResultsName('lanes')
|
||||
+ pp.Word(pp.alphas, exact=1).setResultsName('shape')
|
||||
+ pp.Optional(index)
|
||||
)
|
||||
predicate = (
|
||||
pp.CaselessLiteral('p').setResultsName('prefix')
|
||||
+ pp.Word(pp.nums).setResultsName('name')
|
||||
+ pp.Optional(
|
||||
(
|
||||
pp.Suppress(pp.Literal('/'))
|
||||
+ pp.oneOf('z m', caseless=True).setResultsName('predication')
|
||||
)
|
||||
| (
|
||||
pp.Literal('.')
|
||||
+ pp.Optional(pp.Word('12468')).setResultsName('lanes')
|
||||
+ pp.Word(pp.alphas, exact=1).setResultsName('shape')
|
||||
)
|
||||
)
|
||||
)
|
||||
self.list_element = vector ^ scalar
|
||||
register_list = (
|
||||
pp.Literal('{')
|
||||
@@ -118,7 +148,8 @@ class ParserAArch64v81(BaseParser):
|
||||
+ pp.Optional(index)
|
||||
)
|
||||
register = pp.Group(
|
||||
(alias_r31_sp | alias_r31_zr | vector | scalar | register_list)
|
||||
(alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
|
||||
#(alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
|
||||
+ pp.Optional(
|
||||
pp.Suppress(pp.Literal(','))
|
||||
+ shift_op.setResultsName('shift_op')
|
||||
@@ -133,7 +164,7 @@ class ParserAArch64v81(BaseParser):
|
||||
pp.Literal('[')
|
||||
+ pp.Optional(register.setResultsName('base'))
|
||||
+ pp.Optional(pp.Suppress(pp.Literal(',')))
|
||||
+ pp.Optional(register_index ^ immediate.setResultsName('offset'))
|
||||
+ pp.Optional(register_index ^ (immediate ^ arith_immediate).setResultsName('offset'))
|
||||
+ pp.Literal(']')
|
||||
+ pp.Optional(
|
||||
pp.Literal('!').setResultsName('pre_indexed')
|
||||
@@ -166,22 +197,28 @@ class ParserAArch64v81(BaseParser):
|
||||
+ pp.Optional(self.comment)
|
||||
)
|
||||
|
||||
# for testing
|
||||
self.predicate = predicate
|
||||
self.vector = vector
|
||||
self.register = register
|
||||
|
||||
def parse_line(self, line, line_number=None):
|
||||
"""
|
||||
Parse line and return instruction form.
|
||||
|
||||
:param str line: line of assembly code
|
||||
:param int line_id: default None, identifier of instruction form
|
||||
:return: parsed instruction form
|
||||
:param line_number: identifier of instruction form, defaults to None
|
||||
:type line_number: int, optional
|
||||
:return: `dict` -- parsed asm line (comment, label, directive or instruction form)
|
||||
"""
|
||||
instruction_form = AttrDict(
|
||||
{
|
||||
self.INSTRUCTION_ID: None,
|
||||
self.OPERANDS_ID: None,
|
||||
self.OPERANDS_ID: [],
|
||||
self.DIRECTIVE_ID: None,
|
||||
self.COMMENT_ID: None,
|
||||
self.LABEL_ID: None,
|
||||
'line': line.strip(),
|
||||
'line': line,
|
||||
'line_number': line_number,
|
||||
}
|
||||
)
|
||||
@@ -194,7 +231,15 @@ class ParserAArch64v81(BaseParser):
|
||||
instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
|
||||
except pp.ParseException:
|
||||
pass
|
||||
|
||||
# 1.2 check for llvm-mca marker
|
||||
try:
|
||||
result = self.process_operand(
|
||||
self.llvm_markers.parseString(line, parseAll=True).asDict()
|
||||
)
|
||||
result = AttrDict.convert_dict(result)
|
||||
instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
|
||||
except pp.ParseException:
|
||||
pass
|
||||
# 2. Parse label
|
||||
if result is None:
|
||||
try:
|
||||
@@ -245,6 +290,12 @@ class ParserAArch64v81(BaseParser):
|
||||
return instruction_form
|
||||
|
||||
def parse_instruction(self, instruction):
|
||||
"""
|
||||
Parse instruction in asm line.
|
||||
|
||||
:param str instruction: Assembly line string.
|
||||
:returns: `dict` -- parsed instruction form
|
||||
"""
|
||||
result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
|
||||
result = AttrDict.convert_dict(result)
|
||||
operands = []
|
||||
@@ -274,29 +325,35 @@ class ParserAArch64v81(BaseParser):
|
||||
return return_dict
|
||||
|
||||
def process_operand(self, operand):
|
||||
"""Post-process operand"""
|
||||
# structure memory addresses
|
||||
if self.MEMORY_ID in operand:
|
||||
return self.substitute_memory_address(operand[self.MEMORY_ID])
|
||||
return self.process_memory_address(operand[self.MEMORY_ID])
|
||||
# structure register lists
|
||||
if self.REGISTER_ID in operand and (
|
||||
'list' in operand[self.REGISTER_ID] or 'range' in operand[self.REGISTER_ID]
|
||||
):
|
||||
# TODO: discuss if ranges should be converted to lists
|
||||
return self.substitute_register_list(operand[self.REGISTER_ID])
|
||||
return self.process_register_list(operand[self.REGISTER_ID])
|
||||
if self.REGISTER_ID in operand and operand[self.REGISTER_ID]['name'] == 'sp':
|
||||
return self.substitute_sp_register(operand[self.REGISTER_ID])
|
||||
return self.process_sp_register(operand[self.REGISTER_ID])
|
||||
# add value attribute to floating point immediates without exponent
|
||||
if self.IMMEDIATE_ID in operand:
|
||||
return self.substitute_immediate(operand[self.IMMEDIATE_ID])
|
||||
return self.process_immediate(operand[self.IMMEDIATE_ID])
|
||||
if self.LABEL_ID in operand:
|
||||
return self.substitute_label(operand[self.LABEL_ID])
|
||||
return self.process_label(operand[self.LABEL_ID])
|
||||
if self.IDENTIFIER_ID in operand:
|
||||
return self.process_identifier(operand[self.IDENTIFIER_ID])
|
||||
return operand
|
||||
|
||||
def substitute_memory_address(self, memory_address):
|
||||
def process_memory_address(self, memory_address):
|
||||
"""Post-process memory address operand"""
|
||||
# Remove unnecessarily created dictionary entries during parsing
|
||||
offset = None if 'offset' not in memory_address else memory_address['offset']
|
||||
base = None if 'base' not in memory_address else memory_address['base']
|
||||
index = None if 'index' not in memory_address else memory_address['index']
|
||||
offset = memory_address.get('offset', None)
|
||||
if isinstance(offset, list) and len(offset) == 1:
|
||||
offset = offset[0]
|
||||
base = memory_address.get('base', None)
|
||||
index = memory_address.get('index', None)
|
||||
scale = 1
|
||||
if base is not None and 'name' in base and base['name'] == 'sp':
|
||||
base['prefix'] = 'x'
|
||||
@@ -314,28 +371,33 @@ class ParserAArch64v81(BaseParser):
|
||||
new_dict['post_indexed'] = memory_address['post_indexed']
|
||||
return AttrDict({self.MEMORY_ID: new_dict})
|
||||
|
||||
def substitute_sp_register(self, register):
|
||||
def process_sp_register(self, register):
|
||||
"""Post-process stack pointer register"""
|
||||
reg = register
|
||||
reg['prefix'] = 'x'
|
||||
return AttrDict({self.REGISTER_ID: reg})
|
||||
|
||||
def substitute_register_list(self, register_list):
|
||||
def process_register_list(self, register_list):
|
||||
"""Post-process register lists (e.g., {r0,r3,r5}) and register ranges (e.g., {r0-r7})"""
|
||||
# Remove unnecessarily created dictionary entries during parsing
|
||||
vlist = []
|
||||
rlist = []
|
||||
dict_name = ''
|
||||
if 'list' in register_list:
|
||||
dict_name = 'list'
|
||||
if 'range' in register_list:
|
||||
dict_name = 'range'
|
||||
for v in register_list[dict_name]:
|
||||
vlist.append(
|
||||
AttrDict.convert_dict(self.list_element.parseString(v, parseAll=True).asDict())
|
||||
for r in register_list[dict_name]:
|
||||
rlist.append(
|
||||
AttrDict.convert_dict(self.list_element.parseString(r, parseAll=True).asDict())
|
||||
)
|
||||
index = None if 'index' not in register_list else register_list['index']
|
||||
new_dict = AttrDict({dict_name: vlist, 'index': index})
|
||||
index = register_list.get('index', None)
|
||||
new_dict = AttrDict({dict_name: rlist, 'index': index})
|
||||
if len(new_dict[dict_name]) == 1:
|
||||
return AttrDict({self.REGISTER_ID: new_dict[dict_name][0]})
|
||||
return AttrDict({self.REGISTER_ID: new_dict})
|
||||
|
||||
def substitute_immediate(self, immediate):
|
||||
def process_immediate(self, immediate):
|
||||
"""Post-process immediate operand"""
|
||||
dict_name = ''
|
||||
if 'identifier' in immediate:
|
||||
# actually an identifier, change declaration
|
||||
@@ -344,7 +406,9 @@ class ParserAArch64v81(BaseParser):
|
||||
# normal integer value, nothing to do
|
||||
return AttrDict({self.IMMEDIATE_ID: immediate})
|
||||
if 'base_immediate' in immediate:
|
||||
# arithmetic immediate, nothing to do
|
||||
# arithmetic immediate, add calculated value as value
|
||||
immediate['shift'] = immediate['shift'][0]
|
||||
immediate['value'] = int(immediate['base_immediate']['value']) << int(immediate['shift']['value'])
|
||||
return AttrDict({self.IMMEDIATE_ID: immediate})
|
||||
if 'float' in immediate:
|
||||
dict_name = 'float'
|
||||
@@ -359,12 +423,21 @@ class ParserAArch64v81(BaseParser):
|
||||
{self.IMMEDIATE_ID: AttrDict({'value': immediate[dict_name]['mantissa']})}
|
||||
)
|
||||
|
||||
def substitute_label(self, label):
|
||||
def process_label(self, label):
|
||||
"""Post-process label asm line"""
|
||||
# remove duplicated 'name' level due to identifier
|
||||
label['name'] = label['name']['name']
|
||||
return AttrDict({self.LABEL_ID: label})
|
||||
|
||||
def process_identifier(self, identifier):
|
||||
"""Post-process identifier operand"""
|
||||
# remove value if it consists of symbol+offset
|
||||
if 'value' in identifier:
|
||||
del identifier['value']
|
||||
return AttrDict({self.IDENTIFIER_ID: identifier})
|
||||
|
||||
def get_full_reg_name(self, register):
|
||||
"""Return one register name string including all attributes"""
|
||||
if 'lanes' in register:
|
||||
return (
|
||||
register['prefix']
|
||||
@@ -376,19 +449,21 @@ class ParserAArch64v81(BaseParser):
|
||||
return register['prefix'] + str(register['name'])
|
||||
|
||||
def normalize_imd(self, imd):
|
||||
"""Normalize immediate to decimal based representation"""
|
||||
if 'value' in imd:
|
||||
if imd['value'].lower().startswith('0x'):
|
||||
# hex, return decimal
|
||||
return int(imd['value'], 16)
|
||||
return int(imd['value'], 10)
|
||||
elif 'float' in imd:
|
||||
return self.ieee_to_int(imd['float'])
|
||||
return self.ieee_to_float(imd['float'])
|
||||
elif 'double' in imd:
|
||||
return self.ieee_to_int(imd['double'])
|
||||
return self.ieee_to_float(imd['double'])
|
||||
# identifier
|
||||
return imd
|
||||
|
||||
def ieee_to_int(self, ieee_val):
|
||||
def ieee_to_float(self, ieee_val):
|
||||
"""Convert IEEE representation to python float"""
|
||||
exponent = int(ieee_val['exponent'], 10)
|
||||
if ieee_val['e_sign'] == '-':
|
||||
exponent *= -1
|
||||
@@ -398,18 +473,29 @@ class ParserAArch64v81(BaseParser):
|
||||
raise NotImplementedError
|
||||
|
||||
def is_gpr(self, register):
|
||||
"""Check if register is a general purpose register"""
|
||||
if register['prefix'] in 'wx':
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_vector_register(self, register):
|
||||
if register['prefix'] in 'bhsdqv':
|
||||
"""Check if register is a vector register"""
|
||||
if register['prefix'] in 'bhsdqvz':
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_flag_dependend_of(self, flag_a, flag_b):
|
||||
"""Check if ``flag_a`` is dependent on ``flag_b``"""
|
||||
# we assume flags are independent of each other, e.g., CF can be read while ZF gets written
|
||||
# TODO validate this assumption
|
||||
if flag_a.name == flag_b.name:
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_reg_dependend_of(self, reg_a, reg_b):
|
||||
"""Check if ``reg_a`` is dependent on ``reg_b``"""
|
||||
prefixes_gpr = 'wx'
|
||||
prefixes_vec = 'bhsdqv'
|
||||
prefixes_vec = 'bhsdqvz'
|
||||
if reg_a['name'] == reg_b['name']:
|
||||
if reg_a['prefix'].lower() in prefixes_gpr and reg_b['prefix'].lower() in prefixes_gpr:
|
||||
return True
|
||||
@@ -418,4 +504,5 @@ class ParserAArch64v81(BaseParser):
|
||||
return False
|
||||
|
||||
def get_reg_type(self, register):
|
||||
"""Get register type"""
|
||||
return register['prefix']
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user