Resolved merge conflicts

This commit is contained in:
JanLJL
2019-10-16 10:59:03 +02:00
76 changed files with 8913 additions and 44043 deletions

View File

@@ -3,6 +3,28 @@ language: python
python:
- "3.5"
- "3.6"
- "3.7"
install: pip install tox-travis
script: tox
# Python 3.7 not working yet
# - "3.7"
before_install:
# - pip install tox-travis
- pip install codecov
install:
- pip install -e .
cache: pip
script:
# - tox
- coverage run -p tests/all_tests.py
after_success:
- coverage combine
- codecov
deploy:
provider: pypi
user: "__token__"
password:
secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
distributions: sdist
skip_existing: true
skip_cleanup: true
on:
repo: RRZE-HPC/OSACA
tag: true

View File

@@ -1,7 +1,7 @@
include README.rst
include LICENSE
include tox.ini
recursive-include osaca/data/ *.csv
recursive-include osaca/data/ *.yml
include examples/*
recursive-include tests *.py *.out
recursive-include tests/testfiles/ *

View File

@@ -16,9 +16,15 @@ analysis and throughput prediction for a innermost loop kernel.
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?token=393L6z2HEXNiGLtZ43s6&branch=master
:target: https://travis-ci.com/RRZE-HPC/OSACA
.. image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
:target: https://landscape.io/github/RRZE-HPC/OSACA/master
:alt: Code Health
.. ..image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
.. :target: https://landscape.io/github/RRZE-HPC/OSACA/master
.. :alt: Code Health
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
:target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/ambv/black
Getting started
===============
@@ -46,8 +52,7 @@ Dependencies:
Additional requirements are:
- `Python3 <https://www.python.org/>`_
- `pandas <http://pandas.pydata.org/>`_
- `NumPy <http://www.numpy.org/>`_
- `Graphviz <https://www.graphviz.org/>`_ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`_ for marker insertion
- `ibench <https://github.com/hofm/ibench>`_ for throughput/latency measurements
@@ -66,213 +71,169 @@ The usage of OSACA can be listed as:
.. code:: bash
osaca [-h] [-V] [--arch ARCH] [--tp-list] [-i | --iaca | -m] FILEPATH
osaca [-h] [-V] [--arch ARCH] [--export-graph GRAPHNAME] FILEPATH
- ``-h`` or ``--help`` prints out the help message.
- ``-V`` or ``--version`` shows the programs version number.
- ``ARCH`` needs to be replaced with the wished architecture abbreviation. This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``). Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW`` and ``SKL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN`` for AMD Zen (17h family) architecture .
- While in the throughput analysis mode, one can add ``--tp-list`` for printing the additional throughput list of the kernel or ``--iaca`` for letting OSACA to know it has to search for IACA binary markers.
- ``-i`` or ``--include-ibench`` starts the integration of ibench output into the CSV data file determined by ``ARCH``.
- With the flag ``-m`` or ``--insert-marker`` OSACA calls the Kerncraft module for the interactively insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ marker in suggested assembly blocks.
- ``FILEPATH`` describes the filepath to the file to work with and is always necessary
-h, --help
prints out the help message.
-V, --version
shows the program's version number.
--arch ARCH
needs to be replaced with the wished architecture abbreviation.
This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``).
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1`` for AMD Zen (17h family) architecture.
Furthermore, ``VULCAN`` for Marvell's ARM-based ThunderX2 architecture is available.
--insert-marker
OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ markers in suggested assembly blocks.
--db-check
Run a sanity check on the database specified by ``--arch``.
The output depends on the verbosity level.
Keep in mind that you have to provide a (dummy) filename in any case.
--export-graph EXPORT_PATH
Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
After the file was created, you can convert it to a PDF file using dot: `dot -Tpdf osaca_dg.dot -o osaca_dependency_graph.pdf`
The **FILEPATH** describes the filepath to the file to work with and is always necessary
______________________
Hereinafter OSACA's scope of function will be described.
Throughput analysis
~~~~~~~~~~~~~~~~~~~
As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKL`` or ``ZEN``.
Throughput & Latency analysis
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ZEN`` or ``VULCAN``.
For extracting the right kernel, one has to mark it beforehand. For this there are two different approaches:
For extracting the right kernel, one has to mark it beforehand.
Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.
| **High level code**
**Assembly code**
The OSACA marker is ``//STARTLOOP`` and must be put in one line in front of the loop head, and the loop code must be indented consistently. This means the marker and the head must have the same indentation level while the whole loop body needs to be more indented than the code before and after. For instance, this is a valid OSACA marker:
.. code-block:: c
int i = 0;
//STARTLOOP
while(i < N){
// do something...
i++;
}
| **Assembly code**
Another way for marking a kernel is to insert the IACA byte markers in the assembly file in before and after the loop.
Marking a kernel means to insert the byte markers into the assembly file before and after the loop.
For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
Start and end marker can be seen in the example below:
For the convenience of the user, in x86 assembly IACA byte markers are used.
**x86 Byte Markers**
.. code-block:: gas
movl $111,%ebx ;IACA START MARKER
.byte 100,103,144 ;IACA START MARKER
; LABEL
; do something
; ...
; conditional jump to LABEL
movl $222,%ebx ;IACA END MARKER
.byte 100,103,144 ;IACA END MARKER
movl $111,%ebx #IACA/OSACA START MARKER
.byte 100,103,144 #IACA/OSACA START MARKER
Loop:
# ...
movl $222,%ebx #IACA/OSACA END MARKER
.byte 100,103,144 #IACA/OSACA END MARKER
The optional flag ``--iaca`` defines if OSACA needs to search for the IACA byte markers or the OSACA marker in the chosen file.
**AArch64 Byte Markers**
With an additional, optional ``--tp-list``, OSACA adds a simple list of all kernel instruction forms together with their reciprocal throughput to the output. This is helpful in case of no further information about the port binding of the single instruction forms.
.. code-block:: asm
Include new measurements into the data file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it
takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value.
The handling of ibench is shortly described in the example section below.
mov x1, #111 // OSACA START
.byte 213,3,32,31 // OSACA START
// ...
mov x1, #222 // OSACA END
.byte 213,3,32,31 // OSACA END
.. Include new measurements into the data file
.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value. The handling of ibench is shortly described in the example section below.
Insert IACA markers
~~~~~~~~~~~~~~~~~~~
Using the ``-m`` or ``--insert-marker`` flags for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
Using the ``--insert-marker`` flag for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
Note that this currently only works for x86 loop kernels.
Example
=======
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel IVB core hereafter:
For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:
.. code-block:: c
double a[N], double b[N];
double s;
//STARTLOOP
// loop
for(int i = 0; i < N; ++i)
a[i] = s * b[i];
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``. The result is
written in vector ``a``.
After including the OSACA marker ``//STARTLOOP`` and compiling the source, one can
start the analysis typing
The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
The result is written in vector ``a``.
After including the OSACA byte marker into the assembly, one can start the analysis typing
.. code:: bash
osaca --arch IVB PATH/TO/FILE
osaca --arch CSX PATH/TO/FILE
in the command line. Optionally, one can create the assembly code out of the file, identify and mark the kernel of interest and run OSACA with the additional ``--iaca`` flag.
in the command line.
The output is:
.. code-block::
Open Source Architecture Code Analyzer (OSACA) - v0.3
Analyzed file: scale.s.csx.O3.s
Architecture: csx
Timestamp: 2019-10-03 23:36:21
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Throughput Analysis Report
--------------------------
P - Load operation can be hidden behind a past or future store instruction
X - No information for this instruction in database
* - Instruction micro-ops not bound to a port
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 2.33 | 1.33 | 5.0 | 5.0 | 2.0 | 1.33 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
| | | | | | | jmp 10b <scale+0x10b>
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
| | | | | | | X addl $0x1,-0x24(%rbp)
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
| | | | | | | jl e4 <scale+0xe4>
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
Total number of estimated throughput: 5.0
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------
170 | | | | | | | | | .L22:
171 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | vmulpd (%r12,%rax), %ymm1, %ymm0
172 | | | 0.50 | 0.50 | 1.00 | | | | vmovapd %ymm0, 0(%r13,%rax)
173 | 0.25 | 0.25 | | | | 0.25 | 0.25 | | addq $32, %rax
174 | 0.25 | 0.25 | | | | 0.25 | 0.25 | | cmpq %rax, %r14
175 | | | | | | | | | * jne .L22
1.00 1.00 1.00 0.50 1.00 0.50 1.00 0.50 0.50
Latency Analysis Report
-----------------------
171 | 8.0 | | vmulpd (%r12,%rax), %ymm1, %ymm0
172 | 5.0 | | vmovapd %ymm0, 0(%r13,%rax)
13.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
173 | 1.0 | addq $32, %rax | [173]
It shows the whole kernel together with the average port pressure of each instruction form and the overall port binding.
In the fifth to last line containing ``addl $0x1, -0x24(%rbp)`` one can see an ``X`` in front of the instruction form and no port occupation.
This means either there are no measured values for this instruction form or no port binding is provided in the
data file.
In the first case, OSACA automatically creates two benchmark assembly files (``add-mem_imd.S`` for latency and ``add-mem_imd-TP.S`` for throughput) in the benchmark folder, if it not already exists there.
Furthermore, it shows the critical path of the loop kernel and all loop-carried dependencies, each with a list of the line numbers that are part of this dependency chain on the right.
One can now run ibench to get the throughput value for addl with the given file. Mind that the assembly
file, which is used for ibench, is implemented in Intel syntax. So for a valid run instruction ``addl`` must be
changed to ``add`` manually.
.. For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node, so there is no other workload falsifying the results. For the correct function of ibench the benchmark files from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the subdirectorys name and the shared objects. For running the tests the frequencies of all cores must set to a constant value and this has to be given as an argument together with the directory of the shared objects to ibench, e.g.:
For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node,
so there is no other workload falsifying the results. For the correct function of ibench the benchmark files
from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the
subdirectorys name and the shared objects. For running the tests the frequencies of all cores must set to a
constant value and this has to be given as an argument together with the directory of the shared objects to
ibench, e.g.:
.. code:: bash
.. .. code:: bash
./ibench ./AVX 2.2
for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz.
We get an output like:
.. for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz. We get an output like:
.. code:: bash
.. .. code:: bash
Using frequency 2.20GHz.
add-mem_imd-TP: 1.023 (clock cycles) [DEBUG - result: 1.000000]
add-mem_imd: 6.050 (clock cycles) [DEBUG - result: 1.000000]
The debug output as resulting value of register ``xmm0`` is additional validation information depending on
the executed instruction form meant for the user and is not considered by OSACA.
The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just
``-i`` and the specify micro architecture:
.. The debug output as resulting value of register ``xmm0`` is additional validation information depending on the executed instruction form meant for the user and is not considered by OSACA. The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just ``-i`` and the specify micro architecture:
.. code-block:: bash
.. .. code-block:: bash
osaca --arch IVB -i PATH/TO/IBENCH-OUTPUTFILE
For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand.
We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
.. For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand. We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
.. code:: bash
.. .. code:: bash
addl-mem_imd,1.0,6.0,"(0.33,0.33,1.00,1.00,1.00,0.33)"
Another throughput analysis with OSACA now returns all information for the kernel:
.. code-block::
Throughput Analysis Report
--------------------------
P - Load operation can be hidden behind a past or future store instruction
X - No information for this instruction in database
* - Instruction micro-ops not bound to a port
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 2.67 | 1.67 | 6.0 | 6.0 | 3.0 | 1.67 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| | | 0.50 | 0.50 | 1.00 | | movl $0x0,-0x24(%rbp)
| | | | | | | jmp 10b <scale+0x10b>
| | | 0.50 | 0.50 | | | mov -0x48(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | | | vmovsd (%rax,%rdx,8),%xmm0
| 1.00 | | 0.50 | 0.50 | | | vmulsd -0x50(%rbp),%xmm0,%xmm0
| | | 0.50 | 0.50 | | | mov -0x38(%rbp),%rax
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%edx
| 0.33 | 0.33 | | | | 0.33 | movslq %edx,%rdx
| | | 0.50 | 0.50 | 1.00 | | vmovsd %xmm0,(%rax,%rdx,8)
| 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 0.33 | addl $0x1,-0x24(%rbp)
| | | 0.50 | 0.50 | | | mov -0x24(%rbp),%eax
| 0.33 | 0.33 | 0.50 | 0.50 | | 0.33 | cmp -0x54(%rbp),%eax
| | | | | | | jl e4 <scale+0xe4>
| 0.33 | 0.33 | | | | 0.33 | mov %rcx,%rsp
Total number of estimated throughput: 6.0
Credits
=======

Binary file not shown.

Before

Width:  |  Height:  |  Size: 231 KiB

After

Width:  |  Height:  |  Size: 206 KiB

View File

@@ -1,286 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.5.239 Build 20170817";
# mark_description "-fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o 2d.S";
.file "2d-5pt.c"
.text
..TXTST0:
# -- Begin jacobi2D5pt
.text
# mark_begin;
.align 16,0x90
.globl jacobi2D5pt
# --- jacobi2D5pt(int, int)
jacobi2D5pt:
# parameter 1: %edi
# parameter 2: %esi
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_jacobi2D5pt.1:
..L2:
#2.31
pushq %rbx #2.31
.cfi_def_cfa_offset 16
movq %rsp, %rbx #2.31
.cfi_def_cfa 3, 16
.cfi_offset 3, -16
andq $-32, %rsp #2.31
pushq %rbp #2.31
pushq %rbp #2.31
movq 8(%rbx), %rbp #2.31
movq %rbp, 8(%rsp) #2.31
movq %rsp, %rbp #2.31
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
pushq %r13 #2.31
pushq %r14 #2.31
pushq %r15 #2.31
subq $88, %rsp #2.31
movslq %esi, %rsi #2.31
movslq %edi, %rcx #2.31
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
movq %rsi, %r13 #4.17
imulq %rcx, %r13 #4.17
shlq $3, %r13 #4.12
movq %r13, %rax #4.12
addq $31, %rax #4.12
andq $-32, %rax #4.12
subq %rax, %rsp #4.12
movq %rsp, %rax #4.12
# LOE rax rcx rsi r12 r13 edi
..B1.29: # Preds ..B1.1
# Execution count [1.00e+00]
movq %rax, %r14 #4.12
# LOE rcx rsi r12 r13 r14 edi
..B1.2: # Preds ..B1.29
# Execution count [1.00e+00]
movq %r13, %rax #5.12
addq $31, %rax #5.12
andq $-32, %rax #5.12
subq %rax, %rsp #5.12
movq %rsp, %rax #5.12
# LOE rax rcx rsi r12 r13 r14 edi
..B1.30: # Preds ..B1.2
# Execution count [1.00e+00]
movq %rax, %r15 #5.12
# LOE rcx rsi r12 r13 r14 r15 edi
..B1.3: # Preds ..B1.30
# Execution count [1.00e+00]
xorl %r10d, %r10d #9.5
lea (%r15,%rcx,8), %r11 #13.13
vxorpd %xmm1, %xmm1, %xmm1 #6.5
lea (%r14,%rcx,8), %rdx #13.37
cmpq $2, %rsi #9.18
jle ..B1.21 # Prob 9% #9.18
# LOE rdx rcx rsi r10 r11 r12 r13 r14 r15 edi xmm1
..B1.4: # Preds ..B1.3
# Execution count [9.00e-01]
addl $-2, %edi #12.9
movq %rcx, %r9 #13.61
movl %edi, %eax #12.9
addq $-2, %rsi #9.18
andl $-16, %eax #12.9
xorl %r8d, %r8d #9.5
shlq $4, %r9 #13.61
movslq %eax, %rax #12.9
addq %r14, %r9 #13.61
movslq %edi, %rdi #12.9
vxorps %ymm0, %ymm0, %ymm0 #6.5
movq %rax, -80(%rbp) #12.9[spill]
movq %rdi, -88(%rbp) #12.9[spill]
movl %eax, -72(%rbp) #9.5[spill]
movq %rsi, -48(%rbp) #9.5[spill]
movq %rdx, -64(%rbp) #9.5[spill]
movq %r15, -96(%rbp) #9.5[spill]
movq %r14, -56(%rbp) #9.5[spill]
movq %r13, -104(%rbp) #9.5[spill]
movq %r12, -112(%rbp) #9.5[spill]
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.5: # Preds ..B1.19 ..B1.4
# Execution count [5.00e+00]
cmpq $2, %rcx #12.22
jle ..B1.19 # Prob 50% #12.22
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.6: # Preds ..B1.5
# Execution count [4.50e+00]
cmpl $16, %edi #12.9
jl ..B1.26 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.7: # Preds ..B1.6
# Execution count [4.50e+00]
movl -72(%rbp), %r14d #12.9[spill]
xorl %edx, %edx #12.9
movq -80(%rbp), %r12 #13.13[spill]
lea (%r11,%r8), %rax #13.13
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
..B1.8: # Preds ..B1.8 ..B1.7
# Execution count [2.50e+01]
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
vmovupd %ymm0, 40(%rax,%rdx,8) #13.13
vmovupd %ymm0, 72(%rax,%rdx,8) #13.13
vmovupd %ymm0, 104(%rax,%rdx,8) #13.13
addq $16, %rdx #12.9
cmpq %r12, %rdx #12.9
jb ..B1.8 # Prob 82% #12.9
# LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
..B1.10: # Preds ..B1.8 ..B1.26
# Execution count [5.00e+00]
lea 1(%r14), %eax #12.9
cmpl %edi, %eax #12.9
ja ..B1.19 # Prob 50% #12.9
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
movslq %r14d, %r14 #12.9
movq -88(%rbp), %r13 #12.9[spill]
subq %r14, %r13 #12.9
cmpq $4, %r13 #12.9
jl ..B1.25 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 edi xmm1 ymm0
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
movl %r13d, %r15d #12.9
lea (%r11,%r8), %rax #13.13
andl $-4, %r15d #12.9
xorl %edx, %edx #12.9
movslq %r15d, %r15 #12.9
lea (%rax,%r14,8), %rax #13.13
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.13: # Preds ..B1.13 ..B1.12
# Execution count [2.50e+01]
vmovupd %ymm0, 8(%rax,%rdx,8) #13.13
addq $4, %rdx #12.9
cmpq %r15, %rdx #12.9
jb ..B1.13 # Prob 82% #12.9
# LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.15: # Preds ..B1.13 ..B1.25
# Execution count [5.00e+00]
cmpq %r13, %r15 #12.9
jae ..B1.19 # Prob 10% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.16: # Preds ..B1.15
# Execution count [4.50e+00]
movq -56(%rbp), %rax #13.49[spill]
lea (%r11,%r8), %r12 #13.13
movq -64(%rbp), %rsi #13.25[spill]
lea (%r9,%r8), %rdx #13.61
lea (%r12,%r14,8), %r12 #13.13
addq %r8, %rax #13.49
addq %r8, %rsi #13.25
lea (%rdx,%r14,8), %rdx #13.61
lea (%rax,%r14,8), %rax #13.49
lea (%rsi,%r14,8), %r14 #13.25
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
..B1.17: # Preds ..B1.17 ..B1.16
# Execution count [2.50e+01]
vmovsd (%r14,%r15,8), %xmm2 #13.25
vaddsd 16(%r14,%r15,8), %xmm2, %xmm3 #13.37
vaddsd 8(%rax,%r15,8), %xmm3, %xmm4 #13.49
vaddsd 8(%rdx,%r15,8), %xmm4, %xmm5 #13.61
vmulsd %xmm5, %xmm1, %xmm6 #13.74
vmovsd %xmm6, 8(%r12,%r15,8) #13.13
incq %r15 #12.9
cmpq %r13, %r15 #12.9
jb ..B1.17 # Prob 82% #12.9
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
..B1.19: # Preds ..B1.17 ..B1.5 ..B1.10 ..B1.15
# Execution count [5.00e+00]
incq %r10 #9.5
lea (%r8,%rcx,8), %r8 #9.5
cmpq -48(%rbp), %r10 #9.5[spill]
jb ..B1.5 # Prob 82% #9.5
# LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
..B1.20: # Preds ..B1.19
# Execution count [9.00e-01]
movq -64(%rbp), %rdx #[spill]
movq -96(%rbp), %r15 #[spill]
movq -56(%rbp), %r14 #[spill]
movq -104(%rbp), %r13 #[spill]
movq -112(%rbp), %r12 #[spill]
.cfi_restore 12
# LOE rdx r11 r12 r13 r14 r15
..B1.21: # Preds ..B1.3 ..B1.20
# Execution count [1.00e+00]
addq $8, %rdx #16.5
addq $8, %r11 #16.5
movq %rdx, %rdi #16.5
movq %r11, %rsi #16.5
vzeroupper #16.5
..___tag_value_jacobi2D5pt.12:
# dummy(double *, double *)
call dummy #16.5
..___tag_value_jacobi2D5pt.13:
# LOE r12 r13 r14 r15
..B1.22: # Preds ..B1.21
# Execution count [1.00e+00]
movq %r15, %rdx #16.5
movq %r13, %rax #16.5
addq $31, %rax #16.5
andq $-32, %rax #16.5
addq %rax, %rsp #16.5
# LOE r12 r13 r14
..B1.23: # Preds ..B1.22
# Execution count [1.00e+00]
movq %r14, %rdx #16.5
movq %r13, %rax #16.5
addq $31, %rax #16.5
andq $-32, %rax #16.5
addq %rax, %rsp #16.5
# LOE r12
..B1.24: # Preds ..B1.23
# Execution count [1.00e+00]
lea -24(%rbp), %rsp #17.1
.cfi_restore 15
popq %r15 #17.1
.cfi_restore 14
popq %r14 #17.1
.cfi_restore 13
popq %r13 #17.1
popq %rbp #17.1
.cfi_restore 6
movq %rbx, %rsp #17.1
popq %rbx #17.1
.cfi_def_cfa 7, 8
.cfi_restore 3
ret #17.1
.cfi_def_cfa 3, 16
.cfi_offset 3, -16
.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
# LOE
..B1.25: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r15d, %r15d #12.9
jmp ..B1.15 # Prob 100% #12.9
# LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
..B1.26: # Preds ..B1.6
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #12.9
jmp ..B1.10 # Prob 100% #12.9
.align 16,0x90
# LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
.cfi_endproc
# mark_end;
.type jacobi2D5pt,@function
.size jacobi2D5pt,.-jacobi2D5pt
.data
# -- End jacobi2D5pt
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,16 +0,0 @@
void jacobi2D5pt(int N, int M){
void dummy(double*, double*);
double a[M][N];
double b[M][N];
double s;
for(int j=1; j<M-1; ++j){
#pragma vector aligned
//STARTLOOP
for(int i=1; i<N-1; ++i){
b[j][i] = ( a[j][i-1] + a[j][i+1] + a[j-1][i] + a[j+1][i]) * s;
}
}
dummy(&a[1][1], &b[1][1]);
}

View File

@@ -1,13 +0,0 @@
void daxpy(int N){
void dummy(double*, double*);
double a[N], b[N];
double s;
//STARTLOOP
for(int i=0; i<N; ++i)
a[i] = a[i] + s * b[i];
dummy(&a[1], &b[1]);
}

View File

@@ -1,13 +0,0 @@
void scale(int N){
void dummy(double*, double*);
double a[N], b[N];
double s;
//STARTLOOP
for(int i=0; i<N; ++i){
a[i] = s * b[i];
}
dummy(&a[1],&b[1]);
}

Binary file not shown.

Binary file not shown.

View File

@@ -1,199 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
.file "taxCalc.c"
.text
..TXTST0:
# -- Begin main
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main(void)
main:
..B1.1: # Preds ..B1.0
.cfi_startproc
..___tag_value_main.1:
..L2:
#4.15
pushq %rbp #4.15
.cfi_def_cfa_offset 16
movq %rsp, %rbp #4.15
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #4.15
subq $4096, %rsp #4.15
movl $104446, %esi #4.15
movl $3, %edi #4.15
call __intel_new_feature_proc_init #4.15
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
vstmxcsr (%rsp) #4.15
movl $.2.3_2_kmpc_loc_struct_pack.3, %edi #4.15
xorl %esi, %esi #4.15
orl $32832, (%rsp) #4.15
xorl %eax, %eax #4.15
vldmxcsr (%rsp) #4.15
..___tag_value_main.6:
call __kmpc_begin #4.15
..___tag_value_main.7:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
movl $il0_peep_printf_format_0, %edi #5.5
call puts #5.5
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
vmovss .L_2il0floatpacket.0(%rip), %xmm0 #8.15
xorl %eax, %eax #11.5
vxorps %xmm1, %xmm1, %xmm1 #9.5
vmovss %xmm1, (%rsp) #9.5
movl $111,%ebx #IACA START
.byte 100,103,144 #IACA START
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
..B1.4: # Preds ..B1.4 ..B1.3
lea 1(%rax,%rax), %edx #12.9
vcvtsi2ss %edx, %xmm2, %xmm2 #12.27
vmulss %xmm2, %xmm0, %xmm3 #12.29
lea 2(%rax,%rax), %ecx #12.9
vaddss %xmm3, %xmm1, %xmm4 #12.29
vxorps %xmm1, %xmm1, %xmm1 #12.27
vcvtsi2ss %ecx, %xmm1, %xmm1 #12.27
vmulss %xmm1, %xmm0, %xmm5 #12.29
vmovss %xmm4, 4(%rsp,%rax,8) #12.9
vaddss %xmm5, %xmm4, %xmm1 #12.29
vmovss %xmm1, 8(%rsp,%rax,8) #12.9
incq %rax #11.5
cmpq $499, %rax #11.5
jb ..B1.4 # Prob 99% #11.5
movl $222,%ebx #IACA END
.byte 100,103,144 #IACA END
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
..B1.5: # Preds ..B1.4
vmovss 3992(%rsp), %xmm0 #12.18
movl $il0_peep_printf_format_1, %edi #15.5
vaddss .L_2il0floatpacket.1(%rip), %xmm0, %xmm1 #12.29
vmovss %xmm1, 3996(%rsp) #12.9
call puts #15.5
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
movl $.2.3_2_kmpc_loc_struct_pack.14, %edi #16.12
xorl %eax, %eax #16.12
..___tag_value_main.8:
call __kmpc_end #16.12
..___tag_value_main.9:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
xorl %eax, %eax #16.12
movq %rbp, %rsp #16.12
popq %rbp #16.12
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #16.12
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.3:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.2
.align 4
.2.3_2__kmpc_loc_pack.2:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 52
.byte 59
.byte 52
.byte 59
.byte 59
.space 1, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.14:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.13
.align 4
.2.3_2__kmpc_loc_pack.13:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 49
.byte 54
.byte 59
.byte 49
.byte 54
.byte 59
.byte 59
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
il0_peep_printf_format_0:
.long 1128354639
.long 1702109249
.long 1931506803
.long 1953653108
.byte 0
.space 3, 0x00 # pad
.align 4
il0_peep_printf_format_1:
.long 1128354639
.long 1702109249
.long 1696625779
.word 25710
.byte 0
.data
# -- End main
.section .rodata, "a"
.align 4
.align 4
.L_2il0floatpacket.0:
.long 0x3e428f5c
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.align 4
.L_2il0floatpacket.1:
.long 0x433dcf5c
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,4
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,18 +0,0 @@
#include <stdio.h>
//#include "iacaMarks.h"
// Test kernel: fills arr with a running sum, arr[i] = arr[i-1] + i*tax for
// i = 1..999, and prints a line before and after the loop.
// Always returns 0.
int main(void){
printf("OSACA test start\n");
int i = 1;
float arr[1000];
// 0.19 is a double literal; it is converted to float on initialization.
float tax = 0.19;
arr[0] = 0;
// NOTE(review): the STARTLOOP comment below is presumably a marker that the
// analysis tooling searches for to locate the kernel loop - leave it in place.
//STARTLOOP
while(i < 1000){
// Each iteration reads the element written by the previous iteration.
arr[i] = arr[i-1]+i*tax;
i += 1;
}
// arr is not read again after the loop.
printf("OSACA test end\n");
return 0;
}

View File

@@ -1,12 +0,0 @@
// Test kernel: element-wise a[i] = b[i] + c[i] * d[i] over four
// variable-length arrays of N doubles each.
void triad(int N){
// Block-scope declaration of an externally defined function.
void dummy(double*);
// b, c and d are never initialized before they are read in the loop.
double a[N], b[N], c[N], d[N];
// s is declared but never used.
double s;
// NOTE(review): the STARTLOOP comment below is presumably a marker that the
// analysis tooling searches for to locate the kernel loop - leave it in place.
//STARTLOOP
for(int i=0; i<N; ++i)
a[i] = b[i] + c[i] * d[i];
// NOTE(review): presumably passes a pointer into a to an opaque function so the
// compiler cannot discard the loop - confirm. &a[1] requires N >= 1.
dummy(&a[1]);
}

View File

@@ -1,2 +1,10 @@
"""Open Source Architecture Code Analyzer"""
name = 'osaca'
__version__ = '0.2.2'
__version__ = '0.3.1.dev1'
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__
# 2. commit to RRZE-HPC/osaca's master branch
# 3. wait for travis to complete successfully (unless already tested)
# 4. tag commit with 'v{}'.format(__version__) (`git tag vX.Y.Z`)
# 5. push tag to github (`git push origin vX.Y.Z` or push all tags with `git push --tags`)

8
osaca/api/__init__.py Normal file
View File

@@ -0,0 +1,8 @@
"""
APIs for handling interfaces to kerncraft, etc.
Only the classes below will be exported, so please add new semantic tools to __all__.
"""
from .kerncraft_interface import KerncraftAPI
__all__ = ['KerncraftAPI']

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import collections
import sys
from io import StringIO
from osaca.frontend import Frontend
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
SemanticsAppender, reduce_to_section)
# Stolen from https://stackoverflow.com/a/16571630
class Capturing(list):
    """Context manager that collects everything written to ``sys.stdout``.

    While the ``with`` block is active, ``sys.stdout`` is replaced by an
    in-memory ``StringIO``. On exit the captured text is split into lines and
    appended to this list, and the previous ``sys.stdout`` is put back.
    """

    def __enter__(self):
        # Remember the current stream so that __exit__ can restore it.
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        return self

    def __exit__(self, *args):
        try:
            self.extend(self._stringio.getvalue().splitlines())
        finally:
            # Always restore stdout, even if collecting the output failed;
            # otherwise all later prints would silently vanish into the buffer.
            del self._stringio  # free up some memory
            sys.stdout = self._stdout
class KerncraftAPI(object):
    """Programmatic access to the analysis results for one assembly kernel.

    On construction the input is parsed with the parser matching the machine
    model's ISA, reduced with ``reduce_to_section`` and annotated through
    ``SemanticsAppender.add_semantics``. The remaining methods query that
    annotated kernel.
    """

    def __init__(self, arch, code):
        """Parse ``code`` and annotate it for the architecture ``arch``.

        :param arch: architecture identifier passed to ``MachineModel(arch=...)``
        :param code: assembly input passed to the parser's ``parse_file``
        :raises ValueError: if the machine model reports an ISA other than
            ``aarch64`` or ``x86``
        """
        self.machine_model = MachineModel(arch=arch)
        self.semantics = SemanticsAppender(self.machine_model)
        isa = self.machine_model.get_ISA().lower()
        if isa == 'aarch64':
            self.parser = ParserAArch64v81()
        elif isa == 'x86':
            self.parser = ParserX86ATT()
        else:
            # Without this branch ``self.parser`` stayed unset and the next line
            # failed with an unrelated-looking AttributeError.
            raise ValueError('Unsupported ISA {!r} for architecture {!r}'.format(isa, arch))
        parsed_code = self.parser.parse_file(code)
        self.kernel = reduce_to_section(parsed_code, self.machine_model.get_ISA())
        self.semantics.add_semantics(self.kernel)

    def create_output(self, verbose=False):
        """Return the text printed by ``Frontend.print_full_analysis`` as one string."""
        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
        frontend = Frontend(arch=self.machine_model.get_arch())
        # The frontend prints to stdout, so capture it instead of returning a value.
        with Capturing() as output:
            frontend.print_full_analysis(self.kernel, kernel_graph, verbose=verbose)
        return '\n'.join(output)

    def get_unmatched_instruction_ratio(self):
        """Return the fraction of kernel instructions flagged with both
        ``TP_UNKWN`` and ``LT_UNKWN``.

        An empty kernel yields ``0.0`` instead of raising ``ZeroDivisionError``.
        """
        if not self.kernel:
            return 0.0
        unmatched_counter = 0
        for instruction in self.kernel:
            flags = instruction['flags']
            if INSTR_FLAGS.TP_UNKWN in flags and INSTR_FLAGS.LT_UNKWN in flags:
                unmatched_counter += 1
        return unmatched_counter / len(self.kernel)

    def get_port_occupation_cycles(self):
        """Return an ``OrderedDict`` pairing each entry of the machine model's
        ``'ports'`` with the corresponding value of ``get_throughput_sum``."""
        throughput_values = self.semantics.get_throughput_sum(self.kernel)
        port_names = self.machine_model['ports']
        return collections.OrderedDict(zip(port_names, throughput_values))

    def get_total_throughput(self):
        """Return the largest value reported by ``get_throughput_sum`` for the kernel."""
        return max(self.semantics.get_throughput_sum(self.kernel))

    def get_latency(self):
        """Return the tuple ``(get_lcd(), get_cp())``."""
        return (self.get_lcd(), self.get_cp())

    def get_cp(self):
        """Return the sum of ``'latency_cp'`` over the kernel graph's critical path."""
        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
        kernel_cp = kernel_graph.get_critical_path()
        return sum(x['latency_cp'] for x in kernel_cp)

    def get_lcd(self):
        """Return the largest summed ``'latency_lcd'`` over all loop-carried
        dependency chains, or ``0.0`` if there are none."""
        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
        lcd_dict = kernel_graph.get_loopcarried_dependencies()
        lcd = 0.0
        for dep in lcd_dict:
            lcd_tmp = sum(x['latency_lcd'] for x in lcd_dict[dep]['dependencies'])
            lcd = max(lcd, lcd_tmp)
        return lcd

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env python3
from param import Register, MemAddr, Parameter
from testcase import Testcase
# Script body: builds one Testcase from the mnemonic and operands configured in
# the USER INPUT section below and writes it out via Testcase.write_testcase().
# Choose out of various operands
reg8 = Register('al')
reg16 = Register('ax')
reg32 = Register('eax')
reg64 = Register('rax')
xmm = Register('xmm0')
ymm = Register('ymm0')
zmm = Register('zmm0')
mem0 = MemAddr('(%rax, %esi, 4)')
imd1 = Parameter('IMD')
# -----------------------------------------------
# -USER INPUT------------------------------------
# -----------------------------------------------
# Enter your mnemonic
mnemonic = 'add'
# Define your operands. If you don't need it, just type in None
dst = mem0
op1 = imd1
op2 = None
# Define the number of instructions per loop (default: 12)
# Note: the value is handed to Testcase as a string, not as an int.
per_loop = '32'
# -----------------------------------------------
# -----------------------------------------------
# Start
# Drop the operands that were set to None above.
operands = [x for x in [dst, op1, op2] if x is not None]
opListStr = ', '.join([str(x) for x in operands])
# end='' keeps the cursor on the same line so the status below is appended to it.
print('Create Testcase for {} {}'.format(mnemonic, opListStr), end='')
tc = Testcase(mnemonic, operands, per_loop)
tc.write_testcase()
print(' --------> SUCCEEDED')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

540
osaca/data/csx.yml Normal file
View File

@@ -0,0 +1,540 @@
osaca_version: 0.3.2
micro_architecture: Cascade Lake SP
arch_code: CSX
isa: x86
ROB_size: 224
retired_uOps_per_cycle: 4
scheduler_size: 97
hidden_loads: false
load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0, zmm: 4.0}
load_throughput:
- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
ports: ['0', 0DV, '1', '2', 2D, '3', 3D, '4', '5', '6', '7']
port_model_scheme: |
┌------------------------------------------------------------------------┐
| 97 entry unified scheduler |
└------------------------------------------------------------------------┘
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
┌-------┐ ┌-------┐ ┌-----┐ ┌-----┐ ┌-----┐ ┌-------┐ ┌--------┐ ┌-----┐
| ALU | | ALU | | LD | | LD | | ST | | ALU | | ALU & | | AGU |
└-------┘ └-------┘ └-----┘ └-----┘ └-----┘ └-------┘ | Shift | └-----┘
┌-------┐ ┌-------┐ ┌-----┐ ┌-----┐ ┌-------┐ └--------┘
| 2ND | | Fast | | AGU | | AGU | | Fast |
| BRANCH| | LEA | └-----┘ └-----┘ | LEA |
└-------┘ └-------┘ └-------┘
┌-------┐ ┌-------┐ ┌-------┐
|AVX DIV| |AVX FMA| | AVX |
└-------┘ └-------┘ | SHUF |
┌-------┐ ┌-------┐ └-------┘
|AVX FMA| |AVX MUL| ┌-------┐
└-------┘ └-------┘ |AVX-512|
┌-------┐ ┌-------┐ | FMA |
|AVX MUL| |AVX ADD| └-------┘
└-------┘ └-------┘ ┌-------┐
┌-------┐ ┌-------┐ |AVX-512|
|AVX ADD| |AVX ALU| | ADD |
└-------┘ └-------┘ └-------┘
┌-------┐ ┌-------┐ ┌-------┐
|AVX ALU| | AVX | |AVX-512|
└-------┘ | Shift | | MUL |
┌-------┐ └-------┘ └-------┘
| AVX | ┌-------┐ ┌-------┐
| Shift | | Slow | |AVX-512|
└-------┘ | LEA | | ALU |
┌-------┐ └-------┘ └-------┘
| VNNI | ┌-------┐
└-------┘ | VNNI |
└-------┘
instruction_forms:
- name: addsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: addss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: addl
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p0156
port_pressure: [[1, '0156']]
- name: addq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p0156
port_pressure: [[1, '0156']]
- name: cmpl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p0156
port_pressure: [[1, '0156']]
- name: cmpq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p0156
port_pressure: [[1, '0156']]
- name: incq
operands:
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p0156
port_pressure: [[1, '0156']]
- name: ja
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: jb
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: jne
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: mulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: mulss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: movl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.0
latency: 0.0
port_pressure: []
- name: movq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.0
latency: 0.0
port_pressure: []
- name: movq
operands:
- class: memory
base: gpr
offset: ~
index: ~
scale: 1
- class: register
name: gpr
throughput: 0.5
latency: 3.0 # 1*p23+1*p2D3D
port_pressure: [[1, '23'], [1, [2D, 3D]]]
- name: movq
operands:
- class: register
name: gpr
- class: memory
base: gpr
offset: imd
index: ~
scale: 1
throughput: 0.5
latency: 2.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: rcpss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 1.0
latency: 4.0
port_pressure: ~
- name: sqrtsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 6.0
latency: 22.0 # 1*p0+6*p0DV
port_pressure: [[1, '0'], [6.0, [0DV]]]
- name: sqrtss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 3.0
latency: 16.0 # 1*p0+3*p0DV
port_pressure: [[1, '0'], [3.0, [0DV]]]
- name: subq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p0156
port_pressure: [[1, '0156']]
- name: vaddpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vaddpd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vaddsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vaddss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vdivsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 4.0
latency: 14.0 # 1*p0+4*p0DV
port_pressure: [[1, '0'], [4.0, [0DV]]]
- name: vdivss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 3.0
latency: 11.0 # 1*p0+3*p0DV
port_pressure: [[1, '0'], [3.0, [0DV]]]
- name: vfmadd213pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vfmadd132pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vfmadd231pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vfmadd132pd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vfmadd213pd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vfmadd231pd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmovapd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
- class: register
name: xmm
throughput: 1.0
latency: 4.0 # 1*p23+1*p2D3D
port_pressure: [[1, '23'], [1, [2D, 3D]]]
- name: vmovapd
operands:
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 1*p23+1*p2D3D
port_pressure: [[1, '23'], [1, [2D, 3D]]]
- name: vmovapd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: vmovapd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 5.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 5.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 5.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 5.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]
- name: vmovupd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p23+1*p2D3D
port_pressure: [[1, '23'], [1, [2D, 3D]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p23+1*p4
port_pressure: [[1, '23'], [1, '4']]

File diff suppressed because it is too large Load Diff

374
osaca/data/isa/aarch64.yml Normal file
View File

@@ -0,0 +1,374 @@
osaca_version: 0.3.0
isa: "AArch64"
# Contains all operand-irregular instruction forms OSACA supports for AArch64.
# Operand-regular for an AArch64 instruction form with N operands in the shape of
# mnemonic op1 ... opN
# means that op1 is the only destination operand and op2 to op(N) are source operands.
instruction_forms:
- name: "fmla"
operands:
- class: "register"
prefix: "v"
shape: "s"
source: true
destination: true
- class: "register"
prefix: "v"
shape: "s"
source: true
destination: false
- class: "register"
prefix: "v"
shape: "s"
source: true
destination: false
- name: "fmla"
operands:
- class: "register"
prefix: "v"
shape: "d"
source: true
destination: true
- class: "register"
prefix: "v"
shape: "d"
source: true
destination: false
- class: "register"
prefix: "v"
shape: "d"
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "d"
source: false
destination: true
- class: "register"
prefix: "d"
source: false
destination: true
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "d"
source: false
destination: true
- class: "register"
prefix: "d"
source: false
destination: true
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "d"
source: false
destination: true
- class: "register"
prefix: "d"
source: false
destination: true
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "q"
source: false
destination: true
- class: "register"
prefix: "q"
source: false
destination: true
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "q"
source: false
destination: true
- class: "register"
prefix: "q"
source: false
destination: true
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "q"
source: false
destination: true
- class: "register"
prefix: "q"
source: false
destination: true
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: true
destination: false
- name: "ldp"
operands:
- class: "register"
prefix: "q"
source: false
destination: true
- class: "register"
prefix: "q"
source: false
destination: true
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: true
post-indexed: false
source: true
destination: true
- name: "stp"
operands:
- class: "register"
prefix: "d"
source: true
destination: false
- class: "register"
prefix: "d"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "stp"
operands:
- class: "register"
prefix: "d"
source: true
destination: false
- class: "register"
prefix: "d"
source: true
destination: false
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "stp"
operands:
- class: "register"
prefix: "q"
source: true
destination: false
- class: "register"
prefix: "q"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "stp"
operands:
- class: "register"
prefix: "q"
source: true
destination: false
- class: "register"
prefix: "q"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: True
source: false
destination: true
- name: "stp"
operands:
- class: "register"
prefix: "q"
source: true
destination: false
- class: "register"
prefix: "q"
source: true
destination: false
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "x"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "d"
source: true
destination: false
- class: "memory"
base: "x"
offset: "imd"
index: ~
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "d"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "q"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: "x"
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "q"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "x"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
source: false
destination: true
- name: "str"
operands:
- class: "register"
prefix: "x"
source: true
destination: false
- class: "memory"
base: "x"
offset: ~
index: "x"
scale: 1
pre-indexed: false
post-indexed: false
source: false
destination: true

218
osaca/data/isa/x86.yml Normal file
View File

@@ -0,0 +1,218 @@
osaca_version: 0.3.0
isa: "x86"
# Contains all operand-irregular instruction forms OSACA supports for x86.
# Operand-regular for an x86 AT&T instruction form with N operands in the shape of
# mnemonic op1 ... opN
# means that opN is the only destination operand and op1 to op(N-1) are source operands.
instruction_forms:
- name: addl
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: true
- name: addq
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: true
- name: addsd
operands:
- class: "register"
name: "xmm"
source: true
destination: true
- class: "register"
name: "xmm"
source: true
destination: false
- name: addss
operands:
- class: "register"
name: "xmm"
source: true
destination: true
- class: "register"
name: "xmm"
source: true
destination: false
- name: cmpl
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: false
- name: cmpq
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: false
- name: cmpq
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "memory"
base: "gpr"
offset: "imd"
index: ~
scale: 1
source: true
destination: false
- name: ja
operands:
- class: "identifier"
source: true
destination: false
- name: mulsd
operands:
- class: "register"
name: "xmm"
source: true
destination: true
- class: "register"
name: "xmm"
source: true
destination: false
- name: mulss
operands:
- class: "register"
name: "xmm"
source: true
destination: true
- class: "register"
name: "xmm"
source: true
destination: false
- name: subq
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: true
- name: vfmadd132pd
operands:
- class: "memory"
base: "gpr"
offset: ~
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true
- name: vfmadd132pd
operands:
- class: "memory"
base: "gpr"
offset: "imd"
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true
- name: vfmadd213pd
operands:
- class: "memory"
base: "gpr"
offset: ~
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true
- name: vfmadd213pd
operands:
- class: "memory"
base: "gpr"
offset: "imd"
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true
- name: vfmadd231pd
operands:
- class: "memory"
base: "gpr"
offset: "imd"
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true
- name: vfmadd231pd
operands:
- class: "memory"
base: "gpr"
offset: ~
index: "gpr"
scale: 1
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: false
- class: "register"
name: "ymm"
source: true
destination: true

File diff suppressed because it is too large Load Diff

View File

@@ -1,222 +1,202 @@
#!/usr/bin/env python3
from collections import defaultdict, OrderedDict
import xml.etree.ElementTree as ET
import re
import sys
import argparse
import sys
import xml.etree.ElementTree as ET
from distutils.version import StrictVersion
from osaca.param import Parameter, Register
from osaca.eu_sched import Scheduler
from osaca.parser import get_parser
from osaca.semantics import MachineModel
def normalize_reg_name(reg_name):
    """Return ``reg_name`` in the canonical spelling used for lookups.

    Surrounding whitespace is removed, mask registers written as ``{K0}`` to
    ``{K7}`` (the uops.info notation) lose their curly brackets, and
    ``ST(0)`` to ``ST(7)`` lose their parentheses.
    """
    trimmed = reg_name.strip()
    # masks are denoted with curly brackets in uops.info
    unbraced = re.sub(r'{K([0-7])}', r'K\1', trimmed)
    return re.sub(r'ST\(([0-7])\)', r'ST\1', unbraced)
# NOTE(review): this span shows two revisions of the same helper interleaved
# (port_pressure_from_tag_attributes and port_occupancy_from_tag_attributes, each
# with its own return statement). As displayed it is not one runnable definition;
# the notes below describe the visible statements only.
def port_pressure_from_tag_attributes(attrib):
# '1*p015+1*p1+1*p23+1*p4+3*p5' ->
# [[1, '015'], [1, '1'], [1, '23'], [1, '4'], [3, '5']]
port_occupation = []
# Every '+'-separated term is expected to look like '<cycles>*p<ports>'.
for p in attrib['ports'].split('+'):
cycles, ports = p.split('*p')
port_occupation.append([int(cycles), ports])
def port_occupancy_from_tag_attributes(attrib, arch):
occupancy = defaultdict(int)
for k, v in attrib.items():
m = re.match('^port([0-9]+)', k)
if not m:
continue
ports = m.group(1)
# Ignore Port7 on HSW, BDW, SKL and SKX if present in combination with ports 2 and 3.
# Port7 is only used for simple address generation, while 2 and 3 handle all addressing,
# but uops.info does not differentiate.
if arch in ['HSW', 'BDW', 'SKL', 'SKX'] and ports == '237':
ports = ports.replace('7', '')
potential_ports = list(ports)
# The cycle count is spread evenly over all candidate ports.
per_port_occupancy = int(v) / len(potential_ports)
for pp in potential_ports:
occupancy[pp] += per_port_occupancy
# Also consider DIV pipeline
# Also
if 'div_cycles' in attrib:
occupancy['0DV'] = int(attrib['div_cycles'])
# NOTE(review): the divider entry stores its ports as a list (['DIV']), whereas the
# entries built from attrib['ports'] store a plain string of port digits.
port_occupation.append([int(attrib['div_cycles']), ['DIV']])
return dict(occupancy)
return port_occupation
# NOTE(review): this span shows two revisions of extract_paramters interleaved (one
# building string descriptors, one building dict descriptors via `parser`/`isa`).
# As displayed it is not one runnable definition; the notes below describe the
# visible statements only.
def extract_paramters(instruction_tag):
def extract_paramters(instruction_tag, parser, isa):
# Extract parameter components
parameters = [] # used to store string representations
# Operands are processed in the order of their 'idx' attribute.
parameter_tags = sorted(instruction_tag.findall("operand"),
key=lambda p: int(p.attrib['idx']))
parameter_tags = sorted(instruction_tag.findall("operand"), key=lambda p: int(p.attrib['idx']))
for parameter_tag in parameter_tags:
parameter = {}
# Ignore parameters with suppressed=1
if int(parameter_tag.attrib.get('suppressed', '0')):
continue
p_type = parameter_tag.attrib['type']
if p_type == 'imm':
parameters.append('imd') # Parameter('IMD')
parameter['class'] = 'immediate'
parameter['imd'] = 'int'
parameters.append(parameter)
elif p_type == 'mem':
parameters.append('mem') # Parameter('MEM')
parameter['class'] = 'memory'
parameter['base'] = 'gpr'
parameter['offset'] = None
parameter['index'] = None
parameter['scale'] = 1
parameters.append(parameter)
elif p_type == 'reg':
possible_regs = [normalize_reg_name(r)
for r in parameter_tag.text.split(',')]
reg_groups = [Register.sizes.get(r, None) for r in possible_regs]
if reg_groups[1:] == reg_groups[:-1]:
if reg_groups[0] is None:
raise ValueError("Unknown register type for {} with {}.".format(
parameter_tag.attrib, parameter_tag.text))
elif reg_groups[0][1] == 'GPR':
parameters.append('r{}'.format(reg_groups[0][0]))
# Register(possible_regs[0]))
elif '{' in parameter_tag.text:
# We have a mask
parameters[-1] += '{opmask}'
parameter['class'] = 'register'
possible_regs = [parser.parse_register('%' + r) for r in parameter_tag.text.split(',')]
# Only the first register alternative is inspected from here on.
if possible_regs[0] is None:
raise ValueError(
'Unknown register type for {} with {}.'.format(
parameter_tag.attrib, parameter_tag.text
)
)
if isa == 'x86':
if parser.is_vector_register(possible_regs[0]['register']):
# Keep only the first three characters of the lower-cased register name.
possible_regs[0]['register']['name'] = possible_regs[0]['register'][
'name'
].lower()[:3]
if 'mask' in possible_regs[0]['register']:
possible_regs[0]['register']['mask'] = True
else:
parameters.append(reg_groups[0][1].lower())
possible_regs[0]['register']['name'] = 'gpr'
elif isa == 'aarch64':
# NOTE(review): possible_regs is a list, so indexing it with 'register' raises
# TypeError; presumably possible_regs[0]['register']['name'] was intended - confirm.
del possible_regs['register']['name']
for key in possible_regs[0]['register']:
parameter[key] = possible_regs[0]['register'][key]
parameters.append(parameter)
elif p_type == 'relbr':
parameters.append('LBL')
parameter['class'] = 'identifier'
parameters.append(parameter)
elif p_type == 'agen':
parameters.append('mem')
# FIXME actually only address generation
parameter['class'] = 'memory'
parameter['base'] = 'gpr'
parameter['offset'] = None
parameter['index'] = None
parameter['scale'] = 1
parameters.append(parameter)
# NOTE(review): the same dict is appended a second time right below; if both lines
# belong to one revision, 'agen' operands are listed twice - confirm.
parameters.append(parameter)
else:
raise ValueError("Unknown paramter type {}".format(parameter_tag.attrib))
return parameters
# NOTE(review): this span shows two revisions of extract_model interleaved (one
# collecting tuples in `model_data`, one filling a MachineModel `mm`). As displayed
# it is not one runnable definition; the notes below describe the visible
# statements only.
def extract_model(tree, arch):
model_data = []
for instruction_tag in tree.findall('//instruction'):
isa = MachineModel.get_isa_for_arch(arch)
mm = MachineModel(isa=isa)
parser = get_parser(isa)
for instruction_tag in tree.findall('.//instruction'):
ignore = False
mnemonic = instruction_tag.attrib['asm']
# Extract parameter components
try:
parameters = extract_paramters(instruction_tag)
parameters = extract_paramters(instruction_tag, parser, isa)
if isa == 'x86':
parameters.reverse()
except ValueError as e:
print(e, file=sys.stderr)
# NOTE(review): no `continue` is visible after this handler, so `parameters` may be
# unbound or left over from the previous instruction when it is used below - confirm.
# Extract port occupation, throughput and latency
port_occupancy, throughput, latency = [], 0.0, None
arch_tag = instruction_tag.find('architecture[@name="'+arch+'"]')
port_pressure, throughput, latency, uops = [], None, None, None
arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
# Instructions without an entry for the requested architecture are skipped.
if arch_tag is None:
continue
# We collect all measurement and IACA information and compare them later
for measurement_tag in arch_tag.iter('measurement'):
port_occupancy.append(port_occupancy_from_tag_attributes(measurement_tag.attrib, arch))
# FIXME handle min/max Latencies ('maxCycles' and 'minCycles')
# NOTE(review): this variant filters on a 'latency' attribute but reads 'cycles'.
latencies = [int(l_tag.attrib['cycles'])
for l_tag in measurement_tag.iter('latency') if 'latency' in l_tag.attrib]
if 'TP_ports' in measurement_tag.attrib:
throughput = measurement_tag.attrib['TP_ports']
else:
throughput = (
measurement_tag.attrib['TP'] if 'TP' in measurement_tag.attrib else None
)
uops = (
int(measurement_tag.attrib['uops']) if 'uops' in measurement_tag.attrib else None
)
if 'ports' in measurement_tag.attrib:
port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
latencies = [
int(l_tag.attrib['cycles'])
for l_tag in measurement_tag.iter('latency')
if 'cycles' in l_tag.attrib
]
# Fall back to 'max_cycles' when no latency tag carries a 'cycles' attribute.
if len(latencies) == 0:
latencies = [
int(l_tag.attrib['max_cycles'])
for l_tag in measurement_tag.iter('latency')
if 'max_cycles' in l_tag.attrib
]
# Comparing the list with itself shifted by one is true only if all entries are equal.
if latencies[1:] != latencies[:-1]:
print("Contradicting latencies found:", mnemonic, file=sys.stderr)
ignore = True
elif latencies:
latency = latencies[0]
# Ordered by IACA version (newest last)
for iaca_tag in sorted(arch_tag.iter('IACA'),
key=lambda i: StrictVersion(i.attrib['version'])):
port_occupancy.append(port_occupancy_from_tag_attributes(iaca_tag.attrib, arch))
if ignore: continue
for iaca_tag in sorted(
arch_tag.iter('IACA'), key=lambda i: StrictVersion(i.attrib['version'])
):
if 'ports' in iaca_tag.attrib:
port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))
if ignore:
continue
# Check if all are equal
if port_occupancy:
if port_occupancy[1:] != port_occupancy[:-1]:
print("Contradicting port occupancies, using latest IACA:", mnemonic,
file=sys.stderr)
port_occupancy = port_occupancy[-1]
throughput = max(list(port_occupancy.values())+[0.0])
if port_pressure:
if port_pressure[1:] != port_pressure[:-1]:
print(
"Contradicting port occupancies, using latest IACA:", mnemonic, file=sys.stderr
)
# The last collected entry wins (IACA entries are appended after the measurements).
port_pressure = port_pressure[-1]
# Add missing ports:
for ports in [pp[1] for pp in port_pressure]:
for p in ports:
mm.add_port(p)
# This overwrites the throughput read from the 'TP_ports'/'TP' attributes above.
throughput = max(mm.average_port_pressure(port_pressure))
else:
# print("No data available for this architecture:", mnemonic, file=sys.stderr)
continue
# ---------------------------------------------
mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)
for m, p in build_variants(mnemonic, parameters):
model_data.append((m.lower() + '-' + '_'.join(p),
throughput, latency, port_occupancy))
return model_data
def all_or_false(iterator):
    """Return True only if *iterator* is non-empty and all of its items are truthy.

    Unlike the built-in ``all``, empty input yields ``False`` instead of ``True``.

    :param iterator: an iterable of values (list, tuple, generator, ...); a falsy
        argument such as ``None`` or an empty list yields ``False``
    :returns: ``False`` for empty/falsy input, otherwise ``all()`` of the items
    """
    if not iterator:
        return False
    # A generator or other lazy iterator is always truthy, so the check above
    # cannot detect that it is empty. Materialize it first; otherwise an empty
    # generator would reach all(), which returns True for empty input.
    items = list(iterator)
    if not items:
        return False
    return all(items)
def build_variants(mnemonic, parameters):
    """Yield all reasonable variants of this instruction form.

    Produced in this order:

    1. the upper-cased mnemonic together with the parameters as given,
    2. the same mnemonic with every ``{opmask}`` marker removed from the
       parameters (only if at least one parameter contains the marker),
    3. for each size suffix (``Q``, ``L``, ``W``, ``B``) the mnemonic does not
       already end with: mnemonic + suffix, provided there is at least one
       parameter other than ``mem``/``imd`` and every such parameter equals the
       register class associated with the suffix.
    """
    name = mnemonic.upper()

    # The one that was given
    yield name, parameters

    # Without opmask
    opmask_hits = sum(1 for param in parameters if '{opmask}' in param)
    if opmask_hits:
        yield name, [param.replace('{opmask}', '') for param in parameters]

    # With suffix (assuming suffix was not already present)
    register_for_suffix = {'Q': 'r64',
                           'L': 'r32',
                           'W': 'r16',
                           'B': 'r8'}
    for suffix, reg_class in register_for_suffix.items():
        if name.endswith(suffix):
            continue
        # Only register-like parameters decide; memory and immediate operands
        # are ignored. An empty list means "no evidence" and yields nothing.
        matches = [param == reg_class for param in parameters
                   if param not in ('mem', 'imd')]
        if matches and all(matches):
            yield name + suffix, parameters
return mm
def architectures(tree):
    """Return the set of ``name`` attributes of all ``architecture`` elements below *tree*."""
    names = set()
    for arch_element in tree.findall('.//architecture'):
        names.add(arch_element.attrib['name'])
    return names
def int_or_zero(s):
    """Convert *s* with ``int()``; return 0 if that raises ``ValueError``."""
    try:
        value = int(s)
    except ValueError:
        # Not parseable as an integer (e.g. a non-numeric port name).
        value = 0
    return value
def dump_csv(model_data, arch):
    """Render *model_data* as CSV text starting with the header ``instr,TP,LT,ports``.

    Each entry of *model_data* is unpacked as ``(mnemonic, throughput, latency,
    port_occupancy)`` with ``port_occupancy`` being a dict keyed by port name.
    Ports missing from an entry are added to its dict with ``0.0`` (the dicts are
    modified in place), so every row lists one value per port, ordered by the
    sorted port names.

    :param model_data: sequence of 4-tuples as described above (iterated twice)
    :param arch: key into ``Scheduler.arch_dict`` / ``Scheduler.arch_pipeline_ports``
    :returns: the CSV document as a single string
    """
    # Gather every port name used by any entry.
    seen_ports = set()
    for _, _, _, occupancy in model_data:
        seen_ports.update(occupancy)
    ports = sorted(seen_ports)

    # If not all ports have been used (happens with port7 due to blacklist
    # port_occupancy_from_tag_attributes), extend list accordingly:
    while len(ports) < (
        Scheduler.arch_dict[arch] + len(Scheduler.arch_pipeline_ports.get(arch, []))
    ):
        # Insert the next higher port number right behind the current maximum.
        highest = max(map(int_or_zero, ports))
        position = ports.index(str(highest))
        ports.insert(position + 1, str(highest + 1))

    rows = ['instr,TP,LT,ports\n']
    for mnemonic, throughput, latency, occupancy in model_data:
        # Pad unused ports so all rows have the same number of columns.
        for port in ports:
            occupancy.setdefault(port, 0.0)
        cycles = ','.join(str(value) for _, value in sorted(occupancy.items()))
        rows.append('{},{},{},"({})"\n'.format(mnemonic, throughput, latency, cycles))
    return ''.join(rows)
def main():
# Command-line entry point: parses the arguments, loads the XML file passed as
# `xml` with ET.parse, prints the architecture names found in it, and then
# extracts model data either for the single requested architecture or for all.
#
# NOTE(review): this body appears to contain two generations of the same logic
# back to back (a CSV path via dump_csv and a newer path via model.dump), and
# the original indentation is not visible here -- confirm which statements are
# meant to remain and how they nest before relying on the comments below.
parser = argparse.ArgumentParser()
# Required positional argument: path to the XML file to read.
parser.add_argument('xml', help='path of instructions.xml from http://uops.info')
# Optional positional argument (nargs='?') selecting a single architecture.
# NOTE(review): 'arch' is registered twice with identical nargs and help text,
# once in compact and once in expanded formatting -- presumably only one of
# the two add_argument calls should exist; confirm.
parser.add_argument('arch', nargs='?',
help='architecture to extract, use IACA abbreviations (e.g., SNB). '
'if not given, all will be extracted and saved to file in CWD.')
parser.add_argument(
'arch',
nargs='?',
help='architecture to extract, use IACA abbreviations (e.g., SNB). '
'if not given, all will be extracted and saved to file in CWD.',
)
args = parser.parse_args()
tree = ET.parse(args.xml)
print('Available architectures:', ', '.join(architectures(tree)))
if args.arch:
# Single architecture requested: results go to stdout.
# NOTE(review): extract_model is called twice with the same arguments; the
# first result is printed through dump_csv, the second through model.dump().
model_data = extract_model(tree, args.arch)
print(dump_csv(model_data, args.arch))
model = extract_model(tree, args.arch)
print(model.dump())
else:
# No architecture given: iterate over every architecture name in the XML.
for arch in architectures(tree):
# Writes the dump_csv output to '<arch>_data.csv' in the current directory.
model_data = extract_model(tree, arch)
with open('{}_data.csv'.format(arch), 'w') as f:
f.write(dump_csv(model_data, arch))
# Progress output: architecture name without a trailing newline.
print(arch, end='')
# Extracts again using the lower-cased name and dumps to '<arch lower>.yml'.
model = extract_model(tree, arch.lower())
with open('{}.yml'.format(arch.lower()), 'w') as f:
model.dump(f)
# NOTE(review): presumably terminates the progress line started by
# print(arch, end='') above; the nesting level is not visible here -- confirm.
print('.')
if __name__ == '__main__':

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

698
osaca/data/tx2.yml Normal file
View File

@@ -0,0 +1,698 @@
osaca_version: 0.3.2
micro_architecture: Thunder X2
arch_code: tx2
isa: AArch64
ROB_size: 180
retired_uOps_per_cycle: 4
scheduler_size: 60
hidden_loads: false
load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 4.0, q: 4.0, v: 4.0}
load_throughput:
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure: [1, '34']}
- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure: [1, '34']}
ports: ['0', 0DV, '1', 1DV, '2', '3', '4', '5']
port_model_scheme: |
┌-----------------------------------------------------------┐
| 60 entry unified scheduler |
└-----------------------------------------------------------┘
0 | 1 | 2 | 3 | 4 | 5 |
▼ ▼ ▼ ▼ ▼ ▼
┌------┐ ┌------┐ ┌------┐ ┌------┐ ┌------┐ ┌------┐
| ALU | | ALU | | ALU/ | | LD | | LD | | ST |
└------┘ └------┘ | BR | └------┘ └------┘ └------┘
┌------┐ ┌------┐ └------┘ ┌------┐ ┌------┐
| FP/ | | FP/ | | AGU | | AGU |
| NEON | | NEON | └------┘ └------┘
└------┘ └------┘
┌------┐
| INT |
| MUL/ |
| DIV |
└------┘
┌------┐
|CRYPTO|
└------┘
instruction_forms:
- name: add
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: add
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: adds
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: b.ne
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: b.gt
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: bne
operands:
- class: identifier
throughput: 0.0
latency: 0.0
port_pressure: []
- name: cmp
operands:
- class: register
prefix: w
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: cmp
operands:
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: fadd
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fadd
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fadd
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fdiv
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
throughput: 8.5
latency: 16.0 # 1*p01+17*p0DV1DV
port_pressure: [[1, '01'], [17.0, [0DV, 1DV]]]
- name: fdiv
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 12.0
latency: 23.0 # 1*p01+24*p0DV1DV
port_pressure: [[1, '01'], [24.0, [0DV, 1DV]]]
- name: fmla
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fmla
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fmov
operands:
- {class: register, prefix: s}
- {class: immediate, imd: double}
latency: ~ # 1*p01
port_pressure: [[1, '01']]
throughput: 0.5
- name: fmul
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fmul
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fmul
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fsub
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: fsub
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: ldp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: true
post-indexed: false
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: ~ # 2*p34
port_pressure: [[2.0, '34']]
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: ~
index: ~
scale: 1
post-indexed: false
pre-indexed: false
throughput: 0.5
latency: 4.0 # 1*p34
port_pressure: [[1.0, '34']]
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: imd
index: ~
scale: 1
post-indexed: false
pre-indexed: false
throughput: 0.5
latency: 4.0 # 1*p34
port_pressure: [[1.0, '34']]
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: ~
index: x
scale: 8
post-indexed: false
pre-indexed: false
throughput: 0.5
latency: 4.0 # 1*p34
port_pressure: [[1.0, '34']]
- name: ldr
operands:
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.0
latency: 0.0
port_pressure: []
- name: ldr
operands:
- class: register
prefix: q
- class: register
prefix: q
throughput: 0.0
latency: 0.0
port_pressure: []
- name: ldr
operands:
- class: register
prefix: d
- class: register
prefix: d
throughput: 0.0
latency: 0.0
port_pressure: []
- name: mov
operands:
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.5
latency: 1.0 # 1*p01
port_pressure: [[1, '01']]
- name: mov
operands:
- class: register
prefix: v
shape: b
- class: register
prefix: v
shape: b
throughput: 0.5
latency: 5.0 # 1*p01
port_pressure: [[1, '01']]
- name: prfm
operands:
- class: prfop
type: pld
target: l1
policy: keep
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: ~
latency: ~
port_pressure: []
- name: stp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 2.0
latency: ~ # 4*p34
port_pressure: [[4.0, '34']]
- name: stp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 2.0
latency: ~ # 4*p34
port_pressure: [[4.0, '34']]
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 2.0
latency: ~ # 2*p34+2*p5
port_pressure: [[2.0, '34'], [2.0, '5']]
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 2.0
latency: ~ # 2*p34+2*p5
port_pressure: [[2.0, '34'], [2.0, '5']]
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 2.0
latency: ~ # 2*p34+2*p5
port_pressure: [[2.0, '34'], [2.0, '5']]
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: imd
index: ~
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: x
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: ~
index: ~
scale: 1
pre-indexed: false
post-indexed: true
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: ~
index: x
scale: 1
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]

File diff suppressed because it is too large Load Diff

539
osaca/data/zen1.yml Normal file
View File

@@ -0,0 +1,539 @@
osaca_version: 0.3.2
micro_architecture: AMD Zen (family 17h)
arch_code: ZEN1
isa: x86
load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0}
load_throughput_multiplier: {gpr: 1.0, xmm: 1.0, ymm: 2.0}
load_throughput:
- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
hidden_loads: false
ports: ['0', '1', '2', '3', 3DV, '4', '5', '6', '7', '8', '9', 8D, 9D, ST]
port_model_scheme: |
┌--------------------------------------┐ ┌-----------------------------------------------┐
| 96 entries OoO scheduler | | 84 entries OoO scheduler |
└--------------------------------------┘ └-----------------------------------------------┘
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ ┌-----┐ ┌-----┐
|SSE ALU| |SSE ALU| |SSE ALU| |SSE ALU| | ALU | | ALU | | ALU | | ALU | | AGU | | AGU |
└-------┘ └-------┘ └-------┘ └-------┘ └------┘ └-----┘ └-----┘ └------┘ └-----┘ └-----┘
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ | |
|SSE MUL| |SSE MUL| |SSE ADD| |SSE ADD| |BRANCH| | MUL | | MUL | |BRANCH| ▼ ▼
└-------┘ └-------┘ └-------┘ └-------┘ └------┘ └-----┘ └-----┘ └------┘ ┌-------------┐
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ | LOAD |
|SSE FMA| |SSE FMA| | SSE | |SSE DIV| └-------------┘
└-------┘ └-------┘ | SHUF | └-------┘ ┌-------------┐
┌-------┐ └-------┘ | LOAD |
| SSE | └-------------┘
| SHUF | ┌-------------┐
└-------┘ | STORE |
└-------------┘
instruction_forms:
- name: add
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: add
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: 1 # 1*p4567
port_pressure: [[1, '4567']]
- name: addl
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: addq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: cmpl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p4567
port_pressure: [[1, '4567']]
- name: cmpq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p4567
port_pressure: [[1, '4567']]
- name: incq
operands:
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: ja
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: jb
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: jne
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: leaq
operands:
- class: memory
base: gpr
offset: imd
index: ~
scale: 1
- class: register
name: gpr
throughput: 0.5
latency: ~ # 1*p89
port_pressure: [[1, '89']]
- name: movl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.0
latency: 0.0
port_pressure: []
- name: mulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: mulss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p01
port_pressure: [[1, '01']]
- name: rcpss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #1.0
latency: 5.0
port_pressure: []
- name: sqrtsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #8.0
latency: 23.0
port_pressure: []
- name: sqrtss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #5.0
latency: 17.0
port_pressure: []
- name: subq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: subq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: vaddpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 3.0 # 2*p23
port_pressure: [[2, '23']]
- name: vaddsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p23
port_pressure: [[1, '23']]
- name: vaddss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p23
port_pressure: [[1, '23']]
- name: vdivsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 4.0
latency: 13.0 # 1*p3+4*p3DV
port_pressure: [[1, '3'], [4.0, [3DV]]]
- name: vdivss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 3.0
latency: 10.0 # 1*p3+3*p3DV
port_pressure: [[1, '3'], [3.0, [3DV]]]
- name: vfmadd213pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vfmadd231pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vfmadd132pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vmulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulpd
operands:
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01+1*p89+1*p8D9D
port_pressure: [[1, '01'], [1, '89'], [1, [8D, 9D]]]
- name: vmulpd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vmovapd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovapd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovaps
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovaps
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p89+1*p8D9D
port_pressure: [[1, '89'], [1, [8D, 9D]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: ~
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: ~
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]

View File

@@ -1,138 +0,0 @@
instr,TP,LT,ports
jae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
ja-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jcxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jecxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
je-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jg-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jmp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jmpq-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jna-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jne-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jng-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jnz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jpe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jpo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
jz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
add-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
add-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
addl-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
addq-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
addl-mem_imd,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
addq-mem_imd,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
add-mem_r32,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
add-mem_r64,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
addl-mem_r32,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
addq-mem_r64,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
cmp-mem_r32,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
cmpl-mem_r32,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
cmp-r32_mem,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
cmpl-r32_mem,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
cmp-r32_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
cmpl-r32_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
cmp-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
cmp-r64_r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
cmpq-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
cmpq-r64_r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
inc-r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
incq-r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
incl-r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
mov-mem_r64,1.0,4.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
mov-r64_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
mov-r32_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
movq-mem_r64,1.0,4.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
movq-r64_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
movl-r32_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
movslq-r64_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
sub-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
vaddpd-ymm_ymm_mem,1.0,3.0,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vaddsd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
vaddsd-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
vaddss-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
vcvtsi2ss-xmm_xmm_r32,1.0,4.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
vcvtss2si-r32_xmm,1.0,7.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
cvtsi2ss-xmm_r32,1.0,8.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213pd-ymm_ymm_ymm,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213pd-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213ps-ymm_ymm_ymm,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213ps-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213sd-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd213ss-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd132sd-xmm_xmm_mem,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vfmadd132pd-xmm_xmm_mem,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vfmadd132pd-ymm_ymm_mem,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vinsertf128-ymm_ymm_imd,0.6666666666666667,1.0,"(-1,)"
vmovsd-mem_xmm,1.0,8.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vmovsd-xmm_mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vmulpd-ymm_ymm_ymm,1.0,4.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vmulsd-xmm_xmm_mem,0.5,4.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vmulsd-xmm_xmm_xmm,0.5,4.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vmulss-xmm_xmm_xmm,0.5,3.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vsubpd-ymm_ymm_mem,1.0,3.0,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vsubsd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
vsubsd-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
vsubss-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
vmovaps-xmm_mem,0.5,3.0,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5)"
vmovaps-mem_xmm,1.0,5.0,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 1.0, 1.0)"
vmovapd-ymm_mem,1.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vmovapd-mem_ymm,2.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
movq-r64_xmm,1.0,-1.0,"(0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0)"
#prefetcht0-mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
#prefetchw-mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
cmpl-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
vaddpd-xmm_xmm_xmm,0.5,3,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
vaddpd-ymm_ymm_ymm,1,3,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
vcvtdq2pd-xmm_xmm,1,7,"(0.5, 0.5, 0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
vcvtdq2pd-ymm_xmm,2,7,"(1.0, 1.0, 0, 2.0, 0, 0, 0, 0, 0, 0, 0)"
vcvtsi2sd-xmm_xmm_r32,1,4,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
vextracti128-xmm_ymm_imd,0.3333333333333333,1,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
vfmadd132pd-xmm_xmm_xmm,0.5,5,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd132pd-ymm_ymm_ymm,1,5,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vfmadd132sd-xmm_xmm_xmm,0.5,5,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vmulpd-xmm_xmm_xmm,0.5,4,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vpaddd-xmm_xmm_xmm,0.3333333333333333,1,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
vpaddd-ymm_ymm_ymm,0.6666666666666667,1,"(0.66, 0.66, 0, 0.66, 0, 0, 0, 0, 0, 0, 0)"
vpshufd-xmm_xmm_imd,0.5,1,"(0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0)"
vxorpd-xmm_xmm_xmm,0.25,1,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)"
vxorps-xmm_xmm_xmm,0.25,1,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)"
vdivpd-xmm_xmm_xmm,4,8,"(0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)"
vdivsd-xmm_xmm_xmm,4,8,"(0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)"
vmovups-mem_xmm,0.5,8,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vmovups-xmm_mem,1,8,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
vaddpd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
vmulpd-xmm_xmm_mem,0.5,4,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vmulpd-ymm_ymm_mem,1,4,"(1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
vinsertf128-ymm_ymm_mem_imd,1,5,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
vmovupd-xmm_mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vmovupd-mem_xmm,1,1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
vmovupd-ymm_mem,3.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
vmovupd-mem_ymm,2,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
movupd-xmm_mem,0.5,-1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
pushq-r64,0.5,-1,"(-1,)"
cmpq-r64_mem,0.5,-1,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
movq-r64_r64,0.2,-1,"(-1,)"
subq-r64_r64,0.25,1,"(-1,)"
cmpq-mem_r64,0.5,-1,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
1 instr TP LT ports
2 jae-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
3 ja-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
4 jbe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
5 jb-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
6 jc-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
7 jcxz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
8 jecxz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
9 je-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
10 jge-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
11 jg-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
12 jle-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
13 jl-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
14 jmp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
15 jmpq-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
16 jnae-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
17 jna-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
18 jnbe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
19 jnb-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
20 jnc-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
21 jne-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
22 jnge-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
23 jng-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
24 jnle-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
25 jnl-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
26 jno-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
27 jno-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
28 jnp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
29 jns-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
30 jns-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
31 jnz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
32 jo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
33 jo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
34 jpe-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
35 jp-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
36 jpo-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
37 js-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
38 js-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
39 jz-lbl 0.0 0.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
40 add-r32_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
41 add-r64_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
42 addl-r32_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
43 addq-r64_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
44 addl-mem_imd 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
45 addq-mem_imd 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
46 add-mem_r32 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
47 add-mem_r64 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
48 addl-mem_r32 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
49 addq-mem_r64 1.0 7.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)
50 cmp-mem_r32 0.5 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)
51 cmpl-mem_r32 0.5 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)
52 cmp-r32_mem 0.5 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)
53 cmpl-r32_mem 0.5 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)
54 cmp-r32_r32 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
55 cmpl-r32_r32 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
56 cmp-r64_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
57 cmp-r64_r64 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
58 cmpq-r64_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
59 cmpq-r64_r64 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
60 inc-r64 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
61 incq-r64 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
62 incl-r32 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
63 mov-mem_r64 1.0 4.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
64 mov-r64_mem 0.5 3.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
65 mov-r32_mem 0.5 3.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
66 movq-mem_r64 1.0 4.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
67 movq-r64_mem 0.5 3.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
68 movl-r32_mem 0.5 3.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
69 movslq-r64_r32 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
70 sub-r32_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
71 vaddpd-ymm_ymm_mem 1.0 3.0 (0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0.5, 0.5)
72 vaddsd-xmm_xmm_mem 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)
73 vaddsd-xmm_xmm_xmm 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)
74 vaddss-xmm_xmm_xmm 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)
75 vcvtsi2ss-xmm_xmm_r32 1.0 4.0 (1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)
76 vcvtss2si-r32_xmm 1.0 7.0 (1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)
77 cvtsi2ss-xmm_r32 1.0 8.0 (1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)
78 vfmadd213pd-ymm_ymm_ymm 1.0 5.0 (1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
79 vfmadd213pd-xmm_xmm_xmm 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
80 vfmadd213ps-ymm_ymm_ymm 1.0 5.0 (1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
81 vfmadd213ps-xmm_xmm_xmm 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
82 vfmadd213sd-xmm_xmm_xmm 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
83 vfmadd213ss-xmm_xmm_xmm 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
84 vfmadd132sd-xmm_xmm_mem 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
85 vfmadd132pd-xmm_xmm_mem 0.5 5.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
86 vfmadd132pd-ymm_ymm_mem 1.0 5.0 (1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
87 vinsertf128-ymm_ymm_imd 0.6666666666666667 1.0 (-1,)
88 vmovsd-mem_xmm 1.0 8.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
89 vmovsd-xmm_mem 0.5 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
90 vmulpd-ymm_ymm_ymm 1.0 4.0 (1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
91 vmulsd-xmm_xmm_mem 0.5 4.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
92 vmulsd-xmm_xmm_xmm 0.5 4.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
93 vmulss-xmm_xmm_xmm 0.5 3.0 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
94 vsubpd-ymm_ymm_mem 1.0 3.0 (0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0)
95 vsubsd-xmm_xmm_mem 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)
96 vsubsd-xmm_xmm_xmm 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)
97 vsubss-xmm_xmm_xmm 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)
98 vmovaps-xmm_mem 0.5 3.0 (0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5)
99 vmovaps-mem_xmm 1.0 5.0 (0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 1.0, 1.0)
100 vmovapd-ymm_mem 1.0 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
101 vmovapd-mem_ymm 2.0 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)
102 movq-r64_xmm 1.0 -1.0 (0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0)
103 #prefetcht0-mem 0.5 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
104 #prefetchw-mem 0.5 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
105 cmpl-r32_imd 0.25 1.0 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)
106 vaddpd-xmm_xmm_xmm 0.5 3 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)
107 vaddpd-ymm_ymm_ymm 1 3 (0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)
108 vcvtdq2pd-xmm_xmm 1 7 (0.5, 0.5, 0, 1.0, 0, 0, 0, 0, 0, 0, 0)
109 vcvtdq2pd-ymm_xmm 2 7 (1.0, 1.0, 0, 2.0, 0, 0, 0, 0, 0, 0, 0)
110 vcvtsi2sd-xmm_xmm_r32 1 4 (0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)
111 vextracti128-xmm_ymm_imd 0.3333333333333333 1 (0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)
112 vfmadd132pd-xmm_xmm_xmm 0.5 5 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
113 vfmadd132pd-ymm_ymm_ymm 1 5 (1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
114 vfmadd132sd-xmm_xmm_xmm 0.5 5 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
115 vmulpd-xmm_xmm_xmm 0.5 4 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
116 vpaddd-xmm_xmm_xmm 0.3333333333333333 1 (0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)
117 vpaddd-ymm_ymm_ymm 0.6666666666666667 1 (0.66, 0.66, 0, 0.66, 0, 0, 0, 0, 0, 0, 0)
118 vpshufd-xmm_xmm_imd 0.5 1 (0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0)
119 vxorpd-xmm_xmm_xmm 0.25 1 (0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)
120 vxorps-xmm_xmm_xmm 0.25 1 (0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)
121 vdivpd-xmm_xmm_xmm 4 8 (0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)
122 vdivsd-xmm_xmm_xmm 4 8 (0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)
123 vmovups-mem_xmm 0.5 8 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
124 vmovups-xmm_mem 1 8 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)
125 vaddpd-xmm_xmm_mem 0.5 3.0 (0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)
126 vmulpd-xmm_xmm_mem 0.5 4 (0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)
127 vmulpd-ymm_ymm_mem 1 4 (1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
128 vinsertf128-ymm_ymm_mem_imd 1 5 (0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)
129 vmovupd-xmm_mem 0.5 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
130 vmovupd-mem_xmm 1 1 (0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)
131 vmovupd-ymm_mem 3.0 -1.0 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
132 vmovupd-mem_ymm 2 2 (0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)
133 movupd-xmm_mem 0.5 -1 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)
134 pushq-r64 0.5 -1 (-1,)
135 cmpq-r64_mem 0.5 -1 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)
136 movq-r64_r64 0.2 -1 (-1,)
137 subq-r64_r64 0.25 1 (-1,)
138 cmpq-mem_r64 0.5 -1 (0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)

406
osaca/db_interface.py Executable file
View File

@@ -0,0 +1,406 @@
#!/usr/bin/env python3
import math
import os
import warnings
import ruamel.yaml
from osaca.semantics import MachineModel
def sanity_check(arch: str, verbose=False):
    """
    Run all database sanity checks for one architecture and print the report.

    Parameters
    ----------
    arch : str
        Name of the architecture whose machine model is loaded.
    verbose : bool
        Forwarded to the report printer; if True, the affected instruction
        forms are listed in addition to the summary.
    """
    # micro-architecture model and the model of the ISA it reports
    uarch_model = MachineModel(arch=arch)
    instruction_forms = uarch_model['instruction_forms']
    isa_model = MachineModel(arch='isa/{}'.format(uarch_model.get_ISA()))
    total_forms = len(instruction_forms)
    # six result lists for the uarch DB, two for the ISA DB
    arch_findings = _check_sanity_arch_db(uarch_model, isa_model)
    isa_findings = _check_sanity_isa_db(uarch_model, isa_model)
    _print_sanity_report(total_forms, *arch_findings, *isa_findings, verbose=verbose)
def import_benchmark_output(arch, bench_type, filepath):
supported_bench_outputs = ['ibench', 'asmbench']
assert os.path.exists(filepath)
if bench_type not in supported_bench_outputs:
raise ValueError('Benchmark type is not supported.')
with open(filepath, 'r') as f:
input_data = f.readlines()
db_entries = None
if bench_type == 'ibench':
db_entries = _get_ibench_output(input_data)
elif bench_type == 'asmbench':
raise NotImplementedError
# write entries to DB
mm = MachineModel(arch)
for entry in db_entries:
mm.set_instruction_entry(entry)
with open(filepath, 'w') as f:
mm.dump(f)
##################
# HELPERS IBENCH #
##################
def _get_ibench_output(input_data):
db_entries = {}
for line in input_data:
if 'Using frequency' in line or len(line) == 0:
continue
instruction = line.split(':')[0]
key = '-'.join(instruction.split('-')[:2])
if key in db_entries:
# add only TP/LT value
entry = db_entries[key]
else:
mnemonic = instruction.split('-')[0]
operands = instruction.split('-')[1].split('_')
operands = [_create_db_operand(op) for op in operands]
entry = {
'name': mnemonic,
'operands': operands,
'throughput': None,
'latency': None,
'port_pressure': None,
}
if 'TP' in instruction:
entry['throughput'] = _validate_measurement(float(line.split()[1]), True)
if not entry['throughput']:
warnings.warn(
'Your THROUGHPUT measurement for {} looks suspicious'.format(key)
+ ' and was not added. Please inspect your benchmark.'
)
elif 'LT' in instruction:
entry['latency'] = _validate_measurement(float(line.split()[1]), False)
if not entry['latency']:
warnings.warn(
'Your LATENCY measurement for {} looks suspicious'.format(key)
+ ' and was not added. Please inspect your benchmark.'
)
db_entries[key] = entry
return db_entries
def _validate_measurement(self, measurement, is_tp):
if not is_tp:
if (
math.floor(measurement) * 1.05 >= measurement
or math.ceil(measurement) * 0.95 <= measurement
):
# Value is probably correct, so round it to the estimated value
return float(round(measurement))
# Check reciprocal only if it is a throughput value
else:
reciprocals = [1 / x for x in range(1, 11)]
for reci in reciprocals:
if reci * 0.95 <= measurement <= reci * 1.05:
# Value is probably correct, so round it to the estimated value
return round(reci, 5)
# No value close to an integer or its reciprocal found, we assume the
# measurement is incorrect
return None
def _create_db_operand(self, operand):
if self.isa == 'aarch64':
return self._create_db_operand_aarch64(operand)
elif self.isa == 'x86':
return self._create_db_operand_x86(operand)
def _create_db_operand_aarch64(self, operand):
if operand == 'i':
return {'class': 'immediate', 'imd': 'int'}
elif operand in 'wxbhsdq':
return {'class': 'register', 'prefix': operand}
elif operand.startswith('v'):
return {'class': 'register', 'prefix': 'v', 'shape': operand[1:2]}
elif operand.startswith('m'):
return {
'class': 'memory',
'base': 'gpr' if 'b' in operand else None,
'offset': 'imd' if 'o' in operand else None,
'index': 'gpr' if 'i' in operand else None,
'scale': 8 if 's' in operand else 1,
'pre-indexed': True if 'r' in operand else False,
'post-indexed': True if 'p' in operand else False,
}
else:
raise ValueError('Parameter {} is not a valid operand code'.format(operand))
def _create_db_operand_x86(self, operand):
if operand == 'r':
return {'class': 'register', 'name': 'gpr'}
elif operand in 'xyz':
return {'class': 'register', 'name': operand + 'mm'}
elif operand == 'i':
return {'class': 'immediate', 'imd': 'int'}
elif operand.startswith('m'):
return {
'class': 'memory',
'base': 'gpr' if 'b' in operand else None,
'offset': 'imd' if 'o' in operand else None,
'index': 'gpr' if 'i' in operand else None,
'scale': 8 if 's' in operand else 1,
}
else:
raise ValueError('Parameter {} is not a valid operand code'.format(operand))
########################
# HELPERS SANITY CHECK #
########################
def _check_sanity_arch_db(arch_mm, isa_mm):
suspicious_prefixes_x86 = ['vfm', 'fm']
suspicious_prefixes_arm = ['fml', 'ldp', 'stp', 'str']
if arch_mm.get_ISA().lower() == 'aarch64':
suspicious_prefixes = suspicious_prefixes_arm
if arch_mm.get_ISA().lower() == 'x86':
suspicious_prefixes = suspicious_prefixes_x86
# returned lists
missing_throughput = []
missing_latency = []
missing_port_pressure = []
wrong_port = []
suspicious_instructions = []
duplicate_instr_arch = []
for instr_form in arch_mm['instruction_forms']:
# check value in DB entry
if instr_form['throughput'] is None:
missing_throughput.append(instr_form)
if instr_form['latency'] is None:
missing_latency.append(instr_form)
if instr_form['port_pressure'] is None:
missing_port_pressure.append(instr_form)
else:
if _check_for_wrong_port(arch_mm['ports'], instr_form):
wrong_port.append(instr_form)
# check entry against ISA DB
for prefix in suspicious_prefixes:
if instr_form['name'].startswith(prefix):
# check if instruction in ISA DB
if isa_mm.get_instruction(instr_form['name'], instr_form['operands']) is None:
# if not, mark them as suspicious and print it on the screen
suspicious_instructions.append(instr_form)
# check for duplicates in DB
if arch_mm._check_for_duplicate(instr_form['name'], instr_form['operands']):
duplicate_instr_arch.append(instr_form)
# every entry exists twice --> uniquify
tmp_list = []
for i in range(0, len(duplicate_instr_arch)):
tmp = duplicate_instr_arch.pop()
if tmp not in duplicate_instr_arch:
tmp_list.append(tmp)
duplicate_instr_arch = tmp_list
return (
missing_throughput,
missing_latency,
missing_port_pressure,
wrong_port,
suspicious_instructions,
duplicate_instr_arch,
)
def _check_for_wrong_port(port_list, instr_form):
for cycles, ports in instr_form['port_pressure']:
for p in ports:
if p not in port_list:
return False
return True
def _check_sanity_isa_db(arch_mm, isa_mm):
# returned lists
duplicate_instr_isa = []
only_in_isa = []
for instr_form in isa_mm['instruction_forms']:
# check if instr is missing in arch DB
if arch_mm.get_instruction(instr_form['name'], instr_form['operands']) is None:
only_in_isa.append(instr_form)
# check for duplicates
if isa_mm._check_for_duplicate(instr_form['name'], instr_form['operands']):
duplicate_instr_isa.append(instr_form)
# every entry exists twice --> uniquify
tmp_list = []
for i in range(0, len(duplicate_instr_isa)):
tmp = duplicate_instr_isa.pop()
if tmp not in duplicate_instr_isa:
tmp_list.append(tmp)
duplicate_instr_isa = tmp_list
return duplicate_instr_isa, only_in_isa
def _print_sanity_report(
total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa, verbose=False
):
# non-verbose summary
print('SUMMARY\n----------------------')
print(
'{}% ({}/{}) of instruction forms have no throughput value.'.format(
round(100 * len(m_tp) / total), len(m_tp), total
)
)
print(
'{}% ({}/{}) of instruction forms have no latency value.'.format(
round(100 * len(m_l) / total), len(m_l), total
)
)
print(
'{}% ({}/{}) of instruction forms have no port pressure assignment.'.format(
round(100 * len(m_pp) / total), len(m_pp), total
)
)
print(
'{}% ({}/{}) of instruction forms have an invalid port identifier.'.format(
round(100 * len(wrong_pp) / total), len(wrong_pp), total
)
)
print(
'{}% ({}/{}) of instruction forms might miss an ISA DB entry.'.format(
round(100 * len(suspic_instr) / total), len(suspic_instr), total
)
)
print('{} duplicate instruction forms in uarch DB.'.format(len(dup_arch)))
print('{} duplicate instruction forms in ISA DB.'.format(len(dup_isa)))
print(
'{} instruction forms in ISA DB are not referenced by instruction '.format(len(only_isa))
+ 'forms in uarch DB.'
)
print('----------------------\n')
# verbose version
if verbose:
_print_sanity_report_verbose(
total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa
)
def _print_sanity_report_verbose(
total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa
):
BRIGHT_CYAN = '\033[1;36;1m'
BRIGHT_BLUE = '\033[1;34;1m'
BRIGHT_RED = '\033[1;31;1m'
BRIGHT_MAGENTA = '\033[1;35;1m'
BRIGHT_YELLOW = '\033[1;33;1m'
CYAN = '\033[36m'
YELLOW = '\033[33m'
WHITE = '\033[0m'
print('Instruction forms without throughput value:\n' if len(m_tp) != 0 else '', end='')
for instr_form in m_tp:
print('{}{}{}'.format(BRIGHT_BLUE, _get_full_instruction_name(instr_form), WHITE))
print('Instruction forms without latency value:\n' if len(m_l) != 0 else '', end='')
for instr_form in m_l:
print('{}{}{}'.format(BRIGHT_RED, _get_full_instruction_name(instr_form), WHITE))
print(
'Instruction forms without port pressure assignment:\n' if len(m_pp) != 0 else '', end=''
)
for instr_form in m_pp:
print('{}{}{}'.format(BRIGHT_MAGENTA, _get_full_instruction_name(instr_form), WHITE))
print(
'Instruction forms with invalid port identifiers in port pressure:\n'
if len(wrong_pp) != 0
else '',
end='',
)
for instr_form in wrong_pp:
print('{}{}{}'.format(BRIGHT_MAGENTA, _get_full_instruction_name(instr_form), WHITE))
print(
'Instruction forms which might miss an ISA DB entry:\n' if len(suspic_instr) != 0 else '',
end='',
)
for instr_form in suspic_instr:
print('{}{}{}'.format(BRIGHT_CYAN, _get_full_instruction_name(instr_form), WHITE))
print('Duplicate instruction forms in uarch DB:\n' if len(dup_arch) != 0 else '', end='')
for instr_form in dup_arch:
print('{}{}{}'.format(YELLOW, _get_full_instruction_name(instr_form), WHITE))
print('Duplicate instruction forms in ISA DB:\n' if len(dup_isa) != 0 else '', end='')
for instr_form in dup_isa:
print('{}{}{}'.format(BRIGHT_YELLOW, _get_full_instruction_name(instr_form), WHITE))
print(
'Instruction forms existing in ISA DB but not in uarch DB:\n'
if len(only_isa) != 0
else '',
end='',
)
for instr_form in only_isa:
print('{}{}{}'.format(CYAN, _get_full_instruction_name(instr_form), WHITE))
###################
# GENERIC HELPERS #
###################
def _get_full_instruction_name(instruction_form):
operands = []
for op in instruction_form['operands']:
op_attrs = [
y + ':' + str(op[y])
for y in list(filter(lambda x: True if x != 'class' else False, op))
]
operands.append('{}({})'.format(op['class'], ','.join(op_attrs)))
return '{} {}'.format(instruction_form['name'], ','.join(operands))
def __represent_none(self, data):
return self.represent_scalar(u'tag:yaml.org,2002:null', u'~')
def _create_yaml_object():
    """Create a ruamel YAML object whose representer writes ``None`` via `__represent_none`."""
    yaml = ruamel.yaml.YAML()
    none_type = type(None)
    yaml.representer.add_representer(none_type, __represent_none)
    return yaml
def __dump_data_to_yaml(filepath, data):
    """
    Write a machine model dictionary to ``filepath`` in three YAML documents.

    The meta data comes first, followed by the port model scheme (dumped with
    the '|' style) and finally the instruction forms.
    """
    # everything except the two big sections counts as 'normal' meta data
    meta_data = dict(data)
    for section in ('instruction_forms', 'port_model_scheme'):
        del meta_data[section]
    with open(filepath, 'w') as meta_file:
        ruamel.yaml.dump(meta_data, meta_file, allow_unicode=True)
    with open(filepath, 'a') as tail_file:
        # port model scheme in |-scheme for better readability
        scheme = {'port_model_scheme': data['port_model_scheme']}
        ruamel.yaml.dump(scheme, tail_file, allow_unicode=True, default_style='|')
        # instruction forms go last
        forms = {'instruction_forms': data['instruction_forms']}
        ruamel.yaml.dump(forms, tail_file, allow_unicode=True)

View File

@@ -1,447 +0,0 @@
#!/usr/bin/env python3
import sys
import os
import math
import ast
from operator import add
import pandas as pd
from osaca.param import Register, MemAddr
#from param import Register, MemAddr
class Scheduler(object):
# Maps the upper-case architecture code to the value __init__ stores in
# self.ports; get_port_naming() creates one numeric port name per unit of it.
arch_dict = {
# Intel
'NHM': 5, 'WSM': 5, # Nehalem, Westmere
'SNB': 6, 'IVB': 6, # Sandy Bridge, Ivy Bridge
'HSW': 8, 'BDW': 8, # Haswell, Broadwell
'SKL': 8, 'SKX': 8, # Skylake(-X)
'KBL': 8, 'CFL': 8, # Kaby Lake, Coffee Lake
# AMD
'ZEN': 10, # Zen/Ryzen/EPYC
}
# Additional named ports per architecture; get_port_naming() adds them to the
# numeric port names and new_schedule() reserves one extra column for each.
arch_pipeline_ports = {
'NHM': ['0DV'], 'WSM': ['0DV'],
'SNB': ['0DV'], 'IVB': ['0DV'],
'HSW': ['0DV'], 'BDW': ['0DV'],
'SKL': ['0DV'], 'SKX': ['0DV'],
'KBL': ['0DV'], 'CFL': ['0DV'],
'ZEN': ['3DV'],}
# content of most inner list in instrList: instr, operand(s), instr form
# Instruction data of the architecture; __init__ loads it with pd.read_csv.
df = None # type: DataFrame
# for parallel ld/st in archs with 1 st/cy and >1 ld/cy, able to do 1 st and 1 ld in 1cy
# (__init__ sets this to [9, 10] for ZEN; it stays None otherwise)
ld_ports = None # type: list<int>
# enable flag for parallel ld/st
# (__init__ sets this to True for ZEN only)
en_par_ldst = False # type: boolean
def __init__(self, arch, instruction_list):
arch = arch.upper()
try:
self.ports = self.arch_dict[arch]
except KeyError:
print('Architecture not supported for EU scheduling.', file=sys.stderr)
sys.exit(1)
# check for parallel ld/st in a cycle
if arch == 'ZEN':
self.en_par_ldst = True
self.ld_ports = [9, 10]
# check for DV port
self.pipeline_ports = self.arch_pipeline_ports.get(arch, [])
self.instrList = instruction_list
# curr_dir = os.path.realpath(__file__)[:-11]
osaca_dir = os.path.expanduser('~/.osaca/')
self.df = pd.read_csv(osaca_dir + 'data/' + arch.lower() + '_data.csv', quotechar='"',
converters={'ports': ast.literal_eval})
def new_schedule(self, machine_readable=False):
"""
Schedule Instruction Form list and calculate port bindings.
Parameters
----------
machine_readable : bool
Boolean for indicating if the return value should be human readable (if False) or
machine readable (if True)
Returns
-------
(str, [float, ...]) or ([[float, ...], ...], [float, ...])
A tuple containing the output of the schedule as string (if machine_readable is not
given or False) or as list of lists (if machine_readable is True) and the port bindings
as list of float.
"""
sched = self.get_head()
# Initialize ports
# Add DV port, if it is existing
# (one all-zero row per instruction form, one column per port incl. pipeline ports)
occ_ports = [[0] * (self.ports + len(self.pipeline_ports)) for x in range(len(self.instrList))]
port_bndgs = [0] * (self.ports + len(self.pipeline_ports))
# Store instruction counter for parallel ld/st
par_ldst = 0
# Count the number of store instr if we schedule for an architecture with par ld/st
# (an instruction form counts if its element 1 is a MemAddr, it has more than three
# elements and its mnemonic does not start with 'cmp')
if self.en_par_ldst:
for i, instrForm in enumerate(self.instrList):
if (isinstance(instrForm[1], MemAddr) and len(instrForm) > 3
and not instrForm[0].startswith('cmp')):
# print('({}, {}) is st --> par_ldst = {}'.format(i, instrForm[0], par_ldst + 1))
par_ldst += 1
# Check if there's a port occupation stored in the CSV, otherwise leave the
# occ_port list item empty
for i, instrForm in enumerate(self.instrList):
# NOTE(review): schedule() and schedule_fcfs() insert a '-' between mnemonic and
# operand suffix, this method does not -- get_operand_suffix is not visible here,
# confirm which variant matches the 'instr' column of the data file.
search_string = instrForm[0] + self.get_operand_suffix(instrForm)
try:
entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
# indexing an empty selection raises IndexError, which is handled below
tup = entry.ports.values[0]
# a ports value of (-1,) is treated like a missing entry
if len(tup) == 1 and tup[0] == -1:
raise IndexError()
except IndexError:
# Instruction form not in CSV
# ('*' marks nop instructions and directives, 'X' everything else)
if instrForm[0][:3] == 'nop':
sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
elif instrForm[0] == 'DIRECTIVE':
sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
else:
sched += self.format_port_occupation_line(occ_ports[i], 'X ' + instrForm[-1])
continue
occ_ports[i] = list(tup)
# Check if it's a ld including instr
p_flg = ''
if self.en_par_ldst:
# Check for ld
# FIXME remove special load handling from here and place in machine model
if (isinstance(instrForm[-2], MemAddr) or
(len(instrForm) > 4 and isinstance(instrForm[2], MemAddr))):
# every counted store lets one load be flagged with 'P' and its ld ports be zeroed
if par_ldst > 0:
par_ldst -= 1
p_flg = 'P '
for port in self.ld_ports:
occ_ports[i][port] = 0.0 # '(' + str(occ_ports[i][port]) + ')'
# Write schedule line
if len(p_flg) > 0:
sched += self.format_port_occupation_line(occ_ports[i], p_flg + instrForm[-1])
for port in self.ld_ports:
occ_ports[i][port] = 0
else:
sched += self.format_port_occupation_line(occ_ports[i], instrForm[-1])
# Add throughput to total port binding
port_bndgs = list(map(add, port_bndgs, occ_ports[i]))
if machine_readable:
# append every instruction form to its own occupation row before returning the rows
list(map(self.append, occ_ports, self.instrList))
return occ_ports, port_bndgs
return sched, port_bndgs
def schedule(self):
    """
    Schedule Instruction Form list and calculate port bindings.

    Returns
    -------
    (str, [int, ...])
        A tuple containing the graphic output of the schedule as string and
        the port bindings as list of ints.
    """
    wTP = False
    sched = self.get_head()
    # Initialize ports: one occupation row per instruction form and the running
    # total. The rows were never created before, so the first access to
    # occ_ports raised a NameError.
    occ_ports = [[0] * self.ports for _ in range(len(self.instrList))]
    port_bndgs = [0] * self.ports
    # Check if there's a port occupation stored in the CSV, otherwise leave the
    # occ_port list item empty
    for i, instrForm in enumerate(self.instrList):
        try:
            search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
            entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
            # indexing an empty selection raises IndexError, which is handled below
            tup = entry.ports.values[0]
            # NOTE(review): this method indexes tup[0][0] and iterates tup[0], i.e. it
            # expects a tuple of port-index tuples such as ((0, 1), (2,)) -- confirm
            # the data file still provides that format.
            if len(tup) == 1 and tup[0][0] == -1:
                raise IndexError()
        except IndexError:
            # Instruction form not in CSV
            if instrForm[0][:3] == 'nop':
                sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
            else:
                sched += self.format_port_occupation_line(occ_ports[i], 'X ' + instrForm[-1])
            continue
        if wTP:
            # Get the occurance of each port from the occupation list
            port_occurances = self.get_port_occurances(tup)
            # Get 'occurance groups'
            occurance_groups = self.get_occurance_groups(port_occurances)
            # Calculate port dependent throughput
            tp_ges = entry.TP.values[0] * len(occurance_groups[0])
            for occGroup in occurance_groups:
                for port in occGroup:
                    occ_ports[i][port] = tp_ges / len(occGroup)
        else:
            variations = len(tup)
            t_all = self.flatten(tup)
            if entry.TP.values[0] == 0:
                t_all = ()
            if variations == 1:
                # a single port combination: every port in it gets the full throughput
                for j in tup[0]:
                    occ_ports[i][j] = entry.TP.values[0]
            else:
                # several combinations: weight each port by how often it occurs
                for j in range(0, self.ports):
                    occ_ports[i][j] = t_all.count(j) / variations
        # Write schedule line
        sched += self.format_port_occupation_line(occ_ports[i], instrForm[-1])
        # Add throughput to total port binding
        port_bndgs = list(map(add, port_bndgs, occ_ports[i]))
    return sched, port_bndgs
def flatten(self, l):
    """
    Recursively flatten a nested sequence.

    Elements that are instances of the sequence's own type are expanded in
    place; everything else is kept. An empty sequence is returned unchanged.
    """
    if not len(l):
        return l
    head, rest = l[0], l[1:]
    # a nested sequence is flattened itself, any other element is kept as a slice
    flat_head = self.flatten(head) if isinstance(head, type(l)) else l[:1]
    return flat_head + self.flatten(rest)
def append(self, l, e):
    """Append ``e`` to ``l`` in place if ``l`` is a list; do nothing otherwise."""
    if not isinstance(l, list):
        return
    l.append(e)
def schedule_fcfs(self):
"""
Schedule Instruction Form list for a single run with latencies.
Returns
-------
(str, int)
A tuple containing the graphic output as string and the total throughput time as int.
"""
sched = self.get_head()
total = 0
# Initialize ports
# (occ_ports holds one remaining-occupation counter per port)
occ_ports = [0] * self.ports
for instrForm in self.instrList:
try:
search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
# indexing an empty selection raises IndexError, which is handled below
tup = entry.ports.values[0]
# NOTE(review): tup is indexed as tup[0][0] and iterated as port tuples below, i.e. a
# tuple of port-index tuples is expected -- confirm the data file provides that format
if len(tup) == 1 and tup[0][0] == -1:
raise IndexError()
except IndexError:
# Instruction form not in CSV
sched += self.format_port_occupation_line([0] * self.ports, '* ' + instrForm[-1])
continue
found = False
# try the possible port combinations until one fits the current occupation
while not found:
for portOcc in tup:
# Test if chosen instruction form port occupation suits the current CPU port
# occupation
if self.test_ports_fcfs(occ_ports, portOcc):
# Current port occupation fits for chosen port occupation of instruction!
found = True
# the chosen ports are occupied for the latency of the instruction form
good = [entry.LT.values[0] if (j in portOcc) else 0 for j in
range(0, self.ports)]
sched += self.format_port_occupation_line(good, instrForm[-1])
# Add new occupation
occ_ports = [occ_ports[j] + good[j] for j in range(0, self.ports)]
break
# Step
# (decrease every non-zero port counter by one)
occ_ports = [j - 1 if (j > 0) else 0 for j in occ_ports]
if entry.LT.values[0] != 0:
total += 1
# add the largest remaining port occupation to the total
total += max(occ_ports)
return sched, total
def get_occurance_groups(self, port_occurances):
    """
    Group ports in groups by the number of their occurrence and sorts groups by cardinality.

    Parameters
    ----------
    port_occurances : [int, ...]
        List with the length of ports containing the number of occurances
        of each port

    Returns
    -------
    [[int, ...], ...]
        List of lists with all occurance groups sorted by cardinality
        (smallest group first). Ports that never occur belong to no group, so
        the result is empty if no port occurs at all.
    """
    # One group per distinct positive occurrence count, in ascending count order.
    # Deriving the groups from the counts that really exist also covers inputs
    # without unused ports and inputs with gaps between the counts, which used to
    # raise an IndexError.
    positive_counts = sorted({occurs for occurs in port_occurances if occurs > 0})
    groups = [
        [port for port, occurs in enumerate(port_occurances) if occurs == count]
        for count in positive_counts
    ]
    # Sort groups by cardinality (stable, so equally sized groups keep count order)
    groups.sort(key=len)
    return groups
def get_port_occurances(self, tups):
    """
    Count how often every port appears in the possible port occupations.

    Parameters
    ----------
    tups : ((int, ...), ...)
        Tuple of tuples of possible port occupations

    Returns
    -------
    [int, ...]
        One counter per port of the current architecture holding the number of
        occupations the port appears in
    """
    counters = [0 for _ in range(self.ports)]
    for port in (p for occupation in tups for p in occupation):
        counters[port] += 1
    return counters
def test_ports_fcfs(self, occ_ports, needed_ports):
"""
Test if current configuration of ports is possible and returns boolean.
Parameters
----------
occ_ports : [int]
Tuple to inspect for current port occupation
needed_ports : (int)
Tuple with needed port(s) for particular instruction form
Returns
-------
bool
True if needed ports can get scheduled on current port occupation
False if not
"""
for port in needed_ports:
if occ_ports[port] != 0:
return False
return True
def get_report_info(self):
    """
    Create the report title together with the legend of annotation symbols.

    Returns
    -------
    str
        String containing the report information
    """
    pieces = [
        # Title and its underline
        'Throughput Analysis Report\n',
        '-' * 26,
        '\n',
        # Legend of the symbols used in the report
        'P - Load operation can be hidden behind a past or future store instruction\n',
        'X - No information for this instruction in data file\n',
        '* - Not bound to a port, therefore ignored\n\n',
    ]
    return ''.join(pieces)
def get_head(self):
    """
    Create the table heading for the ports of the current architecture.

    Returns
    -------
    str
        String containing the header: caption line, port name line and a
        horizontal rule
    """
    cells = ['|{:^6}'.format(name) for name in self.get_port_naming()]
    port_line = ''.join(cells) + '|\n'
    width = len(port_line)
    # The rule spans the port line without its trailing newline.
    horiz_line = '-' * (width - 1) + '\n'
    # Indent the caption by half of the space left over next to it.
    indent = ' ' * ((width - 25) // 2)
    return indent + 'Ports Pressure in cycles\n' + port_line + horiz_line
def format_port_occupation_line(self, occ_ports, instr_name):
    """
    Create one output line with the port occupation of an instruction form.

    Parameters
    ----------
    occ_ports : (int, ...)
        Cycle count per port; a value of 0 produces an empty cell
    instr_name : str
        Name of instruction form for output

    Returns
    -------
    str
        String for output containing port scheduling for instr_name
    """
    cells = []
    for cycles in occ_ports:
        if cycles == 0:
            cells.append('|' + ' ' * 6)
        elif cycles >= 10:
            # Two-digit values get one decimal place less to fit the cell.
            cells.append('|{:^6.1f}'.format(cycles))
        else:
            cells.append('|{:^6.2f}'.format(cycles))
    return ''.join(cells) + '| ' + instr_name + '\n'
def get_port_naming(self):
    """
    Return the sorted list of port names.

    The names are the indices ``0 .. self.ports - 1`` as strings plus the
    entries of ``self.pipeline_ports``, sorted as strings.

    :return: list of strings
    """
    names = list(map(str, range(self.ports)))
    names = names + self.pipeline_ports
    names.sort()
    return names
def get_port_binding(self, port_bndg):
    """
    Render the port binding of a scheduling result as a small text table.

    Parameters
    ----------
    port_bndg : [int, ...]
        Integer list containing port bindings

    Returns
    -------
    str
        String containing the port binding graphical output
    """
    widths = self.get_column_widths(port_bndg)
    # One rule is reused above, between and below the two table rows.
    rule = '-' * 10 + '-' * (sum(widths) + len(widths)) + '\n'
    name_cells = [
        name.center(widths[idx]) + '|' for idx, name in enumerate(self.get_port_naming())
    ]
    cycle_cells = [
        '{}|'.format(str(round(port_bndg[idx], 2)).center(widths[idx]))
        for idx in range(len(port_bndg))
    ]
    port_line = '| Port |' + ''.join(name_cells) + '\n'
    cyc_line = '| Cycles |' + ''.join(cycle_cells) + '\n'
    return 'Port Binding in Cycles Per Iteration:\n' + rule + port_line + rule + cyc_line + rule
def get_column_widths(self, port_bndg):
    """
    Compute the column width for every port of the port binding table.

    Each width is the longer of the port name and the binding value rounded
    to two decimals (as text), plus 2.

    :param port_bndg: list of port binding values
    :return: list of column widths
    """
    widths = []
    for value, name in zip(port_bndg, self.get_port_naming()):
        content = max(len(str(round(value, 2))), len(name))
        widths.append(content + 2)
    return widths
def get_operand_suffix(self, instr_form):
    """
    Create the operand suffix for an instruction form.

    Only the entries between the first and the last element of ``instr_form``
    are treated as operands.

    Parameters
    ----------
    instr_form : [str, Parameter, ..., Parameter, str]
        Instruction Form data structure

    Returns
    -------
    str
        Operand suffix for searching in data file; empty string if the
        instruction form has no operands
    """
    if len(instr_form) <= 2:
        return ''
    suffixes = []
    for operand in instr_form[1:-1]:
        if isinstance(operand, Register) and operand.reg_type == 'GPR':
            # General purpose registers are encoded by their size
            suffixes.append('r' + str(operand.size))
        elif isinstance(operand, MemAddr):
            suffixes.append('mem')
        else:
            suffixes.append(str(operand).lower())
    return '-' + '_'.join(suffixes)
# Running this module as a script only prints a notice.
if __name__ == '__main__':
    print('Nothing to do.')

193
osaca/frontend.py Executable file
View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
import os
import re
from datetime import datetime as dt
from ruamel import yaml
from osaca import utils
from osaca.semantics import INSTR_FLAGS, KernelDG, SemanticsAppender
class Frontend(object):
def __init__(self, filename='', arch=None, path_to_yaml=None):
    """
    Load the architecture data used by the report printers.

    Exactly one of ``arch`` and ``path_to_yaml`` must be given.

    :param str filename: name of the analyzed file, shown in the report header
    :param str arch: architecture name; its lower-cased form plus ``.yml`` is
        looked up via ``utils.find_file``
    :param str path_to_yaml: path of a YAML data file to load directly
    :raises ValueError: if none or both of ``arch`` and ``path_to_yaml`` are given
    """
    self._filename = filename
    if not arch and not path_to_yaml:
        raise ValueError('Either arch or path_to_yaml required.')
    if arch and path_to_yaml:
        raise ValueError('Only one of arch and path_to_yaml is allowed.')
    self._arch = arch
    if arch:
        self._arch = arch.lower()
        yaml_path = utils.find_file(self._arch + '.yml')
    else:
        yaml_path = path_to_yaml
    # NOTE(review): yaml.Loader is the full (non-safe) loader; confirm that
    # user-supplied YAML paths are trusted.
    with open(yaml_path, 'r') as f:
        self._data = yaml.load(f, Loader=yaml.Loader)
def _is_comment(self, instruction_form):
return instruction_form['comment'] is not None and instruction_form['instruction'] is None
def print_throughput_analysis(self, kernel, show_lineno=False, show_cmnts=True):
    """
    Print the throughput analysis table for a kernel.

    :param kernel: list of instruction forms providing 'line_number',
        'port_pressure', 'flags', 'instruction' and 'line'
    :param bool show_lineno: keep the line number column in front of each row
    :param bool show_cmnts: if False, rows of comment-only lines are skipped
    """
    col_sep = '|'
    lineno_filler = ' ' if show_lineno else ''
    port_len = self._get_max_port_len(kernel)
    sep_list = self._get_separator_list(col_sep)

    # Horizontal rule below the port header
    separator = '-' * sum([x + 3 for x in port_len]) + '-'
    if show_lineno:
        separator += '--' + len(str(kernel[-1]['line_number'])) * '-'

    # Report header
    headline_str = '{{:^{}}}'.format(len(separator))
    print('\n\nThroughput Analysis Report\n' + '--------------------------')
    print(headline_str.format('Port pressure in cycles'))
    print(lineno_filler + self._get_port_number_line(port_len))
    print(separator)

    # One row per instruction form
    for instruction_form in kernel:
        line = '{:4d} {} {} {}'.format(
            instruction_form['line_number'],
            self._get_port_pressure(instruction_form['port_pressure'], port_len, sep_list),
            self._get_flag_symbols(instruction_form['flags'])
            if instruction_form['instruction'] is not None
            else ' ',
            instruction_form['line'].strip(),
        )
        if not show_lineno:
            # Drop everything in front of the first column separator
            line = col_sep + col_sep.join(line.split(col_sep)[1:])
        if show_cmnts is False and self._is_comment(instruction_form):
            continue
        print(line)

    # Sum of the port pressures over the whole kernel
    print()
    tp_sum = SemanticsAppender.get_throughput_sum(kernel)
    print(lineno_filler + self._get_port_pressure(tp_sum, port_len, ' '))
def _get_separator_list(self, separator, separator_2=' '):
separator_list = []
for i in range(len(self._data['ports']) - 1):
match_1 = re.search(r'\d+', self._data['ports'][i])
match_2 = re.search(r'\d+', self._data['ports'][i + 1])
if match_1 is not None and match_2 is not None and match_1.group() == match_2.group():
separator_list.append(separator_2)
else:
separator_list.append(separator)
separator_list.append(separator)
return separator_list
def _get_flag_symbols(self, flag_obj):
    """
    Translate the flags of an instruction form into their report symbols.

    :param flag_obj: container of INSTR_FLAGS values
    :return: concatenated symbols, or ' ' if none of the known flags is set
    """
    symbols = []
    if INSTR_FLAGS.NOT_BOUND in flag_obj:
        symbols.append('*')
    if INSTR_FLAGS.TP_UNKWN in flag_obj:
        symbols.append('X')
    if INSTR_FLAGS.HIDDEN_LD in flag_obj:
        symbols.append('P')
    # TODO add other flags
    return ''.join(symbols) if symbols else ' '
def _get_port_pressure(self, ports, port_len, separator='|'):
if not isinstance(separator, list):
separator = [separator for x in ports]
string_result = '{} '.format(separator[-1])
for i in range(len(ports)):
if float(ports[i]) == 0.0:
string_result += port_len[i] * ' ' + ' {} '.format(separator[i])
continue
left_len = len(str(float(ports[i])).split('.')[0])
substr = '{:' + str(left_len) + '.' + str(max(port_len[i] - left_len - 1, 0)) + 'f}'
string_result += substr.format(ports[i]) + ' {} '.format(separator[i])
return string_result[:-1]
def _get_max_port_len(self, kernel):
port_len = [4 for x in self._data['ports']]
for instruction_form in kernel:
for i, port in enumerate(instruction_form['port_pressure']):
if len('{:.2f}'.format(port)) > port_len[i]:
port_len[i] = len('{:.2f}'.format(port))
return port_len
def _get_port_number_line(self, port_len, separator='|'):
string_result = separator
separator_list = self._get_separator_list(separator, '-')
for i, length in enumerate(port_len):
substr = '{:^' + str(length + 2) + 's}'
string_result += substr.format(self._data['ports'][i]) + separator_list[i]
return string_result
def print_latency_analysis(self, cp_kernel, separator='|'):
    """
    Print the latency report: one row per instruction form, then the latency sum.

    :param cp_kernel: list of instruction forms providing 'line_number',
        'latency_cp', 'flags' and 'line'
    :param str separator: column separator
    """
    print('\n\nLatency Analysis Report\n' + '-----------------------')
    row_format = '{:4d} {} {:4.1f} {}{}{} {}'
    for instruction_form in cp_kernel:
        row = row_format.format(
            instruction_form['line_number'],
            separator,
            instruction_form['latency_cp'],
            separator,
            'X' if INSTR_FLAGS.LT_UNKWN in instruction_form['flags'] else ' ',
            separator,
            instruction_form['line'],
        )
        print(row)
    # Summary row: padding in place of line number and separator, then the sum
    lineno_padding = ' ' * max([len(str(instr_form['line_number'])) for instr_form in cp_kernel])
    separator_padding = ' ' * len(separator)
    latency_sum = sum([instr_form['latency_cp'] for instr_form in cp_kernel])
    print('\n{:4} {} {:4.1f}'.format(lineno_padding, separator_padding, latency_sum))
def print_loopcarried_dependencies(self, dep_dict, separator='|'):
    """
    Print one row per loop-carried dependency.

    Each row shows the key of the dependency, the summed 'latency_lcd' of its
    instruction forms, the line of its root and the line numbers involved.

    :param dep_dict: mapping from key to a dict with 'root' and 'dependencies'
    :param str separator: column separator
    """
    title = '\n\nLoop-Carried Dependencies Analysis Report\n'
    print(title + '-----------------------------------------')
    # TODO find a way to overcome padding for different tab-lengths
    row_format = '{:4d} {} {:4.1f} {} {:36}{} {}'
    for dep in dep_dict:
        chain = dep_dict[dep]['dependencies']
        latency_sum = sum([instr_form['latency_lcd'] for instr_form in chain])
        root_line = dep_dict[dep]['root']['line']
        line_numbers = [node['line_number'] for node in chain]
        print(
            row_format.format(
                dep, separator, latency_sum, separator, root_line, separator, line_numbers
            )
        )
def _print_header_report(self):
    """Print tool name and version, analyzed file, architecture and a UTC timestamp."""
    version = 'v0.3'
    adjust = 20
    timestamp = dt.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    header_lines = [
        'Open Source Architecture Code Analyzer (OSACA) - {}\n'.format(version),
        'Analyzed file:'.ljust(adjust) + '{}\n'.format(self._filename),
        'Architecture:'.ljust(adjust) + '{}\n'.format(self._arch),
        'Timestamp:'.ljust(adjust) + '{}\n'.format(timestamp),
    ]
    print(''.join(header_lines))
def _print_symbol_map(self):
    """Print the legend that explains the flag symbols used in the report."""
    symbol_dict = {
        INSTR_FLAGS.NOT_BOUND: 'Instruction micro-ops not bound to a port',
        INSTR_FLAGS.TP_UNKWN: 'No throughput/latency information for this instruction in '
        + 'data file',
        INSTR_FLAGS.HIDDEN_LD: 'Throughput of LOAD operation can be hidden behind a past '
        + 'or future STORE instruction',
    }
    entries = [
        ' {} - {}\n'.format(self._get_flag_symbols([flag]), symbol_dict[flag])
        for flag in sorted(symbol_dict.keys())
    ]
    print(''.join(entries), end='')
def _print_port_binding_summary(self):
raise NotImplementedError
def print_full_analysis(self, kernel, kernel_dg: KernelDG, verbose=False):
    """
    Print header, symbol legend and the throughput, latency and loop-carried
    dependency reports in that order.

    :param kernel: list of instruction forms of the kernel
    :param kernel_dg: dependency graph supplying the critical path and the
        loop-carried dependencies
    :param bool verbose: currently not used by this method
    """
    self._print_header_report()
    self._print_symbol_map()
    self.print_throughput_analysis(kernel, show_lineno=True)
    critical_path = kernel_dg.get_critical_path()
    self.print_latency_analysis(critical_path)
    loopcarried = kernel_dg.get_loopcarried_dependencies()
    self.print_loopcarried_dependencies(loopcarried)

View File

@@ -1,240 +0,0 @@
#!/usr/bin/env python3
import os
import re
import argparse
from osaca.testcase import Testcase
from osaca.param import Register, MemAddr, Parameter
#from testcase import Testcase
#from param import Register, MemAddr, Parameter
class InstrExtractor(object):
    """
    Count the instruction forms found in marked code regions of the given files.

    Every instruction form is counted in ``self.db``; for the first appearance
    of a form without label or empty operands a Testcase is written.
    """

    filepaths = []
    # Variables for checking lines
    numSeps = 0
    sem = 0
    # Class-level defaults only; every instance gets its own containers in
    # __init__ so instances do not share one database.
    db = {}
    sorted_db = []
    lncnt = 1
    cntChar = ''
    first = True
    # Constant variables
    MARKER = r'//STARTLOOP'
    ASM_LINE = re.compile(r'\s[0-9a-f]+[:]')

    def __init__(self, filepath):
        """
        :param filepath: list of paths of the files to check
        """
        self.filepaths = filepath
        # Per-instance state: mutating the class-level dict/list through self
        # would leak counts from one extractor into every other one.
        self.db = {}
        self.sorted_db = []

    def check_all(self):
        """Extract the instruction forms of every file in ``self.filepaths``."""
        for path in self.filepaths:
            self.extract_instr(path)

    def is_elffile(self, filepath):
        """Return True if ``filepath`` is a file containing the elf64 format note."""
        if os.path.isfile(filepath):
            with open(filepath) as f:
                src = f.read()
            if 'format elf64' in src:
                return True
        return False

    def extract_instr(self, asm_file):
        """
        Check every line of ``asm_file`` for instruction forms.

        Prints 'Invalid argument' and returns if the file is not accepted by
        ``is_elffile``.
        """
        # Check if parameter is in the correct file format
        if not self.is_elffile(asm_file):
            print('Invalid argument')
            return
        # Analyse code line by line and check the instructions; the context
        # manager closes the file even if a line cannot be processed.
        self.lncnt = 1
        with open(asm_file, 'r') as f:
            for line in f:
                self.check_line(line)
                self.lncnt += 1

    def check_line(self, line):
        """Track the marked region and pass its assembly lines to ``check_instr``."""
        # Check if MARKER is in line and count the number of whitespaces if so
        if self.MARKER in line:
            # But first, check if high level code is indented with whitespaces or tabs
            if self.first:
                self.set_counter_char(line)
                self.first = False
            self.numSeps = (re.split(self.MARKER, line)[0]).count(self.cntChar)
            self.sem = 2
        elif self.sem > 0:
            # We're in the marked code snippet
            # Check if the line is ASM code and - if not - check if we're still in the loop
            match = re.search(self.ASM_LINE, line)
            if match:
                # Further analysis of instructions
                # Check if there are comments in line
                if r'//' in line:
                    return
                self.check_instr(''.join(re.split(r'\t', line)[-1:]))
            elif (re.split(r'\S', line)[0]).count(self.cntChar) <= self.numSeps:
                # Not in the loop anymore - or yet - so we decrement the semaphore
                self.sem = self.sem - 1

    # Check if separator is either tabulator or whitespace
    def set_counter_char(self, line):
        """
        Detect the indentation character used in front of the marker.

        :raises NotImplementedError: if the indentation is mixed or absent
        """
        indentation = re.split(self.MARKER, line)[0]
        num_spaces = indentation.count(' ')
        num_tabs = indentation.count('\t')
        if num_spaces != 0 and num_tabs == 0:
            self.cntChar = ' '
        elif num_spaces == 0 and num_tabs != 0:
            self.cntChar = '\t'
        else:
            err_msg = 'Indentation of code is only supported for whitespaces and tabs.'
            raise NotImplementedError(err_msg)

    def check_instr(self, instr):
        """
        Classify the operands of one instruction and count its instruction form.

        On the first appearance of an instruction form without label or empty
        operands, a Testcase is created and written.
        """
        # Check for strange clang padding bytes
        while instr.startswith('data32'):
            instr = instr[7:]
        # Separate mnemonic and operands
        mnemonic = instr.split()[0]
        params = ''.join(instr.split()[1:])
        # Check if line is not only a byte
        empty_byte = re.compile(r'[0-9a-f]{2}')
        if re.match(empty_byte, mnemonic) and len(mnemonic) == 2:
            return
        # Check if there's one or more operand and store all in a list
        param_list = self.flatten(self.separate_params(params))
        op_list = list(param_list)
        # Check operands and separate them by IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or
        # LABEL (LBL)
        for i in range(len(param_list)):
            op = param_list[i]
            if len(op) <= 0:
                op = Parameter('NONE')
            elif op[0] == '$':
                op = Parameter('IMD')
            elif op[0] == '%' and '(' not in op:
                j = len(op)
                opmask = False
                if '{' in op:
                    j = op.index('{')
                    opmask = True
                op = Register(op[1:j], opmask)
            elif '<' in op:
                op = Parameter('LBL')
            else:
                op = MemAddr(op)
            param_list[i] = str(op) if (type(op) is not Register) else str(op) + str(op.size)
            op_list[i] = op
        # Join mnemonic and operand(s) to an instruction form
        if len(mnemonic) > 7:
            tabs = '\t'
        else:
            tabs = '\t\t'
        instr_form = mnemonic + tabs + (' '.join(param_list))
        # Check in data file for instruction form and increment the counter
        if instr_form in self.db:
            self.db[instr_form] = self.db[instr_form] + 1
        else:
            self.db[instr_form] = 1
            # Create testcase for instruction form, since it is the first appearance of it
            # Only create benchmark if no label (LBL) is part of the operands
            do_bench = True
            for par in op_list:
                if str(par) == 'LBL' or str(par) == '':
                    do_bench = False
            if do_bench:
                # Create testcase with reversed param list, due to the fact its intel syntax!
                tc = Testcase(mnemonic, list(reversed(op_list)), '64')
                tc.write_testcase()

    def separate_params(self, params):
        """
        Split an operand string at its top-level commas.

        :return: nested list ``[first, [second, [...]]]``; use ``flatten`` to
            get a flat list
        """
        param_list = [params]
        if ',' in params:
            if ')' in params:
                if params.index(')') < len(params) - 1 and params[params.index(')') + 1] == ',':
                    # Split behind the first closing parenthesis
                    i = params.index(')') + 1
                elif params.index('(') < params.index(','):
                    # The first comma is behind the opening parenthesis: do not split
                    return param_list
                else:
                    i = params.index(',')
            else:
                i = params.index(',')
            param_list = [params[:i], self.separate_params(params[i + 1:])]
        elif '#' in params:
            # Cut off everything from the '#' on
            i = params.index('#')
            param_list = [params[:i]]
        return param_list

    def sort_db(self):
        """Store the database entries sorted by descending count in ``self.sorted_db``."""
        self.sorted_db = sorted(self.db.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_db(self):
        """Print all instruction forms by descending count, followed by the total."""
        self.sort_db()
        total = 0
        print('Number of\tmnemonic')
        print('calls\n')
        for instr_form, count in self.sorted_db:
            print(str(count) + '\t\t' + instr_form)
            total += count
        print('\nCumulated number of instructions: ' + str(total))

    def save_db(self):
        """Write the database to '.cnt_asm_ops.db' in the current directory."""
        with open('.cnt_asm_ops.db', 'w') as file:
            for instr_form, count in self.db.items():
                file.write(instr_form + '\t' + str(count) + '\n')

    def load_db(self):
        """
        Read the database from '.cnt_asm_ops.db' in the current directory.

        Prints a notice and returns if the file does not exist.
        """
        try:
            file = open('.cnt_asm_ops.db', 'r')
        except FileNotFoundError:
            print('no data file found in current directory')
            return
        with file:
            for line in file:
                fields = line.split('\t')
                mnemonic = fields[0]
                # Join mnemonic and operand(s) to an instruction form.
                # Short mnemonics are stored with two tabs, which shifts the
                # remaining fields by one.
                if len(mnemonic) > 7:
                    tabs = '\t'
                    params = fields[1]
                    num_calls = fields[2]
                else:
                    tabs = '\t\t'
                    params = fields[2]
                    num_calls = fields[3]
                instr_form = mnemonic + tabs + params
                # int() ignores the trailing newline, if there is one
                self.db[instr_form] = int(num_calls)

    def flatten(self, l):
        """Return the elements of an arbitrarily nested list as one flat list."""
        if not l:
            return l
        if isinstance(l[0], list):
            return self.flatten(l[0]) + self.flatten(l[1:])
        return l[:1] + self.flatten(l[1:])
def main():
    """Command-line entry point: count the instruction forms of the given files."""
    # Parse args
    description = ('Returns a list of all instruction forms in the '
                   'given files sorted by their number of '
                   'occurrences.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-V', '--version', action='version', version='%(prog)s 0.2')
    parser.add_argument('filepath', nargs='+', help='path to objdump(s)')
    parser.add_argument('-l', '--load', dest='load', action='store_true',
                        help='load data file before checking new files')
    parser.add_argument('-s', '--store', dest='store', action='store_true',
                        help='store data file before checking new files')
    args = parser.parse_args()
    # Create the extractor for the given files
    extractor = InstrExtractor(args.filepath)
    # Do work
    if args.load:
        extractor.load_db()
    extractor.check_all()
    extractor.print_sorted_db()
    if args.store:
        extractor.save_db()
# ---------main method----------
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,142 +0,0 @@
#!/usr/bin/env python3
import re
class Parameter(object):
    """Operand of an instruction form, identified by one of the types in ``type_list``."""

    type_list = ['REG', 'MEM', 'IMD', 'LBL', 'NONE']

    def __init__(self, ptype):
        """
        :param str ptype: operand type, case-insensitive
        :raises NameError: if the type is not part of ``type_list``
        """
        normalized = ptype.upper()
        self.ptype = normalized
        if normalized not in self.type_list:
            raise NameError('Type not supported: '+ptype)

    def __str__(self):
        """Return the type name, or an empty string for the type 'NONE'."""
        return '' if self.ptype == 'NONE' else self.ptype
class MemAddr(Parameter):
    """Memory operand of the form ``offset(base, index, scale)``."""

    segment_regs = ['CS', 'DS', 'SS', 'ES', 'FS', 'GS']
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """
        :param str name: textual memory operand
        :raises ValueError: if ``name`` does not match the supported format
        """
        super().__init__("MEM")
        name = name.strip(', \t')
        self.offset = None
        self.base = None
        self.index = None
        self.scale = None
        pattern = (
            r'((?P<offset_hex>[x0-9a-fA-F]*)|(?P<offset_dec>\-?[0-9]*))'
            r'\((?P<base>[^,\)]+)(?:,\s*(?P<index>[^,\)]+)(?:,\s*'
            r'(?P<scale>[^,\)]+))?)?\)'
        )
        m = re.match(pattern, name)
        if not m:
            raise ValueError('Type not supported: {!r}'.format(name))
        # Empty matches are stored as None
        self.offset = m.group('offset_dec') or m.group('offset_hex') or None
        for part in ('base', 'index', 'scale'):
            setattr(self, part, m.group(part) or None)

    def __str__(self):
        """Return a pattern naming the address parts present, e.g. 'MEM(offset(base))'."""
        parts = ['MEM(']
        if self.offset:
            parts.append('offset')
        if self.base and not self.index:
            parts.append('(base)')
        elif self.base and self.index and self.scale:
            parts.append('(base, index, scale)')
        parts.append(')')
        return ''.join(parts)
def _build_register_sizes():
    """Build the table mapping a register name to (size in bits, register type)."""
    sizes = {}

    def add(names, bits, reg_type):
        for reg_name in names:
            sizes[reg_name] = (bits, reg_type)

    # R8 .. R15, used as stem for the sized variants R8L, R8B, R8W, R8D
    numbered = ['R{}'.format(n) for n in range(8, 16)]

    # General Purpose Registers
    add(['AH', 'AL', 'BH', 'BL', 'CH', 'CL', 'DH', 'DL', 'BPL', 'SIL', 'DIL', 'SPL'], 8, 'GPR')
    add([reg + 'L' for reg in numbered], 8, 'GPR')
    add([reg + 'B' for reg in numbered], 8, 'GPR')
    # 'BX' was misspelled as 'BC' before, which made Register('bx') fail
    add(['AX', 'BX', 'CX', 'DX', 'BP', 'SI', 'DI', 'SP'], 16, 'GPR')
    add([reg + 'W' for reg in numbered], 16, 'GPR')
    add(['EAX', 'EBX', 'ECX', 'EDX', 'EBP', 'ESI', 'EDI', 'ESP'], 32, 'GPR')
    add([reg + 'D' for reg in numbered], 32, 'GPR')
    add(['RAX', 'RBX', 'RCX', 'RDX', 'RBP', 'RSI', 'RDI', 'RSP'], 64, 'GPR')
    add(numbered, 64, 'GPR')
    add(['CS', 'DS', 'SS', 'ES', 'FS', 'GS'], 16, 'GPR')
    add(['EFLAGS', 'EIP'], 32, 'GPR')
    add(['RFLAGS', 'RIP'], 64, 'GPR')
    # FPU Registers
    add(['ST{}'.format(n) for n in range(8)], 80, 'FPU')
    # MMX Registers
    add(['MM{}'.format(n) for n in range(8)], 64, 'MMX')
    # XMM, YMM and ZMM Registers
    for prefix, bits in (('XMM', 128), ('YMM', 256), ('ZMM', 512)):
        add(['{}{}'.format(prefix, n) for n in range(32)], bits, prefix)
    # Opmask Register
    add(['K{}'.format(n) for n in range(8)], 64, 'K')
    # Bounds Registers
    add(['BND{}'.format(n) for n in range(4)], 128, 'BND')
    # Registers in general
    add(['R16'], 16, 'GPR')
    add(['R32'], 32, 'GPR')
    add(['R64'], 64, 'GPR')
    add(['FPU'], 80, 'FPU')
    add(['MMX'], 64, 'MMX')
    add(['XMM'], 128, 'XMM')
    add(['YMM'], 256, 'YMM')
    add(['ZMM'], 512, 'ZMM')
    add(['K'], 64, 'K')
    add(['BND'], 128, 'BND')
    return sizes


class Register(Parameter):
    """Register operand with its size in bits and its register type."""

    # Register name -> (size in bits, register type)
    sizes = _build_register_sizes()

    def __init__(self, name, mask=False):
        """
        :param str name: register name, case-insensitive
        :param bool mask: True if the register is used together with an opmask
        :raises NameError: if the register name is unknown
        """
        super().__init__("REG")
        self.name = name.upper()
        self.mask = mask
        if self.name in self.sizes:
            self.size = self.sizes[self.name][0]
            self.reg_type = self.sizes[self.name][1]
        else:
            raise NameError('Register name not in dictionary: {}'.format(self.name))

    def __str__(self):
        """Return the register type, followed by '{opmask}' if a mask is used."""
        opmask = ''
        if self.mask:
            opmask = '{opmask}'
        return self.reg_type + opmask

19
osaca/parser/__init__.py Normal file
View File

@@ -0,0 +1,19 @@
"""
Collection of parsers supported by OSACA.
Only the parsers listed in __all__ below are exported, so please add new parsers to __all__.
"""
from .attr_dict import AttrDict
from .base_parser import BaseParser
from .parser_x86att import ParserX86ATT
from .parser_AArch64v81 import ParserAArch64v81
__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64v81', 'get_parser']
def get_parser(isa):
    """
    Create the parser for the given instruction set architecture.

    :param str isa: ISA name, case-insensitive ('x86' or 'aarch64')
    :return: new parser instance
    :raises ValueError: if the ISA is unknown
    """
    name = isa.lower()
    if name == 'x86':
        return ParserX86ATT()
    if name == 'aarch64':
        return ParserAArch64v81()
    raise ValueError("Unknown ISA {!r}.".format(isa))

23
osaca/parser/attr_dict.py Executable file
View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
class AttrDict(dict):
    """Dictionary whose items can also be accessed as attributes."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # Use the dict itself as attribute storage
        self.__dict__ = self

    @staticmethod
    def convert_dict(dictionary):
        """
        Convert a dict, including the dicts nested in it, to AttrDict.

        A list is converted element by element. A dict given as input is
        modified in place: its nested dicts and lists are replaced by their
        converted versions. Any other value is returned unchanged.

        :param dictionary: dict, list or any other value
        :return: converted structure
        """
        if isinstance(dictionary, list):
            return [AttrDict.convert_dict(element) for element in dictionary]
        if not isinstance(dictionary, dict):
            return dictionary
        for key in list(dictionary.keys()):
            entry = dictionary[key]
            # AttrDict is a dict as well, so one check covers both
            if isinstance(entry, (dict, list)):
                dictionary[key] = AttrDict.convert_dict(entry)
        return AttrDict(dictionary)

74
osaca/parser/base_parser.py Executable file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
class BaseParser(object):
    """Common base of the assembly parsers; the parsing itself is done in derived classes."""

    # Identifiers for operand types
    COMMENT_ID = 'comment'
    DIRECTIVE_ID = 'directive'
    IMMEDIATE_ID = 'immediate'
    LABEL_ID = 'label'
    MEMORY_ID = 'memory'
    REGISTER_ID = 'register'
    INSTRUCTION_ID = 'instruction'
    OPERANDS_ID = 'operands'

    def __init__(self):
        self.construct_parser()

    def parse_file(self, file_content, start_line=0):
        '''
        Parse assembly code line by line into instruction forms.

        Blank lines are skipped but still counted for the line numbers. The
        marked kernel is *not* extracted here.

        :param str file_content: assembly code
        :param int start_line: offset, if first line in file_content is meant to be not 1
        :return: list of instruction forms
        '''
        return [
            self.parse_line(line, index + 1 + start_line)
            for index, line in enumerate(file_content.split('\n'))
            if line.strip() != ''
        ]

    def parse_line(self, line, line_number=None):
        """Parse one line; done in derived classes."""
        raise NotImplementedError()

    def parse_instruction(self, instruction):
        """Parse one instruction; done in derived classes."""
        raise NotImplementedError()

    def parse_register(self, register_string):
        raise NotImplementedError()

    def is_gpr(self, register):
        raise NotImplementedError()

    def is_vector_register(self, register):
        raise NotImplementedError()

    def get_reg_type(self, register):
        raise NotImplementedError()

    def construct_parser(self):
        """Hook called by the constructor; does nothing in the base class."""
        # raise NotImplementedError
        return None

    ##################
    # Helper functions
    ##################

    def process_operand(self, operand):
        raise NotImplementedError()

    def get_full_reg_name(self, register):
        raise NotImplementedError()

    def normalize_imd(self, imd):
        raise NotImplementedError()

    def is_reg_dependend_of(self, reg_a, reg_b):
        raise NotImplementedError()

421
osaca/parser/parser_AArch64v81.py Executable file
View File

@@ -0,0 +1,421 @@
#!/usr/bin/env python3
import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
class ParserAArch64v81(BaseParser):
def __init__(self):
    """Initialize the parser through the base-class constructor."""
    super().__init__()
def construct_parser(self):
    """
    Build the pyparsing grammars used by this parser.

    The grammars are stored on the instance as ``self.comment``, ``self.label``,
    ``self.directive``, ``self.list_element`` and ``self.instruction_parser``.
    """
    # Comment
    symbol_comment = '//'
    self.comment = pp.Literal(symbol_comment) + pp.Group(
        pp.ZeroOrMore(pp.Word(pp.printables))
    ).setResultsName(self.COMMENT_ID)
    # Define ARM assembly identifier
    relocation = pp.Combine(pp.Literal(':') + pp.Word(pp.alphanums + '_') + pp.Literal(':'))
    first = pp.Word(pp.alphas + '_.', exact=1)
    rest = pp.Word(pp.alphanums + '_.')
    identifier = pp.Group(
        pp.Optional(relocation).setResultsName('relocation')
        + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
    ).setResultsName('identifier')
    # Label
    self.label = pp.Group(
        identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
    ).setResultsName(self.LABEL_ID)
    # Directive
    decimal_number = pp.Combine(
        pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
    ).setResultsName('value')
    hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
    directive_option = pp.Combine(
        pp.Word(pp.alphas + '#@.%', exact=1)
        + pp.Optional(pp.Word(pp.printables + ' ', excludeChars=','))
    )
    directive_parameter = (
        pp.quotedString | directive_option | identifier | hex_number | decimal_number
    )
    commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
    self.directive = pp.Group(
        pp.Literal('.')
        + pp.Word(pp.alphanums + '_').setResultsName('name')
        + commaSeparatedList.setResultsName('parameters')
        + pp.Optional(self.comment)
    ).setResultsName(self.DIRECTIVE_ID)

    ##############################
    # Instructions
    # Mnemonic
    # (?P<instr>[a-zA-Z][a-zA-Z0-9]*)(?P<setflg>S?)(?P<CC>.[a-zA-Z]{2})
    mnemonic = pp.Word(pp.alphanums + '.').setResultsName('mnemonic')
    # Immediate:
    # int: ^-?[0-9]+ | hex: ^0x[0-9a-fA-F]+ | fp: ^[0-9]{1}.[0-9]+[eE]{1}[\+-]{1}[0-9]+[fF]?
    symbol_immediate = '#'
    mantissa = pp.Combine(
        pp.Optional(pp.Literal('-')) + pp.Word(pp.nums) + pp.Literal('.') + pp.Word(pp.nums)
    ).setResultsName('mantissa')
    exponent = (
        pp.CaselessLiteral('e')
        + pp.Word('+-').setResultsName('e_sign')
        + pp.Word(pp.nums).setResultsName('exponent')
    )
    float_ = pp.Group(
        mantissa + pp.Optional(exponent) + pp.CaselessLiteral('f')
    ).setResultsName('float')
    double_ = pp.Group(mantissa + pp.Optional(exponent)).setResultsName('double')
    # An immediate is either a number with optional '#' or an identifier with optional '#'
    immediate = pp.Group(
        pp.Optional(pp.Literal(symbol_immediate))
        + (hex_number ^ decimal_number ^ float_ ^ double_)
        | (pp.Optional(pp.Literal(symbol_immediate)) + identifier)
    ).setResultsName(self.IMMEDIATE_ID)
    shift_op = (
        pp.CaselessLiteral('lsl')
        ^ pp.CaselessLiteral('lsr')
        ^ pp.CaselessLiteral('asr')
        ^ pp.CaselessLiteral('ror')
        ^ pp.CaselessLiteral('sxtw')
        ^ pp.CaselessLiteral('uxtw')
    )
    # Immediate followed by a shift operation and the shift amount
    arith_immediate = pp.Group(
        immediate.setResultsName('base_immediate')
        + pp.Suppress(pp.Literal(','))
        + shift_op.setResultsName('shift_op')
        + immediate.setResultsName('shift')
    ).setResultsName(self.IMMEDIATE_ID)
    # Register:
    # scalar: [XWBHSDQ][0-9]{1,2} | vector: V[0-9]{1,2}\.[12468]{1,2}[BHSD]()?
    # define SP and ZR register aliases as regex, due to pyparsing does not support
    # proper lookahead
    alias_r31_sp = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))')
    alias_r31_zr = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))')
    scalar = pp.Word(pp.alphas, exact=1).setResultsName('prefix') + pp.Word(
        pp.nums
    ).setResultsName('name')
    index = pp.Literal('[') + pp.Word(pp.nums).setResultsName('index') + pp.Literal(']')
    vector = (
        pp.CaselessLiteral('v').setResultsName('prefix')
        + pp.Word(pp.nums).setResultsName('name')
        + pp.Literal('.')
        + pp.Optional(pp.Word('12468')).setResultsName('lanes')
        + pp.Word(pp.alphas, exact=1).setResultsName('shape')
        + pp.Optional(index)
    )
    self.list_element = vector ^ scalar
    # Register list in braces, given either as comma-separated list or as range
    register_list = (
        pp.Literal('{')
        + (
            pp.delimitedList(pp.Combine(self.list_element), delim=',').setResultsName('list')
            ^ pp.delimitedList(pp.Combine(self.list_element), delim='-').setResultsName(
                'range'
            )
        )
        + pp.Literal('}')
        + pp.Optional(index)
    )
    register = pp.Group(
        (alias_r31_sp | alias_r31_zr | vector | scalar | register_list)
        + pp.Optional(
            pp.Suppress(pp.Literal(','))
            + shift_op.setResultsName('shift_op')
            + immediate.setResultsName('shift')
        )
    ).setResultsName(self.REGISTER_ID)
    # Memory
    register_index = register.setResultsName('index') + pp.Optional(
        pp.Literal(',') + pp.Word(pp.alphas) + immediate.setResultsName('scale')
    )
    memory = pp.Group(
        pp.Literal('[')
        + pp.Optional(register.setResultsName('base'))
        + pp.Optional(pp.Suppress(pp.Literal(',')))
        + pp.Optional(register_index ^ immediate.setResultsName('offset'))
        + pp.Literal(']')
        + pp.Optional(
            pp.Literal('!').setResultsName('pre_indexed')
            | (pp.Suppress(pp.Literal(',')) + immediate.setResultsName('post_indexed'))
        )
    ).setResultsName(self.MEMORY_ID)
    # Prefetch operation: type, target and policy
    prefetch_op = pp.Group(
        pp.Group(pp.CaselessLiteral('PLD') ^ pp.CaselessLiteral('PST')).setResultsName('type')
        + pp.Group(
            pp.CaselessLiteral('L1') ^ pp.CaselessLiteral('L2') ^ pp.CaselessLiteral('L3')
        ).setResultsName('target')
        + pp.Group(pp.CaselessLiteral('KEEP') ^ pp.CaselessLiteral('STRM')).setResultsName(
            'policy'
        )
    ).setResultsName('prfop')
    # Combine to instruction form
    # Only the first operand may be a prefetch operation
    operand_first = pp.Group(
        register ^ (prefetch_op | immediate) ^ memory ^ arith_immediate ^ identifier
    )
    operand_rest = pp.Group((register ^ immediate ^ memory ^ arith_immediate) | identifier)
    # Mnemonic followed by up to four operands and an optional comment
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand_first.setResultsName('operand1'))
        + pp.Optional(pp.Suppress(pp.Literal(',')))
        + pp.Optional(operand_rest.setResultsName('operand2'))
        + pp.Optional(pp.Suppress(pp.Literal(',')))
        + pp.Optional(operand_rest.setResultsName('operand3'))
        + pp.Optional(pp.Suppress(pp.Literal(',')))
        + pp.Optional(operand_rest.setResultsName('operand4'))
        + pp.Optional(self.comment)
    )
def parse_line(self, line, line_number=None):
    """
    Parse line and return instruction form.

    The line is tried, in this order, as a comment, a label, a directive and an
    instruction. The first grammar that matches the whole line determines the result.

    :param str line: line of assembly code
    :param int line_number: default None, identifier of instruction form
    :return: parsed instruction form
    :raises ValueError: if the line matches none of the grammars
    """
    instruction_form = AttrDict(
        {
            self.INSTRUCTION_ID: None,
            self.OPERANDS_ID: None,
            self.DIRECTIVE_ID: None,
            self.COMMENT_ID: None,
            self.LABEL_ID: None,
            'line': line.strip(),
            'line_number': line_number,
        }
    )
    result = None

    # 1. Parse comment
    try:
        result = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
        result = AttrDict.convert_dict(result)
        instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
    except pp.ParseException:
        pass

    # 2. Parse label
    if result is None:
        try:
            result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
            result = AttrDict.convert_dict(result)
            instruction_form[self.LABEL_ID] = result[self.LABEL_ID].name
            if self.COMMENT_ID in result[self.LABEL_ID]:
                instruction_form[self.COMMENT_ID] = ' '.join(
                    result[self.LABEL_ID][self.COMMENT_ID]
                )
        except pp.ParseException:
            pass

    # 3. Parse directive
    if result is None:
        try:
            result = self.process_operand(
                self.directive.parseString(line, parseAll=True).asDict()
            )
            result = AttrDict.convert_dict(result)
            instruction_form[self.DIRECTIVE_ID] = AttrDict(
                {
                    'name': result[self.DIRECTIVE_ID].name,
                    'parameters': result[self.DIRECTIVE_ID].parameters,
                }
            )
            if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
                instruction_form[self.COMMENT_ID] = ' '.join(
                    result[self.DIRECTIVE_ID][self.COMMENT_ID]
                )
        except pp.ParseException:
            pass

    # 4. Parse instruction
    if result is None:
        try:
            result = self.parse_instruction(line)
        except (pp.ParseException, KeyError) as e:
            # Without a parse result there is nothing to copy into the instruction form;
            # fail with a clear error instead of subscripting None further down.
            raise ValueError(
                'Could not parse instruction on line {}: {!r}'.format(line_number, line)
            ) from e
        instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
        instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
        instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]
    return instruction_form
def parse_instruction(self, instruction):
    """
    Parse a single instruction line into mnemonic, operand list and comment.

    :param str instruction: line of assembly code containing an instruction
    :return: AttrDict with the instruction mnemonic, the processed operands and the
             comment (None if the line carries no comment)
    """
    parsed = AttrDict.convert_dict(
        self.instruction_parser.parseString(instruction, parseAll=True).asDict()
    )
    # Collect the (up to four) operands in their source order
    operands = [
        self.process_operand(parsed[key])
        for key in ('operand1', 'operand2', 'operand3', 'operand4')
        if key in parsed
    ]
    return AttrDict(
        {
            self.INSTRUCTION_ID: parsed.mnemonic,
            self.OPERANDS_ID: operands,
            self.COMMENT_ID: ' '.join(parsed[self.COMMENT_ID])
            if self.COMMENT_ID in parsed
            else None,
        }
    )
def process_operand(self, operand):
    """
    Bring a parsed operand into its final structure.

    Memory addresses, register lists/ranges, the ``sp`` register, immediates and labels
    are handed to their ``substitute_*`` method; everything else is returned unchanged.
    """
    # structure memory addresses
    if self.MEMORY_ID in operand:
        return self.substitute_memory_address(operand[self.MEMORY_ID])
    if self.REGISTER_ID in operand:
        register = operand[self.REGISTER_ID]
        # structure register lists
        # TODO: discuss if ranges should be converted to lists
        if 'list' in register or 'range' in register:
            return self.substitute_register_list(register)
        if register['name'] == 'sp':
            return self.substitute_sp_register(register)
    # add value attribute to floating point immediates without exponent
    if self.IMMEDIATE_ID in operand:
        return self.substitute_immediate(operand[self.IMMEDIATE_ID])
    if self.LABEL_ID in operand:
        return self.substitute_label(operand[self.LABEL_ID])
    return operand
def substitute_memory_address(self, memory_address):
    """
    Reduce a parsed memory address to offset, base, index and scale.

    Missing parts become None, the scale defaults to 1. Registers named ``sp`` get the
    prefix ``x``. A shifted index with one of the shift operations lsl/uxtw/sxtw yields
    the scale ``2 ** shift``. Pre-/post-indexing information is carried over if present.
    """
    # Remove unnecessarily created dictionary entries during parsing
    offset = memory_address['offset'] if 'offset' in memory_address else None
    base = memory_address['base'] if 'base' in memory_address else None
    index = memory_address['index'] if 'index' in memory_address else None
    for reg in (base, index):
        if reg is not None and 'name' in reg and reg['name'] == 'sp':
            reg['prefix'] = 'x'
    scale = 1
    if 'index' in memory_address and 'shift' in memory_address['index']:
        shifted_index = memory_address['index']
        if shifted_index['shift_op'].lower() in ['lsl', 'uxtw', 'sxtw']:
            scale = 2 ** int(shifted_index['shift']['value'])
    new_dict = AttrDict({'offset': offset, 'base': base, 'index': index, 'scale': scale})
    if 'pre_indexed' in memory_address:
        new_dict['pre_indexed'] = True
    if 'post_indexed' in memory_address:
        new_dict['post_indexed'] = memory_address['post_indexed']
    return AttrDict({self.MEMORY_ID: new_dict})
def substitute_sp_register(self, register):
    """Set the prefix of the given (stack pointer) register to ``x`` and wrap it again."""
    # the passed dictionary is modified in place
    register['prefix'] = 'x'
    return AttrDict({self.REGISTER_ID: register})
def substitute_register_list(self, register_list):
    """
    Re-parse every element of a register list or range with ``self.list_element``.

    :return: AttrDict holding the parsed elements under 'list' or 'range' and the
             optional 'index' (None if absent)
    """
    # Remove unnecessarily created dictionary entries during parsing
    dict_name = ''
    for key in ('list', 'range'):
        # 'range' takes precedence if both keys are present
        if key in register_list:
            dict_name = key
    vlist = [
        AttrDict.convert_dict(self.list_element.parseString(element, parseAll=True).asDict())
        for element in register_list[dict_name]
    ]
    index = register_list['index'] if 'index' in register_list else None
    return AttrDict({self.REGISTER_ID: AttrDict({dict_name: vlist, 'index': index})})
def substitute_immediate(self, immediate):
    """
    Normalize a parsed immediate.

    Identifiers are returned as they are. Integer values, arithmetic immediates and
    floating point values with exponent are wrapped unchanged. A floating point value
    without exponent is reduced to a 'value' entry holding its mantissa.
    """
    if 'identifier' in immediate:
        # actually an identifier, change declaration
        return immediate
    if 'value' in immediate or 'base_immediate' in immediate:
        # normal integer value or arithmetic immediate, nothing to do
        return AttrDict({self.IMMEDIATE_ID: immediate})
    dict_name = ''
    for key in ('float', 'double'):
        if key in immediate:
            dict_name = key
    fp_value = immediate[dict_name]
    if 'exponent' in fp_value:
        # nothing to do
        return AttrDict({self.IMMEDIATE_ID: immediate})
    # change 'mantissa' key to 'value'
    return AttrDict({self.IMMEDIATE_ID: AttrDict({'value': fp_value['mantissa']})})
def substitute_label(self, label):
    """Flatten the nested ``label['name']['name']`` entry and wrap the label again."""
    # remove duplicated 'name' level due to identifier
    identifier = label['name']
    label['name'] = identifier['name']
    return AttrDict({self.LABEL_ID: label})
def get_full_reg_name(self, register):
    """
    Return the textual name of a register: prefix plus name, followed by
    ``.<lanes><shape>`` if the register has a 'lanes' entry.
    """
    full_name = register['prefix'] + str(register['name'])
    if 'lanes' in register:
        full_name = full_name + '.' + str(register['lanes']) + register['shape']
    return full_name
def normalize_imd(self, imd):
    """
    Convert an immediate to a number.

    'value' entries are read as hexadecimal (``0x`` prefix) or decimal integers, 'float'
    and 'double' entries are converted by ``ieee_to_int``. Anything else (an identifier)
    is returned unchanged.
    """
    if 'value' in imd:
        text = imd['value']
        # hex, return decimal
        base = 16 if text.lower().startswith('0x') else 10
        return int(text, base)
    for fp_key in ('float', 'double'):
        if fp_key in imd:
            return self.ieee_to_int(imd[fp_key])
    # identifier
    return imd
def ieee_to_int(self, ieee_val):
    """
    Evaluate a parsed floating point value as ``mantissa * 10 ** exponent``.

    :param ieee_val: mapping with the keys 'mantissa', 'e_sign' and 'exponent'
    :return: the value as float (despite the method name)
    """
    exponent = int(ieee_val['exponent'], 10)
    if ieee_val['e_sign'] == '-':
        exponent = -exponent
    mantissa = float(ieee_val['mantissa'])
    return mantissa * (10 ** exponent)
def parse_register(self, register_string):
    """Not available for this parser; always raises NotImplementedError."""
    raise NotImplementedError
def is_gpr(self, register):
    """Return True if the register prefix is contained in ``'wx'``."""
    gpr_prefixes = 'wx'
    return register['prefix'] in gpr_prefixes
def is_vector_register(self, register):
    """Return True if the register prefix is contained in ``'bhsdqv'``."""
    vector_prefixes = 'bhsdqv'
    return register['prefix'] in vector_prefixes
def is_reg_dependend_of(self, reg_a, reg_b):
    """
    Return True if both registers have the same name and their (case-insensitive)
    prefixes belong to the same family: general purpose (``wx``) or vector (``bhsdqv``).
    """
    if reg_a['name'] != reg_b['name']:
        return False
    prefix_a = reg_a['prefix'].lower()
    for family in ('wx', 'bhsdqv'):
        if prefix_a in family and reg_b['prefix'].lower() in family:
            return True
    return False
def get_reg_type(self, register):
    """Return the register type, which is the register's prefix."""
    reg_type = register['prefix']
    return reg_type

328
osaca/parser/parser_x86att.py Executable file
View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
class ParserX86ATT(BaseParser):
def __init__(self):
    """Create the parser; all set-up is delegated to the base class initializer."""
    super().__init__()
def construct_parser(self):
    """
    Build the pyparsing grammars for x86 AT&T syntax.

    Sets ``self.comment``, ``self.label``, ``self.register``, ``self.directive`` and
    ``self.instruction_parser``.
    """

    def optional_comma():
        # separating comma: matched if present, but kept out of the parse result
        return pp.Optional(pp.Suppress(pp.Literal(',')))

    # Numbers
    dec_value = pp.Combine(pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)).setResultsName(
        'value'
    )
    hex_value = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')

    # Comment
    comment_words = pp.Group(pp.ZeroOrMore(pp.Word(pp.printables))).setResultsName(
        self.COMMENT_ID
    )
    self.comment = pp.Literal('#') + comment_words

    # Define x86 assembly identifier
    id_offset = pp.Word(pp.nums) + pp.Suppress(pp.Literal('+'))
    id_first_char = pp.Word(pp.alphas + '_.', exact=1)
    id_remainder = pp.Word(pp.alphanums + '$_.')
    id_name = pp.Combine(id_first_char + pp.Optional(id_remainder)).setResultsName('name')
    identifier = pp.Group(
        pp.Optional(id_offset).setResultsName('offset') + id_name
    ).setResultsName('identifier')

    # Label
    self.label = pp.Group(
        identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
    ).setResultsName(self.LABEL_ID)

    # Register: pp.Regex('^%[0-9a-zA-Z]+,?')
    register_mask = (
        pp.Literal('{')
        + pp.Literal('%')
        + pp.Word(pp.alphanums).setResultsName('mask')
        + pp.Literal('}')
    )
    self.register = pp.Group(
        pp.Literal('%') + pp.Word(pp.alphanums).setResultsName('name') + pp.Optional(register_mask)
    ).setResultsName(self.REGISTER_ID)

    # Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
    immediate = pp.Group(
        pp.Literal('$') + (hex_value | dec_value | identifier)
    ).setResultsName(self.IMMEDIATE_ID)

    # Memory: offset(base, index, scale)
    mem_offset = pp.Group(identifier | hex_value | dec_value).setResultsName(self.IMMEDIATE_ID)
    mem_scale = pp.Word('1248', exact=1)
    memory = pp.Group(
        pp.Optional(mem_offset.setResultsName('offset'))
        + pp.Literal('(')
        + pp.Optional(self.register.setResultsName('base'))
        + optional_comma()
        + pp.Optional(self.register.setResultsName('index'))
        + optional_comma()
        + pp.Optional(mem_scale.setResultsName('scale'))
        + pp.Literal(')')
    ).setResultsName(self.MEMORY_ID)

    # Directive
    directive_option = pp.Combine(
        pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
    )
    directive_parameter = (
        pp.quotedString
        | directive_option
        | identifier
        | hex_value
        | dec_value
        | self.register
    )
    directive_parameters = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
    self.directive = pp.Group(
        pp.Literal('.')
        + pp.Word(pp.alphanums + '_').setResultsName('name')
        + directive_parameters.setResultsName('parameters')
        + pp.Optional(self.comment)
    ).setResultsName(self.DIRECTIVE_ID)

    # Instructions
    # Mnemonic, optionally preceded by any number of data16/data32 tokens
    size_prefixes = pp.ZeroOrMore(pp.Literal('data16') | pp.Literal('data32'))
    mnemonic = size_prefixes + pp.Word(pp.alphanums).setResultsName('mnemonic')

    # Combine to instruction form; only the first operand may be a bare identifier
    operand_first = pp.Group(self.register ^ immediate ^ memory ^ identifier)
    operand_rest = pp.Group(self.register ^ immediate ^ memory)
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand_first.setResultsName('operand1'))
        + optional_comma()
        + pp.Optional(operand_rest.setResultsName('operand2'))
        + optional_comma()
        + pp.Optional(operand_rest.setResultsName('operand3'))
        + optional_comma()
        + pp.Optional(operand_rest.setResultsName('operand4'))
        + pp.Optional(self.comment)
    )
def parse_register(self, register_string):
    """
    Parse a string that consists of exactly one register.

    :param str register_string: register in AT&T syntax
    :return: the processed register operand, or None if the string is no register
    """
    try:
        parsed = self.register.parseString(register_string, parseAll=True).asDict()
        return self.process_operand(parsed)
    except pp.ParseException:
        return None
def parse_line(self, line, line_number=None):
    """
    Parse line and return instruction form.

    The grammars are tried in the order comment, label, directive, instruction; the
    first one matching the complete line fills the instruction form.

    :param str line: line of assembly code
    :param int line_number: default None, identifier of instruction form
    :return: parsed instruction form
    :raises ValueError: if the line cannot be parsed as an instruction either
    """
    instruction_form = AttrDict(
        {
            self.INSTRUCTION_ID: None,
            self.OPERANDS_ID: None,
            self.DIRECTIVE_ID: None,
            self.COMMENT_ID: None,
            self.LABEL_ID: None,
            'line': line.strip(),
            'line_number': line_number,
        }
    )
    parsed = None

    # 1. Parse comment
    try:
        parsed = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
        parsed = AttrDict.convert_dict(parsed)
        instruction_form[self.COMMENT_ID] = ' '.join(parsed[self.COMMENT_ID])
    except pp.ParseException:
        pass

    # 2. Parse label
    if parsed is None:
        try:
            parsed = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
            parsed = AttrDict.convert_dict(parsed)
            instruction_form[self.LABEL_ID] = parsed[self.LABEL_ID]['name']
            if self.COMMENT_ID in parsed[self.LABEL_ID]:
                instruction_form[self.COMMENT_ID] = ' '.join(
                    parsed[self.LABEL_ID][self.COMMENT_ID]
                )
        except pp.ParseException:
            pass

    # 3. Parse directive
    if parsed is None:
        try:
            parsed = self.process_operand(
                self.directive.parseString(line, parseAll=True).asDict()
            )
            parsed = AttrDict.convert_dict(parsed)
            instruction_form[self.DIRECTIVE_ID] = AttrDict(
                {
                    'name': parsed[self.DIRECTIVE_ID]['name'],
                    'parameters': parsed[self.DIRECTIVE_ID]['parameters'],
                }
            )
            if self.COMMENT_ID in parsed[self.DIRECTIVE_ID]:
                instruction_form[self.COMMENT_ID] = ' '.join(
                    parsed[self.DIRECTIVE_ID][self.COMMENT_ID]
                )
        except pp.ParseException:
            pass

    # 4. Parse instruction
    if parsed is None:
        try:
            parsed = self.parse_instruction(line)
        except pp.ParseException:
            raise ValueError(
                'Could not parse instruction on line {}: {!r}'.format(line_number, line)
            )
        for key in (self.INSTRUCTION_ID, self.OPERANDS_ID, self.COMMENT_ID):
            instruction_form[key] = parsed[key]
    return instruction_form
def parse_instruction(self, instruction):
    """
    Parse a single instruction line into mnemonic, operand list and comment.

    :param str instruction: line of assembly code containing an instruction
    :return: AttrDict with the instruction mnemonic, the processed operands and the
             comment (None if the line carries no comment)
    """
    parsed = AttrDict.convert_dict(
        self.instruction_parser.parseString(instruction, parseAll=True).asDict()
    )
    # Collect the (up to four) operands in their source order
    operands = [
        self.process_operand(parsed[key])
        for key in ('operand1', 'operand2', 'operand3', 'operand4')
        if key in parsed
    ]
    return AttrDict(
        {
            self.INSTRUCTION_ID: parsed['mnemonic'],
            self.OPERANDS_ID: operands,
            self.COMMENT_ID: ' '.join(parsed[self.COMMENT_ID])
            if self.COMMENT_ID in parsed
            else None,
        }
    )
def process_operand(self, operand):
    """
    Bring a parsed operand into its final structure.

    Memory addresses, immediates and labels are passed to their substitute method (in
    this order of precedence); any other operand is returned unchanged.
    """
    handlers = (
        (self.MEMORY_ID, 'substitute_memory_address'),
        (self.IMMEDIATE_ID, 'substitue_immediate'),
        (self.LABEL_ID, 'substitute_label'),
    )
    for key, method_name in handlers:
        if key in operand:
            return getattr(self, method_name)(operand[key])
    return operand
def substitute_memory_address(self, memory_address):
    """
    Reduce a parsed memory address to exactly offset, base, index and scale.

    Missing offset/base/index become None; the scale is converted to int and defaults
    to 1.
    """
    # Remove unnecessarily created dictionary entries during memory address parsing
    parts = {
        key: (memory_address[key] if key in memory_address else None)
        for key in ('offset', 'base', 'index')
    }
    parts['scale'] = int(memory_address['scale']) if 'scale' in memory_address else 1
    return AttrDict({self.MEMORY_ID: AttrDict(parts)})
def substitute_label(self, label):
    """Flatten the nested ``label['name']['name']`` entry and wrap the label again."""
    # remove duplicated 'name' level due to identifier
    identifier = label['name']
    label['name'] = identifier['name']
    return AttrDict({self.LABEL_ID: label})
def substitue_immediate(self, immediate):
    """
    Return identifiers as they are and wrap every other immediate under the
    immediate key.
    """
    is_identifier = 'identifier' in immediate
    if not is_identifier:
        # nothing to do besides wrapping
        return AttrDict({self.IMMEDIATE_ID: immediate})
    # actually an identifier, change declaration
    return immediate
def get_full_reg_name(self, register):
    """Return the register name; x86 register names need no further composition."""
    name = register['name']
    return name
def normalize_imd(self, imd):
    """
    Convert an immediate with a 'value' entry to int (hexadecimal if it starts with
    ``0x``, decimal otherwise). Anything else (an identifier) is returned unchanged.
    """
    if 'value' not in imd:
        # identifier
        return imd
    text = imd['value']
    # hex, return decimal
    base = 16 if text.lower().startswith('0x') else 10
    return int(text, base)
def is_reg_dependend_of(self, reg_a, reg_b):
    """
    Check whether two registers refer to (parts of) the same architectural register.

    Registers are dependent if they have the same name, if both are vector registers
    whose names agree after the first character (e.g. xmm1/ymm1), if both are legacy
    general purpose registers of the same alias group (e.g. rax/eax/al), or if both are
    numbered general purpose registers with the same number (e.g. r10/r10d).

    :param reg_a: register mapping with a 'name' entry
    :param reg_b: register mapping with a 'name' entry
    :return: True if the registers depend on each other, False otherwise
    """
    # subscripting (instead of attribute access) also works for plain dictionaries
    name_a = reg_a['name']
    name_b = reg_b['name']
    # Check if they are the same registers
    if name_a == name_b:
        return True
    # Check vector registers first
    if self.is_vector_register(reg_a):
        # Registers in the same vector space differ only in their first character
        return bool(self.is_vector_register(reg_b) and name_a[1:] == name_b[1:])
    # Check basic GPRs
    basic_gprs = (
        ('RAX', 'EAX', 'AX', 'AH', 'AL'),
        ('RBX', 'EBX', 'BX', 'BH', 'BL'),
        ('RCX', 'ECX', 'CX', 'CH', 'CL'),
        ('RDX', 'EDX', 'DX', 'DH', 'DL'),
        ('RSP', 'ESP', 'SP', 'SPL'),
        # the base pointer aliases like the stack pointer does
        ('RBP', 'EBP', 'BP', 'BPL'),
        ('RSI', 'ESI', 'SI', 'SIL'),
        ('RDI', 'EDI', 'DI', 'DIL'),
    )
    if self.is_basic_gpr(reg_a):
        if self.is_basic_gpr(reg_b):
            upper_a = name_a.upper()
            upper_b = name_b.upper()
            return any(upper_a in group and upper_b in group for group in basic_gprs)
        return False
    # Check other GPRs (r<number> with optional d/w/b suffix)
    gpr_parser = (
        pp.CaselessLiteral('R')
        + pp.Word(pp.nums).setResultsName('id')
        + pp.Optional(pp.Word('dwbDWB', exact=1))
    )
    try:
        id_a = gpr_parser.parseString(name_a, parseAll=True).asDict()['id']
        id_b = gpr_parser.parseString(name_b, parseAll=True).asDict()['id']
    except pp.ParseException:
        return False
    # No dependencies unless both refer to the same register number
    return id_a == id_b
def is_basic_gpr(self, register):
    """Return True if the register name contains no digit."""
    has_digit = any(char.isdigit() for char in register['name'])
    return not has_digit
def is_gpr(self, register):
    """
    Return True for a general purpose register: either a basic one (see
    ``is_basic_gpr``) or a numbered one of the form ``r<number>`` with an optional
    d/w/b suffix.
    """
    numbered_gpr = (
        pp.CaselessLiteral('R')
        + pp.Word(pp.nums).setResultsName('id')
        + pp.Optional(pp.Word('dwbDWB', exact=1))
    )
    if self.is_basic_gpr(register):
        return True
    try:
        numbered_gpr.parseString(register['name'], parseAll=True)
    except pp.ParseException:
        return False
    return True
def is_vector_register(self, register):
    """
    Return True if the register name is longer than two characters and its second and
    third character are ``mm`` (case-insensitive), e.g. xmm0, ymm1 or zmm2.
    """
    name = register['name']
    return len(name) > 2 and name[1:3].lower() == 'mm'
def get_reg_type(self, register):
    """
    Return ``'gpr'`` for general purpose registers and the lower-cased first three
    characters of the name (e.g. ``'xmm'``) for vector registers.

    :raises ValueError: if the register is neither of both
    """
    if self.is_gpr(register):
        return 'gpr'
    if self.is_vector_register(register):
        return register['name'][:3].lower()
    raise ValueError

View File

@@ -0,0 +1,11 @@
"""
Tools for semantic analysis of the parser result.
Only the names listed in __all__ below are exported, so please add new semantic tools to __all__.
"""
from .hw_model import MachineModel
from .kernel_dg import KernelDG
from .marker_utils import reduce_to_section
from .semantics_appender import SemanticsAppender, INSTR_FLAGS
__all__ = ['MachineModel', 'KernelDG', 'reduce_to_section', 'SemanticsAppender', 'INSTR_FLAGS']

404
osaca/semantics/hw_model.py Executable file
View File

@@ -0,0 +1,404 @@
#!/usr/bin/env python3
import re
from copy import deepcopy
from itertools import product
import ruamel.yaml
from ruamel.yaml.compat import StringIO
from osaca import __version__, utils
from osaca.parser import ParserX86ATT
class MachineModel(object):
def __init__(self, arch=None, path_to_yaml=None, isa=None):
    """
    Create a machine model.

    With ``arch`` the model is loaded from the file ``<arch>.yml`` located by
    ``utils.find_file``, with ``path_to_yaml`` from the given file. With only ``isa``
    an empty model skeleton for this ISA is created.

    :raises ValueError: if none of the arguments is given or if both ``arch`` and
                        ``path_to_yaml`` are given
    """
    if not arch and not path_to_yaml:
        if not isa:
            raise ValueError('One of arch, path_to_yaml and isa must be specified')
        # empty skeleton with one load throughput entry per addressing combination
        load_throughput = [
            {'base': b, 'index': i, 'offset': o, 'scale': s, 'port_pressure': []}
            for b, i, o, s in product(['gpr'], ['gpr', None], ['imd', None], [1, 8])
        ]
        self._data = {
            'osaca_version': str(__version__),
            'micro_architecture': None,
            'arch_code': None,
            'isa': isa,
            'ROB_size': None,
            'retired_uOps_per_cycle': None,
            'scheduler_size': None,
            'hidden_loads': None,
            'load_latency': {},
            'load_throughput': load_throughput,
            'ports': [],
            'port_model_scheme': None,
            'instruction_forms': [],
        }
        return
    if arch and path_to_yaml:
        raise ValueError('Only one of arch and path_to_yaml is allowed.')
    self._path = path_to_yaml
    self._arch = arch
    yaml = self._create_yaml_object()
    # exactly one of arch and path_to_yaml is set at this point
    if arch:
        self._arch = arch.lower()
        yaml_file = utils.find_file(self._arch + '.yml')
    else:
        yaml_file = self._path
    with open(yaml_file, 'r') as f:
        self._data = yaml.load(f)
def __getitem__(self, key):
    """Return configuration entry."""
    entry = self._data[key]
    return entry
def __contains__(self, key):
    """Return true if configuration key is present."""
    present = key in self._data
    return present
######################################################
def get_instruction(self, name, operands):
    """
    Find and return instruction data from name and operands.

    :return: the first instruction form whose name (case-insensitive) and operands
             match, or None if ``name`` is None or nothing matches
    :raises TypeError: re-raised (after printing name and operands) if matching fails
                       with a TypeError
    """
    if name is None:
        return None
    try:
        for instruction_form in self._data['instruction_forms']:
            if instruction_form['name'].upper() == name.upper() and self._match_operands(
                instruction_form['operands'], operands
            ):
                return instruction_form
    except TypeError as e:
        print('\nname: {}\noperands: {}'.format(name, operands))
        raise TypeError from e
    return None
def average_port_pressure(self, port_pressure):
    """
    Construct average port pressure list from instruction data.

    :param port_pressure: iterable of ``(cycles, ports)`` pairs
    :return: list with one float per model port; the cycles of every pair are spread
             evenly over the ports of that pair
    """
    model_ports = self._data['ports']
    pressure = [0.0] * len(model_ports)
    for cycles, used_ports in port_pressure:
        for port in used_ports:
            position = model_ports.index(port)
            pressure[position] += cycles / len(used_ports)
    return pressure
def set_instruction(
    self, name, operands=None, latency=None, port_pressure=None, throughput=None, uops=None
):
    """
    Import instruction form information.

    An existing entry for the same name and operands is overwritten, otherwise a new
    entry is appended to the instruction forms.
    """
    instr_data = self.get_instruction(name, operands)
    if instr_data is None:
        # not known yet, create a new entry
        instr_data = {}
        self._data['instruction_forms'].append(instr_data)
    fields = (
        ('name', name),
        ('operands', operands),
        ('latency', latency),
        ('port_pressure', port_pressure),
        ('throughput', throughput),
        ('uops', uops),
    )
    for key, value in fields:
        instr_data[key] = value
def set_instruction_entry(self, entry):
    """
    Import an instruction form given as mapping.

    'name' is mandatory; 'operands', 'latency', 'port_pressure', 'throughput' and
    'uops' default to None if missing.
    """
    name = entry['name']
    optional_fields = ('operands', 'latency', 'port_pressure', 'throughput', 'uops')
    values = [entry[field] if field in entry else None for field in optional_fields]
    self.set_instruction(name, *values)
def add_port(self, port):
    """Append ``port`` to the model's ports unless it is already listed."""
    ports = self._data['ports']
    if port in ports:
        return
    ports.append(port)
def get_ISA(self):
    """Return the lower-cased 'isa' entry of the model."""
    isa = self._data['isa']
    return isa.lower()
def get_arch(self):
    """Return the lower-cased 'arch_code' entry of the model."""
    arch_code = self._data['arch_code']
    return arch_code.lower()
def get_ports(self):
    """Return the model's list of ports (the stored list itself, not a copy)."""
    ports = self._data['ports']
    return ports
def has_hidden_loads(self):
    """Return the 'hidden_loads' entry of the model, or False if there is none."""
    if 'hidden_loads' not in self._data:
        return False
    return self._data['hidden_loads']
def get_load_latency(self, reg_type):
    """Return the 'load_latency' entry stored for the given register type."""
    latencies = self._data['load_latency']
    return latencies[reg_type]
def get_load_throughput(self, memory):
    """
    Return the port pressure of the first 'load_throughput' entry that matches the
    given memory operand, or None if no entry matches.
    """
    matching = [
        entry
        for entry in self._data['load_throughput']
        if self._match_mem_entries(memory, entry)
    ]
    if not matching:
        return None
    return matching[0]['port_pressure']
def _match_mem_entries(self, mem, i_mem):
    """
    Compare a memory operand with a model memory entry using the check of the model's
    ISA. Returns None if the ISA is neither aarch64 nor x86.
    """
    isa = self._data['isa'].lower()
    if isa == 'aarch64':
        return self._is_AArch64_mem_type(i_mem, mem)
    if isa == 'x86':
        return self._is_x86_mem_type(i_mem, mem)
    return None
def get_data_ports(self):
    """Return all ports whose name consists of digits followed by a single ``D``."""
    data_port = re.compile(r'^[0-9]+D$')
    return [port for port in self._data['ports'] if data_port.match(port)]
@staticmethod
def get_full_instruction_name(instruction_form):
    """
    Build a textual description ``<name> <class>(<attr>:<value>,...),...`` of an
    instruction form from its name and operands.
    """
    operand_strings = []
    for op in instruction_form['operands']:
        # every operand attribute except its class is listed inside the parentheses
        attributes = [key + ':' + str(op[key]) for key in op if key != 'class']
        operand_strings.append('{}({})'.format(op['class'], ','.join(attributes)))
    return '{} {}'.format(instruction_form['name'], ','.join(operand_strings))
@staticmethod
def get_isa_for_arch(arch):
    """
    Return the ISA ('aarch64' or 'x86') of a micro-architecture code.

    :param str arch: architecture code, case-insensitive
    :raises ValueError: if the architecture is unknown
    """
    arch_dict = {
        'tx2': 'aarch64',
        'zen1': 'x86',
        'snb': 'x86',
        'ivb': 'x86',
        'hsw': 'x86',
        'bdw': 'x86',
        'skl': 'x86',
        'skx': 'x86',
        'csx': 'x86',
        'wsm': 'x86',
        'nhm': 'x86',
        'kbl': 'x86',
        'cnl': 'x86',
        'cfl': 'x86',
        'zen+': 'x86',
    }
    arch = arch.lower()
    if arch not in arch_dict:
        raise ValueError("Unknown architecture {!r}.".format(arch))
    return arch_dict[arch].lower()
def dump(self, stream=None):
    """
    Dump the model as YAML.

    The instruction forms are written as a second YAML dump after all other entries,
    with their port pressure in flow style.

    :param stream: stream to write to; if not given, the YAML text is returned
    :return: the YAML text if no stream was given, otherwise None
    """
    # Replace instruction form's port_pressure with styled version for RoundtripDumper
    formatted_instruction_forms = deepcopy(self._data['instruction_forms'])
    for instruction_form in formatted_instruction_forms:
        flow_seq = ruamel.yaml.comments.CommentedSeq(instruction_form['port_pressure'])
        flow_seq.fa.set_flow_style()
        instruction_form['port_pressure'] = flow_seq

    # Create YAML object
    yaml = self._create_yaml_object()
    return_as_string = not stream
    if return_as_string:
        # Create stream object to output string
        stream = StringIO()

    yaml.dump({k: v for k, v in self._data.items() if k != 'instruction_forms'}, stream)
    yaml.dump({'instruction_forms': formatted_instruction_forms}, stream)

    if return_as_string:
        return stream.getvalue()
    return None
######################################################
def _check_for_duplicate(self, name, operands):
    """
    Return True if more than one instruction form matches the given name
    (case-insensitive) and operands.
    """
    wanted = name.lower()
    matches = [
        form
        for form in self._data['instruction_forms']
        if form['name'].lower() == wanted and self._match_operands(form['operands'], operands)
    ]
    return len(matches) > 1
def _match_operands(self, i_operands, operands):
    """
    Check whether the given operands match the operands of a model instruction form.

    :param i_operands: operand descriptions of the instruction form
    :param operands: operand list, or a dict holding the list under 'operand_list'
    :return: True if both have the same length and every pair passes
             ``_check_operands``, False otherwise
    """
    if isinstance(operands, dict):
        operands = operands['operand_list']
    if len(operands) != len(i_operands):
        return False
    return all(
        self._check_operands(i_operand, operand)
        for i_operand, operand in zip(i_operands, operands)
    )
def _check_operands(self, i_operands, operands):
    """
    Compare one operand with one model operand using the check of the model's ISA.
    Returns None if the ISA is neither aarch64 nor x86.
    """
    isa = self._data['isa'].lower()
    if isa == 'aarch64':
        return self._check_AArch64_operands(i_operands, operands)
    if isa == 'x86':
        return self._check_x86_operands(i_operands, operands)
    return None
def _check_AArch64_operands(self, i_operand, operand):
    """
    Check whether an AArch64 operand matches the operand description of a model entry.

    :param i_operand: operand description from the model
    :param operand: parsed operand, or another model operand (has a 'class' entry)
    :return: True if the operand matches, False otherwise
    """
    if 'class' in operand:
        # compare two DB entries
        return self._compare_db_entries(i_operand, operand)
    # register
    if 'register' in operand:
        if i_operand['class'] != 'register':
            return False
        return self._is_AArch64_reg_type(i_operand, operand['register'])
    # memory
    if 'memory' in operand:
        if i_operand['class'] != 'memory':
            return False
        return self._is_AArch64_mem_type(i_operand, operand['memory'])
    # immediate: the key in the operand determines the immediate type of the model entry
    has_immediate = 'immediate' in operand
    for key, imd_type in (('value', 'int'), ('float', 'float'), ('double', 'double')):
        if key in operand or (has_immediate and key in operand['immediate']):
            return i_operand['class'] == 'immediate' and i_operand['imd'] == imd_type
    if 'identifier' in operand or (has_immediate and 'identifier' in operand['immediate']):
        return i_operand['class'] == 'identifier'
    # prefetch option
    if 'prfop' in operand:
        return i_operand['class'] == 'prfop'
    # no match
    return False
def _check_x86_operands(self, i_operand, operand):
    """
    Check whether an x86 operand matches the operand description of a model entry.

    :param i_operand: operand description from the model
    :param operand: parsed operand, or another model operand (has a 'class' entry)
    :return: True if the operand matches, False otherwise
    """
    if 'class' in operand:
        # compare two DB entries
        return self._compare_db_entries(i_operand, operand)
    # register
    if 'register' in operand:
        if i_operand['class'] != 'register':
            return False
        return self._is_x86_reg_type(i_operand['name'], operand['register'])
    # memory
    if 'memory' in operand:
        if i_operand['class'] != 'memory':
            return False
        return self._is_x86_mem_type(i_operand, operand['memory'])
    # immediate
    if 'immediate' in operand or 'value' in operand:
        return i_operand['class'] == 'immediate' and i_operand['imd'] == 'int'
    # identifier (e.g., labels)
    if 'identifier' in operand:
        return i_operand['class'] == 'identifier'
    # no match; return an explicit False like the AArch64 check instead of None
    return False
def _compare_db_entries(self, operand_1, operand_2):
    """
    Compare two model operand entries.

    :return: True if every attribute of ``operand_1`` except 'source' and
             'destination' exists in ``operand_2`` with an equal value
    """
    compared_keys = [key for key in operand_1 if key != 'source' and key != 'destination']
    for key in compared_keys:
        try:
            differs = operand_1[key] != operand_2[key]
        except KeyError:
            # attribute is missing in the second operand
            return False
        if differs:
            return False
    return True
def _is_AArch64_reg_type(self, i_reg, reg):
    """
    Return True if the register has the prefix of the model register and, if the
    register has a shape, the model register has the same shape.
    """
    if reg['prefix'] != i_reg['prefix']:
        return False
    if 'shape' not in reg:
        return True
    return bool('shape' in i_reg and reg['shape'] == i_reg['shape'])
def _is_x86_reg_type(self, i_reg_name, reg):
    """
    Check whether a parsed x86 register is of the register type named in the model.

    :param i_reg_name: register type from the model: 'gpr' or a vector register type
                       such as 'xmm' (may be None for absent address parts)
    :param reg: parsed register with a 'name' entry
    :return: True if the types match, False otherwise
    """
    # This check runs once per operand and instruction form, so the helper parser is
    # created once per model and reused instead of being rebuilt on every call.
    parser_x86 = getattr(self, '_x86_parser', None)
    if parser_x86 is None:
        parser_x86 = ParserX86ATT()
        self._x86_parser = parser_x86
    # differentiate between vector registers (xmm, ymm, zmm) and others (gpr)
    if parser_x86.is_vector_register(reg):
        # compare case-insensitively, as is_vector_register itself does
        return isinstance(i_reg_name, str) and reg['name'][0:3].lower() == i_reg_name.lower()
    return i_reg_name == 'gpr'
def _is_AArch64_mem_type(self, i_mem, mem):
    """
    Check whether a parsed AArch64 memory operand matches a model memory entry.

    Base prefix, offset, index, scale and pre-/post-indexing are checked in this order;
    the first mismatch yields False.
    """
    # check base
    if not (mem['base']['prefix'] == i_mem['base']):
        return False
    # check offset: equal, or identifier/immediate matching the model's placeholder
    offset_ok = (
        mem['offset'] == i_mem['offset']
        or (
            mem['offset'] is not None
            and 'identifier' in mem['offset']
            and i_mem['offset'] == 'identifier'
        )
        or (
            mem['offset'] is not None
            and 'value' in mem['offset']
            and i_mem['offset'] == 'imd'
        )
    )
    if not offset_ok:
        return False
    # check index: equal, or a register with the prefix given in the model
    index_ok = mem['index'] == i_mem['index'] or (
        mem['index'] is not None
        and 'prefix' in mem['index']
        and mem['index']['prefix'] == i_mem['index']
    )
    if not index_ok:
        return False
    # check scale: equal, or both different from 1
    if not (mem['scale'] == i_mem['scale'] or (mem['scale'] != 1 and i_mem['scale'] != 1)):
        return False
    # check pre- and post-indexing
    if not (('pre_indexed' in mem) == (i_mem['pre-indexed'])):
        return False
    if not (('post_indexed' in mem) == (i_mem['post-indexed'])):
        return False
    return True
def _is_x86_mem_type(self, i_mem, mem):
    """
    Check whether a parsed x86 memory operand matches a model memory entry.

    Base register type, offset, index and scale are checked in this order; the first
    mismatch yields False.
    """
    # check base
    if not self._is_x86_reg_type(i_mem['base'], mem['base']):
        return False
    # check offset: equal, identifier/immediate matching the model's placeholder, or
    # an explicit zero offset where the model has none
    offset_ok = (
        mem['offset'] == i_mem['offset']
        or (
            mem['offset'] is not None
            and 'identifier' in mem['offset']
            and i_mem['offset'] == 'identifier'
        )
        or (
            mem['offset'] is not None
            and 'value' in mem['offset']
            and (
                i_mem['offset'] == 'imd'
                or (i_mem['offset'] is None and mem['offset']['value'] == '0')
            )
        )
    )
    if not offset_ok:
        return False
    # check index: equal, or a register of the type given in the model
    index_ok = mem['index'] == i_mem['index'] or (
        mem['index'] is not None
        and 'name' in mem['index']
        and self._is_x86_reg_type(i_mem['index'], mem['index'])
    )
    if not index_ok:
        return False
    # check scale: equal, or both different from 1
    if not (mem['scale'] == i_mem['scale'] or (mem['scale'] != 1 and i_mem['scale'] != 1)):
        return False
    return True
def _create_yaml_object(self):
    """
    Create the ruamel YAML object used for loading and dumping models: None is
    represented by ``__represent_none``, the line width is 120 and aliases are ignored.
    """
    yaml_obj = ruamel.yaml.YAML()
    representer = yaml_obj.representer
    representer.add_representer(type(None), self.__represent_none)
    yaml_obj.default_flow_style = None
    yaml_obj.width = 120
    # never emit anchors/aliases for repeated objects
    yaml_obj.representer.ignore_aliases = lambda *args: True
    return yaml_obj
def __represent_none(self, yaml_obj, data):
    """Represent None as the YAML null scalar ``~``."""
    null_tag = u'tag:yaml.org,2002:null'
    return yaml_obj.represent_scalar(null_tag, u'~')

335
osaca/semantics/kernel_dg.py Executable file
View File

@@ -0,0 +1,335 @@
#!/usr/bin/env python3
import copy
from itertools import chain, product
import networkx as nx
from osaca.parser import AttrDict
from osaca.semantics import MachineModel
class KernelDG(nx.DiGraph):
def __init__(self, parsed_kernel, parser, hw_model: MachineModel):
    """
    Build the dependency graph and the loop-carried dependencies of a kernel.

    :param parsed_kernel: list of parsed instruction forms
    :param parser: parser the kernel was parsed with
    :param hw_model: machine model of the target architecture
    """
    # NOTE(review): the initializer of the base class is not called here; the graph
    # is kept in self.dg instead -- confirm that this is intended.
    self.kernel = parsed_kernel
    self.parser = parser
    self.model = hw_model
    # the graph has to exist before the loop-carried dependencies are searched
    self.dg = self.create_DG(self.kernel)
    self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel)
def create_DG(self, kernel):
    """
    Create the dependency graph of a kernel.

    Every instruction form becomes a node identified by its line number. Instruction
    forms flagged 'performs_load' but not 'is_load_instruction' get an additional load
    node ``line_number + 0.1``. Edges point to dependent later instruction forms and
    carry the latency as edge attribute.

    :param kernel: list of instruction forms
    :return: the dependency graph as networkx DiGraph
    """
    # 1. go through kernel instruction forms and add them as node attribute
    # 2. find edges (to dependend further instruction)
    # 3. get LT value and set as edge weight
    dg = nx.DiGraph()
    for position, instruction_form in enumerate(kernel):
        line_no = instruction_form['line_number']
        dg.add_node(line_no)
        dg.nodes[line_no]['instruction_form'] = instruction_form
        # add load as separate node if existent
        # TODO use INSTR_FLAGS here
        flags = instruction_form['flags']
        if 'performs_load' in flags and 'is_load_instruction' not in flags:
            # add new node
            load_node = line_no + 0.1
            dg.add_node(load_node)
            dg.nodes[load_node]['instruction_form'] = instruction_form
            # and set LD latency as edge weight
            load_latency = instruction_form['latency'] - instruction_form['latency_wo_load']
            dg.add_edge(load_node, line_no, latency=load_latency)
        for dep in self.find_depending(instruction_form, kernel[position + 1:]):
            if 'latency_wo_load' in instruction_form:
                edge_weight = instruction_form['latency_wo_load']
            else:
                edge_weight = instruction_form['latency']
            dg.add_edge(line_no, dep['line_number'], latency=edge_weight)
            dg.nodes[dep['line_number']]['instruction_form'] = dep
    return dg
def check_for_loopcarried_dep(self, kernel):
"""
Search for loop-carried dependencies by analysing two consecutive copies of the kernel.

The kernel is duplicated, the line numbers of the copy are multiplied by
``len(kernel) + 1`` and a dependency graph is built over both copies. Paths that
lead from a node of the first copy to the node with the multiplied line number
are collected as loop-carried dependency candidates.

As a side effect, the ``latency_lcd`` entry of the affected instruction forms in
``self.kernel`` is reset and recomputed.

:param kernel: list of instruction forms; ``kernel[0]`` is accessed, so it must not be empty
:returns: dict mapping a root line number to ``{'root': ..., 'dependencies': [...]}``
"""
multiplier = len(kernel) + 1
# increase line number for second kernel loop
kernel_length = len(kernel)
first_line_no = kernel[0].line_number
# deep copy so that renumbering does not touch the original instruction forms
kernel_copy = [AttrDict.convert_dict(d) for d in copy.deepcopy(kernel)]
tmp_kernel = kernel + kernel_copy
for i, instruction_form in enumerate(tmp_kernel[kernel_length:]):
tmp_kernel[i + kernel_length].line_number = instruction_form.line_number * multiplier
# get dependency graph
dg = self.create_DG(tmp_kernel)
# build cyclic loop-carried dependencies
# only integer node ids below first_line_no * multiplier are used as start nodes;
# each is paired with all simple paths to the node id multiplied by `multiplier`
loopcarried_deps = [
(node, list(nx.algorithms.simple_paths.all_simple_paths(dg, node, node * multiplier)))
for node in dg.nodes
if node < first_line_no * multiplier and node == int(node)
]
# filter others and create graph
# flatten (root, [path, ...]) into one (root, path) tuple per path
loopcarried_deps = list(
chain.from_iterable(
[list(product([dep_chain[0]], dep_chain[1])) for dep_chain in loopcarried_deps]
)
)
# adjust line numbers, filter duplicates
# and add reference to kernel again
loopcarried_deps_dict = {}
tmp_list = []
for i, dep in enumerate(loopcarried_deps):
# keep only path nodes at or above first_line_no * multiplier and scale them back
nodes = [int(n / multiplier) for n in dep[1] if n >= first_line_no * multiplier]
loopcarried_deps[i] = (dep[0], nodes)
for dep in loopcarried_deps:
# drop a dependency if its nodes are a subset of another root's dependency
# and that other dependency also contains this root
is_subset = False
for other_dep in [x for x in loopcarried_deps if x[0] != dep[0]]:
if set(dep[1]).issubset(set(other_dep[1])) and dep[0] in other_dep[1]:
is_subset = True
if not is_subset:
tmp_list.append(dep)
loopcarried_deps = tmp_list
for dep in loopcarried_deps:
nodes = []
# reset the accumulated latency before summing it up again
for n in dep[1]:
self._get_node_by_lineno(int(n))['latency_lcd'] = 0
for n in dep[1]:
node = self._get_node_by_lineno(int(n))
if int(n) != n and int(n) in dep[1]:
node['latency_lcd'] += node['latency'] - node['latency_wo_load']
else:
node['latency_lcd'] += node['latency_wo_load']
nodes.append(node)
# keyed by root line number: a later path with the same root replaces an earlier entry
loopcarried_deps_dict[dep[0]] = {
'root': self._get_node_by_lineno(dep[0]),
'dependencies': nodes,
}
return loopcarried_deps_dict
def _get_node_by_lineno(self, lineno):
return [instr for instr in self.kernel if instr.line_number == lineno][0]
def get_critical_path(self):
    """Return the instruction forms on the longest latency path of ``self.dg``.

    The ``latency_cp`` entry of every form on the path is reset to zero and then
    recomputed from the node and edge latencies.

    :returns: the forms of ``self.kernel`` whose line number is on the path
    :raises NotImplementedError: if ``self.dg`` is not a directed acyclic graph
    """
    if not nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
        # split to DAG
        raise NotImplementedError('Kernel is cyclic.')
    path = nx.algorithms.dag.dag_longest_path(self.dg, weight='latency')
    for node_id in path:
        self._get_node_by_lineno(int(node_id))['latency_cp'] = 0
    # add LD latency to instruction
    for node_id in path:
        lineno = int(node_id)
        form = self._get_node_by_lineno(lineno)
        is_load_node = node_id != lineno
        if is_load_node and lineno in path:
            # separate load node: add the latency of its edge to the instruction
            form['latency_cp'] += self.dg.edges[(node_id, lineno)]['latency']
        elif (
            not is_load_node
            and 'mem_dep' in form
            and self.dg.has_edge(form['mem_dep']['line_number'], node_id)
        ):
            form['latency_cp'] += form['latency']
        elif 'latency_wo_load' in form:
            form['latency_cp'] += form['latency_wo_load']
        else:
            form['latency_cp'] += form['latency']
    return [form for form in self.kernel if form['line_number'] in path]
def get_loopcarried_dependencies(self):
    """Return the loop-carried dependencies computed at construction time.

    :returns: ``self.loopcarried_deps``
    :raises NotImplementedError: if ``self.dg`` is not a directed acyclic graph
    """
    graph_is_acyclic = nx.algorithms.dag.is_directed_acyclic_graph(self.dg)
    if not graph_is_acyclic:
        # split to DAG
        raise NotImplementedError('Kernel is cyclic.')
    return self.loopcarried_deps
def find_depending(self, instruction_form, kernel, include_write=False):
    """Yield the instruction forms of ``kernel`` that depend on ``instruction_form``.

    For every register destination the following forms are scanned until the
    register is overwritten. For memory destinations with pre- or post-indexing
    the base register is scanned the same way, and each yielded form gets a
    ``mem_dep`` entry pointing to ``instruction_form``.

    :param instruction_form: the producing instruction form
    :param kernel: the instruction forms following ``instruction_form``
    :param include_write: also yield the form that overwrites the register
    """
    if instruction_form.operands is None:
        return

    def scan(reg, mark_mem_dep):
        # walk forward: report readers of `reg`, stop at the first writer
        for candidate in kernel:
            if self.is_read(reg, candidate):
                if mark_mem_dep:
                    candidate['mem_dep'] = instruction_form
                yield candidate
            if not self.is_written(reg, candidate):
                continue
            if include_write:
                if mark_mem_dep:
                    candidate['mem_dep'] = instruction_form
                yield candidate
            break

    operands = instruction_form.operands
    for dst in operands.destination + operands.src_dst:
        if 'register' in dst:
            # Check for read of register until overwrite
            yield from scan(dst.register, False)
        elif 'memory' in dst:
            # Check if base register is altered during memory access
            memory = dst.memory
            if 'pre_indexed' in memory or 'post_indexed' in memory:
                # Check for read of base register until overwrite
                yield from scan(memory.base, True)
def get_dependent_instruction_forms(self, instr_form=None, line_number=None):
    """Return an iterator over the successors of a node in ``self.dg``.

    :param instr_form: instruction form whose ``line_number`` entry identifies the
        node; only used when ``line_number`` is not given
    :param line_number: node id to look up; takes precedence over ``instr_form``
    :returns: iterator over the successor node ids, empty if the node is unknown
    :raises ValueError: if neither ``instr_form`` nor ``line_number`` is given
    """
    if line_number is None:
        # compare against None instead of testing truthiness, so that a
        # legitimate line number of 0 is not mistaken for "not given"
        if not instr_form:
            raise ValueError('Either instruction form or line_number required.')
        line_number = instr_form['line_number']
    if self.dg.has_node(line_number):
        return self.dg.successors(line_number)
    return iter([])
def is_read(self, register, instruction_form):
    """Check whether ``instruction_form`` reads ``register``.

    Register source operands and the base/index registers of all memory
    operands (source and destination) are compared against ``register`` with
    ``self.parser.is_reg_dependend_of``.

    :param register: register to look for
    :param instruction_form: instruction form to inspect
    :returns: ``False`` if the form has no operands or no comparison matched
    """
    operands = instruction_form.operands
    if operands is None:
        return False

    def address_registers(operand):
        # base and index register of a memory operand, skipping unset ones
        memory = operand.memory
        return [reg for reg in (memory.base, memory.index) if reg is not None]

    candidates = []
    for src in operands.source + operands.src_dst:
        if 'register' in src:
            candidates.append(src.register)
        if 'memory' in src:
            candidates.extend(address_registers(src))
    # Check also if read in destination memory address
    for dst in operands.destination + operands.src_dst:
        if 'memory' in dst:
            candidates.extend(address_registers(dst))

    result = False
    for candidate in candidates:
        result = self.parser.is_reg_dependend_of(register, candidate) or result
    return result
def is_written(self, register, instruction_form):
    """Check whether ``instruction_form`` writes ``register``.

    Register destination operands and the base register of every pre- or
    post-indexed memory operand (destination and source) are compared against
    ``register`` with ``self.parser.is_reg_dependend_of``.

    :param register: register to look for
    :param instruction_form: instruction form to inspect
    :returns: ``False`` if the form has no operands or no comparison matched
    """
    operands = instruction_form.operands
    if operands is None:
        return False

    def updates_base(memory):
        # pre-/post-indexed addressing modifies the base register
        return 'pre_indexed' in memory or 'post_indexed' in memory

    targets = []
    for dst in operands.destination + operands.src_dst:
        if 'register' in dst:
            targets.append(dst.register)
        if 'memory' in dst and updates_base(dst.memory):
            targets.append(dst.memory.base)
    # Check also for possible pre- or post-indexing in memory addresses
    for src in operands.source + operands.src_dst:
        if 'memory' in src and updates_base(src.memory):
            targets.append(src.memory.base)

    result = False
    for target in targets:
        result = self.parser.is_reg_dependend_of(register, target) or result
    return result
def export_graph(self, filepath=None):
"""
Write a styled copy of the dependency graph to a DOT file.

Nodes on the critical path are drawn bold, nodes of a loop-carried dependency
are filled, edges are labelled with their latency and nodes are renamed to
``"<line>: <instruction>"`` (or ``"<line>: LOAD"`` for non-integer node ids).
``self.dg`` itself is not modified; a deep copy is styled instead.

:param filepath: target path; ``'osaca_dg.dot'`` is used when it is falsy
:raises NotImplementedError: propagated from ``get_critical_path`` and
    ``get_loopcarried_dependencies`` if the graph is cyclic
"""
graph = copy.deepcopy(self.dg)
# note: get_critical_path() also recomputes 'latency_cp' on the kernel's forms
cp = self.get_critical_path()
cp_line_numbers = [x['line_number'] for x in cp]
lcd = self.get_loopcarried_dependencies()
# root line number -> line numbers of the forms in that dependency
lcd_line_numbers = {}
for dep in lcd:
lcd_line_numbers[dep] = [x['line_number'] for x in lcd[dep]['dependencies']]
# add color scheme
graph.graph['node'] = {'colorscheme': 'accent8'}
graph.graph['edge'] = {'colorscheme': 'accent8'}
# create LCD edges
# one back edge per dependency, from its highest to its lowest line number
for dep in lcd_line_numbers:
min_line_number = min(lcd_line_numbers[dep])
max_line_number = max(lcd_line_numbers[dep])
graph.add_edge(max_line_number, min_line_number)
graph.edges[max_line_number, min_line_number]['latency'] = [
x for x in lcd[dep]['dependencies'] if x['line_number'] == max_line_number
][0]['latency_lcd']
# add label to edges
for e in graph.edges:
graph.edges[e]['label'] = graph.edges[e]['latency']
# add CP values to graph
for n in cp:
graph.nodes[n['line_number']]['instruction_form']['latency_cp'] = n['latency_cp']
# color CP and LCD
for n in graph.nodes:
if n in cp_line_numbers:
# graph.nodes[n]['color'] = 1
graph.nodes[n]['style'] = 'bold'
graph.nodes[n]['penwidth'] = 4
for col, dep in enumerate(lcd):
if n in lcd_line_numbers[dep]:
if 'style' not in graph.nodes[n]:
graph.nodes[n]['style'] = 'filled'
else:
graph.nodes[n]['style'] += ',filled'
# fill colour index is derived from the position of the dependency in `lcd`
graph.nodes[n]['fillcolor'] = 2 + col
# color edges
for e in graph.edges:
if (
graph.nodes[e[0]]['instruction_form']['line_number'] in cp_line_numbers
and graph.nodes[e[1]]['instruction_form']['line_number'] in cp_line_numbers
and e[0] < e[1]
):
# only embolden the edge if no other critical-path line lies between its ends
bold_edge = True
for i in range(e[0] + 1, e[1]):
if i in cp_line_numbers:
bold_edge = False
if bold_edge:
graph.edges[e]['style'] = 'bold'
graph.edges[e]['penwidth'] = 3
for dep in lcd_line_numbers:
if (
graph.nodes[e[0]]['instruction_form']['line_number'] in lcd_line_numbers[dep]
and graph.nodes[e[1]]['instruction_form']['line_number']
in lcd_line_numbers[dep]
):
graph.edges[e]['color'] = graph.nodes[e[1]]['fillcolor']
# rename node from [idx] to [idx mnemonic] and add shape
mapping = {}
for n in graph.nodes:
# non-integer node ids are the separate load nodes created in create_DG
if int(n) != n:
mapping[n] = '{}: LOAD'.format(int(n))
graph.nodes[n]['fontname'] = 'italic'
graph.nodes[n]['fontsize'] = 11.0
else:
node = graph.nodes[n]['instruction_form']
if node['instruction'] is not None:
mapping[n] = '{}: {}'.format(n, node['instruction'])
else:
# no instruction: name the node after what the line contains instead
label = 'label' if node['label'] else None
label = 'directive' if node['directive'] else label
label = 'comment' if node['comment'] and label is None else label
mapping[n] = '{}: {}'.format(n, label)
graph.nodes[n]['fontname'] = 'italic'
graph.nodes[n]['fontsize'] = 11.0
graph.nodes[n]['shape'] = 'rectangle'
# relabel in place (copy=False) so the styled attributes stay attached
nx.relabel.relabel_nodes(graph, mapping, copy=False)
if filepath:
nx.drawing.nx_agraph.write_dot(graph, filepath)
else:
nx.drawing.nx_agraph.write_dot(graph, 'osaca_dg.dot')

85
osaca/semantics/marker_utils.py Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
from osaca.parser import ParserAArch64v81, ParserX86ATT
def reduce_to_section(kernel, isa):
    """Cut ``kernel`` down to the lines between the start and end marker.

    :param kernel: list of parsed lines
    :param isa: ISA name, compared case-insensitively; ``'x86'`` or ``'aarch64'``
    :returns: ``kernel[start:end]`` for the marker positions found
    :raises ValueError: if the ISA is not supported
    :raises LookupError: if the start or the end marker was not found
    """
    arch = isa.lower()
    if arch not in ('x86', 'aarch64'):
        raise ValueError('ISA not supported.')
    finder = find_marked_kernel_x86ATT if arch == 'x86' else find_marked_kernel_AArch64
    first, last = finder(kernel)
    if first == -1:
        raise LookupError('Could not find START MARKER. Make sure it is inserted!')
    if last == -1:
        raise LookupError('Could not find END MARKER. Make sure it is inserted!')
    return kernel[first:last]
def find_marked_kernel_AArch64(lines):
    """Locate the marked kernel in AArch64 assembly.

    Delegates to ``find_marked_kernel`` with a ``mov`` of 111/222 into ``x1``
    as marker instruction and with reversed operand order.

    :param lines: list of parsed lines
    :returns: the ``(start, end)`` tuple returned by ``find_marked_kernel``
    """
    marker_bytes = ['213', '3', '32', '31']
    aarch64_parser = ParserAArch64v81()
    return find_marked_kernel(
        lines, aarch64_parser, ['mov'], 'x1', [111, 222], marker_bytes, reverse=True
    )
def find_marked_kernel_x86ATT(lines):
    """Locate the marked kernel in x86 AT&T assembly.

    Delegates to ``find_marked_kernel`` with a ``mov``/``movl`` of 111/222 into
    ``ebx`` as marker instruction.

    :param lines: list of parsed lines
    :returns: the ``(start, end)`` tuple returned by ``find_marked_kernel``
    """
    marker_bytes = ['100', '103', '144']
    x86_parser = ParserX86ATT()
    return find_marked_kernel(lines, x86_parser, ['mov', 'movl'], 'ebx', [111, 222], marker_bytes)
def find_marked_kernel(lines, parser, mov_instr, mov_reg, mov_vals, nop_bytes, reverse=False):
    """Find the start and end marker in a list of parsed lines.

    A marker is a mov instruction that loads one of ``mov_vals`` into
    ``mov_reg`` and is directly followed by byte directives matching
    ``nop_bytes``.

    :param lines: list of parsed lines
    :param parser: parser providing ``normalize_imd`` and ``get_full_reg_name``
    :param mov_instr: mnemonics accepted as marker instruction
    :param mov_reg: full name of the register the marker value is moved into
    :param mov_vals: ``[start_value, end_value]`` of the markers
    :param nop_bytes: byte sequence expected after the marker instruction
    :param reverse: if true, the destination is the first operand and the source
        the second one
    :returns: ``(index_start, index_end)``; ``index_start`` is the first line
        after the start marker, ``index_end`` the line of the end marker; ``-1``
        for a marker that was not found
    """
    index_start = -1
    index_end = -1
    for i, line in enumerate(lines):
        try:
            if (
                line.instruction in mov_instr
                # a marker needs a following line; without this bound check a mov
                # on the very last line raised an uncaught IndexError
                and i + 1 < len(lines)
                and lines[i + 1].directive is not None
            ):
                source = line.operands[0 if not reverse else 1]
                destination = line.operands[1 if not reverse else 0]
                # instruction pair matches, check for operands
                if 'immediate' in source:
                    value = parser.normalize_imd(source.immediate)
                    is_start = value == mov_vals[0]
                    is_end = not is_start and value == mov_vals[1]
                    if (
                        (is_start or is_end)
                        and 'register' in destination
                        and parser.get_full_reg_name(destination.register) == mov_reg
                    ):
                        # operands of first instruction match, check for second one
                        match, line_count = match_bytes(lines, i + 1, nop_bytes)
                        if match and is_start:
                            # return first line after the marker
                            index_start = i + 1 + line_count
                        elif match:
                            # return line of the marker
                            index_end = i
        except TypeError:
            # best effort: report the offending line and keep scanning
            print(i, line)
        if index_start != -1 and index_end != -1:
            break
    return index_start, index_end
def match_bytes(lines, index, byte_list):
    """Compare the byte directives starting at ``lines[index]`` with ``byte_list``.

    The parameters of all consecutive ``byte`` directives are collected, since
    the bytes may be given in one line or in separate ones.

    :param lines: list of parsed lines
    :param index: position of the first line to inspect
    :param byte_list: expected leading byte values
    :returns: ``(True, number_of_byte_lines)`` if the collected bytes start with
        ``byte_list``, otherwise ``(False, -1)``
    """
    collected = []
    consumed = 0
    for position in range(index, len(lines)):
        directive = lines[position].directive
        if directive is None or directive.name != 'byte':
            break
        consumed += 1
        collected += directive.parameters
    if collected[:len(byte_list)] != byte_list:
        return False, -1
    return True, consumed

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
import warnings
from functools import reduce
from osaca import utils
from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
from osaca.semantics import MachineModel
class INSTR_FLAGS:
"""
Flags used for unknown or special instructions
"""
# added by assign_tp_lt when a form found in the machine model also carries HAS_LD
LD = 'is_load_instruction'
# throughput is missing or the port pressure could not be imported
TP_UNKWN = 'tp_unknown'
# latency is missing
LT_UNKWN = 'lt_unknown'
# port pressure is zero on all ports although a throughput is known
NOT_BOUND = 'not_bound'
# added by set_hidden_loads to loads whose data port pressure was set to zero
HIDDEN_LD = 'hidden_load'
# added by assign_src_dst when a source or source/destination operand is a memory operand
HAS_LD = 'performs_load'
# added by assign_src_dst when a destination or source/destination operand is a memory operand
HAS_ST = 'performs_store'
class SemanticsAppender(object):
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
    """Set up the machine model, the ISA model and the matching parser.

    :param machine_model: machine model used for throughput/latency lookups
    :param path_to_yaml: accepted but not used by this constructor
    """
    self._machine_model = machine_model
    isa_name = machine_model.get_ISA().lower()
    self._isa = isa_name
    isa_file = utils.find_file('isa/' + isa_name + '.yml')
    self._isa_model = MachineModel(path_to_yaml=isa_file)
    # no parser is set for any other ISA name
    if isa_name == 'x86':
        self._parser = ParserX86ATT()
    elif isa_name == 'aarch64':
        self._parser = ParserAArch64v81()
# SUMMARY FUNCTION
def add_semantics(self, kernel):
    """Annotate every instruction form of ``kernel`` in place.

    Each form gets its source/destination operands and its throughput/latency
    assigned; afterwards hidden loads are marked if the machine model has any.

    :param kernel: list of instruction forms
    """
    for form in kernel:
        self.assign_src_dst(form)
        self.assign_tp_lt(form)
    if not self._machine_model.has_hidden_loads():
        return
    self.set_hidden_loads(kernel)
def set_hidden_loads(self, kernel):
    """Mark loads as hidden and clear their pressure on the data ports.

    Forms that both load and store are ignored. If there are at most as many
    loads as stores, every load is hidden; otherwise, for each store, the load
    with the smallest line distance that is not hidden yet is hidden.

    :param kernel: list of annotated instruction forms, modified in place
    """
    load_forms = [form for form in kernel if INSTR_FLAGS.HAS_LD in form['flags']]
    store_forms = [form for form in kernel if INSTR_FLAGS.HAS_ST in form['flags']]
    # Filter instructions including load and store
    load_lines = set(form['line_number'] for form in load_forms)
    store_lines = set(form['line_number'] for form in store_forms)
    both = list(load_lines.intersection(store_lines))
    load_forms = [form for form in load_forms if form['line_number'] not in both]
    store_forms = [form for form in store_forms if form['line_number'] not in both]

    if not store_forms or not load_forms:
        # nothing to do
        return

    def hide(load_form):
        load_form['flags'] += [INSTR_FLAGS.HIDDEN_LD]
        load_form['port_pressure'] = self._nullify_data_ports(load_form['port_pressure'])

    if len(load_forms) <= len(store_forms):
        # Hide all loads
        for load_form in load_forms:
            hide(load_form)
        return

    for store_form in store_forms:
        # Get 'closest' load instruction
        distances = [
            (abs(candidate['line_number'] - store_form['line_number']), candidate['line_number'])
            for candidate in load_forms
            if INSTR_FLAGS.HIDDEN_LD not in candidate['flags']
        ]
        closest_line = min(distances)[1]
        closest = [form for form in kernel if form['line_number'] == closest_line][0]
        # Hide load
        hide(closest)
# get parser result and assign throughput and latency value to instruction form
# mark instruction form with semantic flags
def assign_tp_lt(self, instruction_form):
    """Assign throughput, latency, port pressure and flags to an instruction form.

    The values are looked up in the machine model. If the form is unknown but
    performs a load, the register variant of the instruction is looked up and
    combined with the load throughput/latency of the machine model. Unknown
    values are set to 0.0 and flagged.

    Sets the entries ``port_pressure``, ``flags``, ``throughput``, ``latency``,
    ``latency_wo_load``, ``latency_cp`` and ``latency_lcd``.

    :param instruction_form: instruction form to annotate in place; expected to
        carry a ``flags`` entry whenever its instruction is not ``None``
    """
    flags = []
    port_number = len(self._machine_model['ports'])
    if instruction_form['instruction'] is None:
        # No instruction (label, comment, ...) --> ignore
        throughput = 0.0
        latency = 0.0
        latency_wo_load = latency
        instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
    else:
        instruction_data = self._machine_model.get_instruction(
            instruction_form['instruction'], instruction_form['operands']
        )
        if instruction_data:
            # instruction form in DB
            throughput = instruction_data['throughput']
            port_pressure = self._machine_model.average_port_pressure(
                instruction_data['port_pressure']
            )
            # validate explicitly instead of with assert statements, which are
            # removed when Python runs with -O and would let bad data through
            if isinstance(port_pressure, list) and len(port_pressure) == port_number:
                instruction_form['port_pressure'] = port_pressure
                if sum(port_pressure) == 0 and throughput is not None:
                    # port pressure on all ports 0 --> not bound to a port
                    flags.append(INSTR_FLAGS.NOT_BOUND)
            else:
                warnings.warn(
                    'Port pressure could not be imported correctly from database. '
                    + 'Please check entry for:\n {}'.format(instruction_form)
                )
                instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
                flags.append(INSTR_FLAGS.TP_UNKWN)
            if throughput is None:
                # assume 0 cy and mark as unknown
                throughput = 0.0
                flags.append(INSTR_FLAGS.TP_UNKWN)
            latency = instruction_data['latency']
            latency_wo_load = latency
            if latency is None:
                # assume 0 cy and mark as unknown
                latency = 0.0
                latency_wo_load = latency
                flags.append(INSTR_FLAGS.LT_UNKWN)
            if INSTR_FLAGS.HAS_LD in instruction_form['flags']:
                flags.append(INSTR_FLAGS.LD)
        else:
            # instruction could not be found in DB
            assign_unknown = True
            # check for equivalent register-operands DB entry if LD
            if INSTR_FLAGS.HAS_LD in instruction_form['flags']:
                # --> combine LD and reg form of instruction form
                operands = self.substitute_mem_address(instruction_form['operands'])
                instruction_data_reg = self._machine_model.get_instruction(
                    instruction_form['instruction'], operands
                )
                if instruction_data_reg:
                    assign_unknown = False
                    reg_types = [
                        self._parser.get_reg_type(op['register'])
                        for op in operands['operand_list']
                        if 'register' in op
                    ]
                    # port pressure of the load itself, taken from the first
                    # memory operand among the sources
                    load_port_pressure = self._machine_model.average_port_pressure(
                        self._machine_model.get_load_throughput(
                            [
                                x['memory']
                                for x in instruction_form['operands']['source']
                                if 'memory' in x
                            ][0]
                        )
                    )
                    if 'load_throughput_multiplier' in self._machine_model:
                        multiplier = self._machine_model['load_throughput_multiplier'][
                            reg_types[0]
                        ]
                        load_port_pressure = [pp * multiplier for pp in load_port_pressure]
                    throughput = max(
                        max(load_port_pressure), instruction_data_reg['throughput']
                    )
                    latency = (
                        self._machine_model.get_load_latency(reg_types[0])
                        + instruction_data_reg['latency']
                    )
                    latency_wo_load = instruction_data_reg['latency']
                    # total pressure = load pressure + pressure of the register form
                    instruction_form['port_pressure'] = [
                        sum(x)
                        for x in zip(
                            load_port_pressure,
                            self._machine_model.average_port_pressure(
                                instruction_data_reg['port_pressure']
                            ),
                        )
                    ]
            if assign_unknown:
                # --> mark as unknown and assume 0 cy for latency/throughput
                throughput = 0.0
                latency = 0.0
                latency_wo_load = latency
                instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
                flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
    # flatten flag list
    flags = list(set(flags))
    if 'flags' not in instruction_form:
        instruction_form['flags'] = flags
    else:
        instruction_form['flags'] += flags
    instruction_form['throughput'] = throughput
    instruction_form['latency'] = latency
    instruction_form['latency_wo_load'] = latency_wo_load
    # for later CP and loop-carried dependency analysis
    instruction_form['latency_cp'] = 0
    instruction_form['latency_lcd'] = 0
def substitute_mem_address(self, operands):
    """Return a copy of ``operands`` with every memory operand replaced by a register.

    The register type is taken from the first register operand in
    ``operands['operand_list']``; a warning is issued if the register operands
    do not all share one type.

    :param operands: dict with the keys ``source``, ``destination``, ``src_dst``
        and ``operand_list``; ``operand_list`` must contain at least one register
    :returns: dict with the same four keys and substituted operand lists
    """
    regs = [op for op in operands['operand_list'] if 'register' in op]
    if (
        len(regs) > 1
        and len(set([self._parser.get_reg_type(x['register']) for x in regs])) != 1
    ):
        warnings.warn('Load type could not be identified clearly.')
    reg_type = self._parser.get_reg_type(regs[0]['register'])

    def substitute(operand_group):
        # keep non-memory operands, swap memory operands for a register
        return [
            operand if 'memory' not in operand else self.convert_mem_to_reg(operand, reg_type)
            for operand in operand_group
        ]

    return {
        'source': substitute(operands['source']),
        'destination': substitute(operands['destination']),
        # was built from operands['destination'] before, which duplicated the
        # destinations and dropped the real source/destination operands
        'src_dst': substitute(operands['src_dst']),
        'operand_list': substitute(operands['operand_list']),
    }
def convert_mem_to_reg(self, memory, reg_type, reg_id='0'):
    """Create a register operand that stands in for a memory operand.

    :param memory: the memory operand to replace; it is not inspected
    :param reg_type: register type, used as name prefix (x86) or as ``prefix``
        entry (AArch64)
    :param reg_id: register id, defaults to ``'0'``
    :returns: dict of the form ``{'register': {...}}``
    :raises ValueError: if the ISA is neither ``'x86'`` nor ``'aarch64'``
    """
    if self._isa == 'x86':
        return {'register': {'name': reg_type + reg_id}}
    if self._isa == 'aarch64':
        return {'register': {'prefix': reg_type, 'name': reg_id}}
    # previously this fell through to an unbound local variable
    raise ValueError('ISA not supported.')
# get parser result and assign operands to
# - source
# - destination
# - source/destination
def assign_src_dst(self, instruction_form):
    """Split the operands of an instruction form into source and destination.

    The ``operands`` entry is replaced by a dict with the keys ``source``,
    ``destination``, ``src_dst`` and ``operand_list`` (the original operands).
    The load/store flags are appended to the ``flags`` entry afterwards.

    :param instruction_form: instruction form to annotate in place; nothing is
        changed if its ``operands`` entry is ``None``
    """
    operands = instruction_form['operands']
    # if the instruction form doesn't have operands, there's nothing to do
    if operands is None:
        return
    # check if instruction form is in ISA yaml, otherwise apply standard operand assignment
    # (one dest, others source)
    isa_data = self._isa_model.get_instruction(instruction_form['instruction'], operands)
    if isa_data is None:
        # no irregular operand structure, apply default
        grouped = {
            'source': self._get_regular_source_operands(instruction_form),
            'destination': self._get_regular_destination_operands(instruction_form),
            'src_dst': [],
        }
    else:
        # load src/dst structure from isa_data
        grouped = {'source': [], 'destination': [], 'src_dst': []}
        for position, spec in enumerate(isa_data['operands']):
            if spec['source'] and spec['destination']:
                role = 'src_dst'
            elif spec['source']:
                role = 'source'
            elif spec['destination']:
                role = 'destination'
            else:
                continue
            grouped[role].append(operands[position])
    # store operand list in dict and reassign operand key/value pair
    grouped['operand_list'] = operands
    instruction_form['operands'] = AttrDict.convert_dict(grouped)
    # assign LD/ST flags
    if 'flags' not in instruction_form:
        instruction_form['flags'] = []
    if self._has_load(instruction_form):
        instruction_form['flags'] += [INSTR_FLAGS.HAS_LD]
    if self._has_store(instruction_form):
        instruction_form['flags'] += [INSTR_FLAGS.HAS_ST]
def _nullify_data_ports(self, port_pressure):
data_ports = self._machine_model.get_data_ports()
for port in data_ports:
index = self._machine_model.get_ports().index(port)
port_pressure[index] = 0.0
return port_pressure
def _has_load(self, instruction_form):
for operand in (
instruction_form['operands']['source'] + instruction_form['operands']['src_dst']
):
if 'memory' in operand:
return True
return False
def _has_store(self, instruction_form):
for operand in (
instruction_form['operands']['destination'] + instruction_form['operands']['src_dst']
):
if 'memory' in operand:
return True
return False
def _get_regular_source_operands(self, instruction_form):
if self._isa == 'x86':
return self._get_regular_source_x86ATT(instruction_form)
if self._isa == 'aarch64':
return self._get_regular_source_AArch64(instruction_form)
def _get_regular_destination_operands(self, instruction_form):
if self._isa == 'x86':
return self._get_regular_destination_x86ATT(instruction_form)
if self._isa == 'aarch64':
return self._get_regular_destination_AArch64(instruction_form)
def _get_regular_source_x86ATT(self, instruction_form):
# return all but last operand
sources = [
op for op in instruction_form['operands'][0 : len(instruction_form['operands']) - 1]
]
return sources
def _get_regular_source_AArch64(self, instruction_form):
# return all but first operand
sources = [
op for op in instruction_form['operands'][1 : len(instruction_form['operands'])]
]
return sources
def _get_regular_destination_x86ATT(self, instruction_form):
# return last operand
return instruction_form['operands'][-1:]
def _get_regular_destination_AArch64(self, instruction_form):
# return first operand
return instruction_form['operands'][:1]
@staticmethod
def get_throughput_sum(kernel):
tp_sum = reduce(
(lambda x, y: [sum(z) for z in zip(x, y)]),
[instr['port_pressure'] for instr in kernel],
)
tp_sum = [round(x, 2) for x in tp_sum]
return tp_sum

View File

@@ -1,410 +0,0 @@
#!/usr/bin/env python3
import os
from subprocess import call
from math import ceil
from osaca.param import Register, MemAddr, Parameter
#from param import Register, MemAddr, Parameter
class Testcase(object):
# ------------------Constant variables--------------------------
# Lookup tables for regs
gprs64 = ['rax', 'rbx', 'rcx', 'rdx', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15']
gprs32 = ['eax', 'ebx', 'ecx', 'edx', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d']
gprs16 = ['ax', 'bx', 'cx', 'dx', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w']
gprs8 = ['al', 'bl', 'cl', 'dl', 'r9l', 'r10l', 'r11l', 'r12l', 'r13l', 'r14l', 'r15l']
fpus = ['st0', 'st1', 'st2', 'st3', 'st4', 'st5', 'st6', 'st7']
mmxs = ['mm0', 'mm1', 'mm2', 'mm3', 'mm4', 'mm5', 'mm6', 'mm7']
ks = ['k0', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7']
bnds = ['bnd0', 'bnd1', 'bnd2', 'bnd3', 'bnd4', 'bnd5', 'bnd6', 'bnd7']
xmms = ['xmm0', 'xmm1', 'xmm2', 'xmm3', 'xmm4', 'xmm5', 'xmm6', 'xmm7', 'xmm8', 'xmm9',
'xmm10', 'xmm11', 'xmm12', 'xmm13', 'xmm14', 'xmm15']
ymms = ['ymm0', 'ymm1', 'ymm2', 'ymm3', 'ymm4', 'ymm5', 'ymm6', 'ymm7', 'ymm8', 'ymm9',
'ymm10', 'ymm11', 'ymm12', 'ymm13', 'ymm14', 'ymm15']
zmms = ['zmm0', 'zmm1', 'zmm2', 'zmm3', 'zmm4', 'zmm5', 'zmm6', 'zmm7', 'zmm8', 'zmm9',
'zmm10', 'zmm11', 'zmm12', 'zmm13', 'zmm14', 'zmm15']
# Lookup table for memory
mems = ['[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]',
'[rip+PI]']
# Lookup table for immediates
imds = ['1', '2', '13', '22', '8', '78', '159', '222', '3', '9', '5', '55', '173', '317',
'254', '255']
# TODO Differentiate between AVX512 (with additional xmm16-31) and the rest
# ...
# ...
# end TODO
ops = {'gpr64': gprs64, 'gpr32': gprs32, 'gpr16': gprs16, 'gpr8': gprs8, 'fpu': fpus,
'mmx': mmxs, 'k': ks, 'bnd': bnds, 'xmm': xmms, 'ymm': ymms, 'zmm': zmms, 'mem': mems,
'imd': imds}
# Create Single Precision 1.0
sp1 = ('\t\t# create SP 1.0\n'
'\t\tvpcmpeqw xmm0, xmm0, xmm0\n'
'\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n'
'\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading '
'mantissa bit is zero\n'
'\t\t# copy SP 1.0\n')
# Create Double Precision 1.0
dp1 = ('\t\t# create DP 1.0\n'
'\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n'
'\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n'
'\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading '
'mantissa bit is zero\n')
# Create epilogue
done = ('done:\n'
'\t\tmov\trsp, rbp\n'
'\t\tpop\trbp\n'
'\t\tret\n'
'.size latency, .-latency')
# ----------------------------------------------------------------
# Constructor
def __init__(self, _mnemonic, _param_list, _num_instr='32'):
    """
    Create the building blocks of the latency and throughput testcase.

    Parameters
    ----------
    _mnemonic : str
        Mnemonic of the instruction; stored in lower case
    _param_list : list
        Operands of the instruction
    _num_instr : str or int
        Number of instructions in the loop body; rounded up to an even number
        (default '32')
    """
    self.instr = _mnemonic.lower()
    self.param_list = _param_list
    # num_instr must be an even number
    self.num_instr = str(ceil(int(_num_instr)/2)*2)
    # Check for the number of operands and initialise the GPRs if necessary
    (
        self.op_a,
        self.op_b,
        self.op_c,
        self.gprPush,
        self.gprPop,
        self.zeroGPR,
        self.copy,
    ) = self.__define_operands()
    self.num_operands = len(self.param_list)
    # Create asm header
    self.def_instr, self.ninstr, self.init, self.expand = self.__define_header()
    # Create latency and throughput loop
    self.loop_lat = self.__define_loop_lat()
    self.loop_thrpt = self.__define_loop_thrpt()

    def short_name(operand_type):
        # 'gpr64' becomes 'r64'; every other operand type is kept as is
        if 'gpr' not in operand_type:
            return operand_type
        return 'r' + operand_type[3:]

    # Create extension for testcase name
    count = self.num_operands
    sep0 = '-' if (count > 0) else ''
    sep1 = '_' if (count > 1) else ''
    sep2 = '_' if (count > 2) else ''
    self.extension = (
        sep0 + short_name(self.op_a)
        + sep1 + short_name(self.op_b)
        + sep2 + short_name(self.op_c)
    )
def write_testcase(self, tp=True, lt=True):
    """
    Write testcase for class attributes in a file.

    The files are written to ``~/.osaca/benchmarks/``; the directory is created
    if it does not exist. The latency testcase is named
    ``<instr><extension>.S``, the throughput testcase ``<instr><extension>-TP.S``.

    Parameters
    ----------
    tp : bool
        Controls if throughput testcase should be written
        (default True)
    lt : bool
        Controls if latency testcase should be written
        (default True)
    """
    benchmark_dir = os.path.join(os.path.expanduser('~'), '.osaca', 'benchmarks')
    targets = []
    if lt:
        targets.append(('.S', 'loop_lat'))
    if tp:
        targets.append(('-TP.S', 'loop_thrpt'))
    for suffix, loop_attr in targets:
        # os.makedirs replaces the former `mkdir -p` subprocess call
        os.makedirs(benchmark_dir, exist_ok=True)
        data = (self.def_instr + self.ninstr + self.init + self.dp1 + self.expand + self.gprPush
                + self.zeroGPR + self.copy + getattr(self, loop_attr) + self.gprPop + self.done)
        path = os.path.join(benchmark_dir, self.instr + self.extension + suffix)
        # the context manager closes the file even if writing fails
        with open(path, 'w') as f:
            f.write(data)
# Check operands
def __define_operands(self):
    """
    Check for the number of operands and initialise the GPRs if necessary.

    Returns
    -------
    (str, str, str, str, str, str, str)
        String tuple containing the types of the three operands, the push and
        pop operations and the initialisation of the general purpose registers
        (empty if no GPR operand is used) and the copy of registers.
    """
    operands = self.param_list
    op_a, op_b, op_c = ('', '', '')
    gpr_push, gpr_pop, zero_gpr = ('', '', '')

    def operand_type(operand):
        # lower-case register type, 'mem', 'imd' or '' for anything else
        if isinstance(operand, Register):
            return operand.reg_type.lower()
        if isinstance(operand, MemAddr):
            return 'mem'
        if isinstance(operand, Parameter) and str(operand) == 'IMD':
            return 'imd'
        return ''

    if len(operands) > 0:
        op_a = operand_type(operands[0])
        if op_a == 'gpr':
            gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
            op_a += str(operands[0].size)
    if len(operands) > 1:
        op_b = operand_type(operands[1])
        if op_b == 'gpr':
            op_b += str(operands[1].size)
            # the GPRs only need to be initialised once
            if 'gpr' not in op_a:
                gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
    if len(operands) == 3:
        op_c = operand_type(operands[2])
        if op_c == 'gpr':
            op_c += str(operands[2].size)
            if ('gpr' not in op_a) and ('gpr' not in op_b):
                gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
    if len(operands) == 1 and isinstance(operands[0], Register):
        copy = self.__copy_regs(operands[0])
    elif len(operands) > 1 and isinstance(operands[1], Register):
        copy = self.__copy_regs(operands[1])
    elif len(operands) > 2 and isinstance(operands[2], Register):
        # this branch is only reached when operands[1] is not a register, so the
        # third operand has to be used (operands[1] was passed here before)
        copy = self.__copy_regs(operands[2])
    else:
        copy = ''
    return op_a, op_b, op_c, gpr_push, gpr_pop, zero_gpr, copy
def __initialise_gprs(self):
"""
Initialize eleven general purpose registers and set them to zero.
Returns
-------
(str, str, str)
String tuple for push, pop and initalisation operations
"""
gpr_push = ''
gpr_pop = ''
zero_gpr = ''
for reg in self.gprs64:
gpr_push += '\t\tpush {}\n'.format(reg)
for reg in reversed(self.gprs64):
gpr_pop += '\t\tpop {}\n'.format(reg)
for reg in self.gprs64:
zero_gpr += '\t\txor {}, {}\n'.format(reg, reg)
return gpr_push, gpr_pop, zero_gpr
# Copy created values in specific register
def __copy_regs(self, reg):
"""
Copy created values in specific register.
Parameters
----------
reg : Register
Register for copying the value
Returns
-------
str
String containing the copy instructions
"""
copy = '\t\t# copy DP 1.0\n'
# Different handling for GPR, MMX and SSE/AVX registers
if reg.reg_type == 'GPR':
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][1])
copy += '\t\t# Create DP 2.0\n'
copy += '\t\tadd {}, {}\n'.format(self.ops['gpr64'][1], self.ops['gpr64'][0])
copy += '\t\t# Create DP 0.5\n'
copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0])
copy += '\t\tmovq {}, {}\n'.format(self.ops['gpr64'][2], self.ops['gpr64'][0])
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
elif reg.reg_type == 'MMX':
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][0])
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][1])
copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
copy += '\t\t# Create DP 2.0\n'
copy += '\t\tadd {}, {}\n'.format(self.ops['mmx'][1], self.ops['mmx'][0])
copy += '\t\t# Create DP 0.5\n'
copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0])
copy += '\t\tmovq {}, {}\n'.format(self.ops['mmx'][2], self.ops['gpr64'][0])
elif reg.reg_type == 'XMM' or reg.reg_type == 'YMM' or reg.reg_type == 'ZMM':
key = reg.reg_type.lower()
copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][0], self.ops[key][0])
copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][1], self.ops[key][0])
copy += '\t\t# Create DP 2.0\n'
copy += '\t\tvaddpd {}, {}, {}\n'.format(self.ops[key][1], self.ops[key][1],
self.ops[key][1])
copy += '\t\t# Create DP 0.5\n'
copy += '\t\tvdivpd {}, {}, {}\n'.format(self.ops[key][2], self.ops[key][0],
self.ops[key][1])
else:
copy = ''
return copy
def __define_header(self):
    """
    Assemble the fixed preamble of a generated benchmark file.

    Returns
    -------
    (str, str, str, str)
        String tuple containing the ``INSTR`` define, the ``NINST`` define,
        the value initialisations (data section, ``PI`` constants and the
        prologue of the ``latency`` function) and the register extension
        code, which is empty unless an operand is 'ymm' or 'zmm'
    """
    def any_operand_is(kind):
        # Same left-to-right check of the three operand slots as before.
        return self.op_a == kind or self.op_b == kind or self.op_c == kind

    instr_define = '#define INSTR ' + self.instr + '\n'
    count_define = '#define NINST ' + self.num_instr + '\n'

    # One 64-bit constant written as two 32-bit words, repeated eight times
    # (8 x 64 bit = 512 bit).
    constant_words = ', '.join(['0xf01b866e, 0x400921f9'] * 8)
    pi_block = 'PI:\n.long ' + constant_words + '\n'

    data_section = ''.join([
        '#define N edi\n',
        '#define i r8d\n\n\n',
        '.intel_syntax noprefix\n',
        '.globl ninst\n',
        '.data\n',
        'ninst:\n',
        '.long NINST\n',
        '.align 32\n',
    ])
    text_section = ''.join([
        '.text\n',
        '.globl latency\n',
        '.type latency, @function\n',
        '.align 32\n',
        'latency:\n',
        '\t\tpush rbp\n',
        '\t\tmov rbp, rsp\n',
        '\t\txor i, i\n',
        '\t\ttest N, N\n',
        '\t\tjle done\n',
    ])
    prologue = data_section + pi_block + text_section

    # Expand to AVX(512) if necessary; a zmm operand takes precedence over ymm.
    widen_to_avx = ('\t\t# expand from SSE to AVX\n'
                    '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n')
    widen_to_avx512 = ('\t\t# expand from AVX to AVX512\n'
                       '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n')
    needs_ymm = any_operand_is('ymm')
    needs_zmm = any_operand_is('zmm')
    if needs_zmm:
        extension = widen_to_avx + widen_to_avx512
    elif needs_ymm:
        extension = widen_to_avx
    else:
        extension = ''
    return instr_define, count_define, prologue, extension
def __define_loop_lat(self):
    """
    Create the latency loop.

    The loop body repeats ``INSTR`` with operands taken from ``self.ops``.
    With zero or one operand, ``num_instr`` lines are emitted. With two or
    three operands, two lines are emitted per step of two, so an odd
    ``num_instr`` yields one extra line. A three-operand instruction whose
    first operand type matches neither the second nor the third gets an
    empty loop body.

    Returns
    -------
    str
        Latency loop as string
    """
    def operand(kind, index):
        # Looked up lazily so untaken branches never touch self.ops.
        return self.ops[kind][index]

    unary = '\t\tINSTR {}\n'
    binary = '\t\tINSTR {}, {}\n'
    ternary = '\t\tINSTR {}, {}, {}\n'

    lines = ['loop:\n', '\t\tinc i\n']
    if self.num_operands == 0:
        for _ in range(0, int(self.num_instr)):
            lines.append('\t\tINSTR\n')
    if self.num_operands == 1:
        for _ in range(0, int(self.num_instr)):
            lines.append(unary.format(operand(self.op_a, 0)))
    elif self.num_operands == 2 and self.op_a == self.op_b:
        # Same operand type: alternate the two registers between the lines.
        for _ in range(0, int(self.num_instr), 2):
            lines.append(binary.format(operand(self.op_a, 0), operand(self.op_b, 1)))
            lines.append(binary.format(operand(self.op_b, 1), operand(self.op_b, 0)))
    elif self.num_operands == 2 and self.op_a != self.op_b:
        # Different operand types: the same line is emitted twice.
        for _ in range(0, int(self.num_instr), 2):
            lines.append(binary.format(operand(self.op_a, 0), operand(self.op_b, 0)))
            lines.append(binary.format(operand(self.op_a, 0), operand(self.op_b, 0)))
    elif self.num_operands == 3 and self.op_a == self.op_b:
        for _ in range(0, int(self.num_instr), 2):
            lines.append(ternary.format(operand(self.op_a, 0),
                                        operand(self.op_b, 1),
                                        operand(self.op_c, 0)))
            lines.append(ternary.format(operand(self.op_a, 1),
                                        operand(self.op_b, 0),
                                        operand(self.op_c, 0)))
    elif self.num_operands == 3 and self.op_a == self.op_c:
        for _ in range(0, int(self.num_instr), 2):
            lines.append(ternary.format(operand(self.op_a, 0),
                                        operand(self.op_b, 0),
                                        operand(self.op_c, 0)))
            lines.append(ternary.format(operand(self.op_a, 1),
                                        operand(self.op_b, 0),
                                        operand(self.op_c, 0)))
    lines.append('\t\tcmp i, N\n')
    lines.append('\t\tjl loop\n')
    return ''.join(lines)
def __define_loop_thrpt(self):
    """
    Create the throughput loop.

    Each of the ``num_instr`` lines writes to a different first operand:
    entries of ``self.ops[self.op_a]`` from index 3 upwards, used in
    rotation. The second and third operands, when present, rotate through
    the first three registers of their operand type.

    Returns
    -------
    str
        Throughput loop as string
    """
    with_second = self.num_operands in (2, 3)
    with_third = self.num_operands == 3

    pieces = ['loop:\n', '\t\tinc i\n']
    for step in range(0, int(self.num_instr)):
        if self.num_operands == 0:
            pieces.append('\t\tINSTR\n')
            continue
        trailing = ''
        if with_second:
            trailing = ', {}'.format(self.ops[self.op_b][step % 3])
        if with_third:
            trailing += ', {}'.format(self.ops[self.op_c][step % 3])
        first_regs = self.ops[self.op_a]
        # Skip the first three registers and cycle through the rest.
        first = first_regs[step % (len(first_regs) - 3) + 3]
        pieces.append('\t\tINSTR {}{}\n'.format(first, trailing))
    pieces.append('\t\tcmp i, N\n')
    pieces.append('\t\tjl loop\n')
    return ''.join(pieces)
def is_in_dir(self):
    """
    Report whether benchmark files for this instruction form already exist.

    Walks the ``benchmarks`` directory next to this module, including all
    sub-directories, and looks for ``<name>-tp.S`` and ``<name>.S``, where
    ``<name>`` is ``self.instr + self.extension``.

    Returns
    -------
    (bool, bool)
        The first value is True if the throughput testcase
        (``<name>-tp.S``) was found, the second is True if the latency
        testcase (``<name>.S``) was found
    """
    base = self.instr + self.extension
    throughput_file = base + '-tp.S'
    latency_file = base + '.S'
    has_throughput = False
    has_latency = False
    for _root, _dirs, filenames in os.walk(os.path.dirname(__file__)+'/benchmarks'):
        has_throughput = has_throughput or throughput_file in filenames
        has_latency = has_latency or latency_file in filenames
    return has_throughput, has_latency
def get_entryname(self):
    """
    Return the entry name of this instruction form.

    Returns
    -------
    str
        ``self.instr`` immediately followed by ``self.extension``
    """
    return self.instr + self.extension

13
osaca/utils.py Normal file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import os.path
def find_file(name):
    """
    Locate a data file, preferring the user's copy over the packaged one.

    The user folder ``~/.osaca/data`` is checked first, then the ``data``
    folder next to this module.

    :param name: file name (or relative path) to look for
    :returns: path of the first existing match
    :raises FileNotFoundError: if the file is in neither folder
    """
    user_folder = os.path.expanduser('~/.osaca/data')
    package_folder = os.path.join(os.path.dirname(__file__), 'data')
    search_paths = [user_folder, package_folder]
    for folder in search_paths:
        candidate = os.path.join(folder, name)
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, search_paths))

View File

@@ -1,5 +1,5 @@
[pep8]
max-line-length=100
max-line-length=99
[metadata]
license-file=LICENSE

View File

@@ -75,6 +75,7 @@ setup(
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
# What does your project relate to?
@@ -89,9 +90,10 @@ setup(
# requirements files see:
# https://packaging.python.org/en/latest/requirements.html
install_requires=[
'numpy',
'pandas',
'kerncraft',
'networkx',
'pyparsing',
'pygraphviz',
],
python_requires='>=3.5',

View File

@@ -1,14 +1,19 @@
#!/usr//bin/env python
#!/usr/bin/env python3
import sys
import unittest
sys.path[0:0] = ['.', '..']
suite = unittest.TestLoader().loadTestsFromNames(
[
'test_osaca'
'test_base_parser',
'test_parser_x86att',
'test_parser_AArch64v81',
'test_marker_utils',
'test_semantics',
'test_frontend',
'test_db_interface',
'test_kerncraftAPI',
]
)

76
tests/test_base_parser.py Executable file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Unit tests for base assembly parser
"""
import os
import unittest
from osaca.parser import AttrDict, BaseParser
class TestBaseParser(unittest.TestCase):
@classmethod
def setUpClass(self):
try:
self.parser = BaseParser()
except NotImplementedError:
pass
with open(self._find_file('triad-x86-iaca.s')) as f:
self.triad_code = f.read()
##################
# Test
##################
def test_parse_file(self):
with self.assertRaises(NotImplementedError):
self.parser.parse_file(self.triad_code)
def test_parse_line(self):
line_instruction = '\t\tlea 2(%rax,%rax), %ecx #12.9'
with self.assertRaises(NotImplementedError):
self.parser.parse_line(line_instruction)
def test_parse_instruction(self):
instr1 = '\t\tvcvtsi2ss %edx, %xmm2, %xmm2\t\t\t#12.27'
with self.assertRaises(NotImplementedError):
self.parser.parse_instruction(instr1)
def test_register_funcs(self):
reg_a1 = AttrDict({'name': 'rax'})
reg_a2 = AttrDict({'name': 'eax'})
register_string = 'v1.2d'
with self.assertRaises(NotImplementedError):
self.parser.is_reg_dependend_of(reg_a1, reg_a2)
with self.assertRaises(NotImplementedError):
self.parser.parse_register(register_string)
with self.assertRaises(NotImplementedError):
self.parser.is_gpr(reg_a1)
with self.assertRaises(NotImplementedError):
self.parser.is_vector_register(reg_a1)
with self.assertRaises(NotImplementedError):
self.parser.process_operand(reg_a1)
with self.assertRaises(NotImplementedError):
self.parser.get_full_reg_name(reg_a1)
def test_normalize_imd(self):
imd_hex_1 = {'value': '0x4f'}
with self.assertRaises(NotImplementedError):
self.parser.normalize_imd(imd_hex_1)
##################
# Helper functions
##################
@staticmethod
def _find_file(name):
testdir = os.path.dirname(__file__)
name = os.path.join(testdir, 'test_files', name)
assert os.path.exists(name)
return name
if __name__ == '__main__':
    # Run only this module's test case, with verbose per-test output.
    loader = unittest.TestLoader()
    base_parser_tests = loader.loadTestsFromTestCase(TestBaseParser)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(base_parser_tests)

86
tests/test_db_interface.py Executable file
View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Unit tests for DB interface
"""
import unittest
from osaca.db_interface import sanity_check
from osaca.semantics import MachineModel
class TestDBInterface(unittest.TestCase):
@classmethod
def setUpClass(self):
sample_entry = {
'name': 'DoItRightAndDoItFast',
'operands': [
{'class': 'memory', 'offset': 'imd', 'base': 'gpr', 'index': 'gpr', 'scale': 8},
{'class': 'register', 'name': 'xmm'},
],
'throughput': 1.25,
'latency': 125,
'uops': 6,
}
self.entry_csx = sample_entry.copy()
self.entry_tx2 = sample_entry.copy()
self.entry_zen1 = sample_entry.copy()
# self.entry_csx['port_pressure'] = [1.25, 0, 1.25, 0.5, 0.5, 0.5, 0.5, 0, 1.25, 1.25, 0]
self.entry_csx['port_pressure'] = [[5, '0156'], [1, '23'], [1, ['2D', '3D']]]
# self.entry_tx2['port_pressure'] = [2.5, 2.5, 0, 0, 0.5, 0.5]
self.entry_tx2['port_pressure'] = [[5, '01'], [1, '45']]
del self.entry_tx2['operands'][1]['name']
self.entry_tx2['operands'][1]['prefix'] = 'x'
# self.entry_zen1['port_pressure'] = [1, 1, 1, 1, 0, 1, 0, 0, 0, 0.5, 1, 0.5, 1]
self.entry_zen1['port_pressure'] = [[4, '0123'], [1, '4'], [1, '89'], [2, ['8D', '9D']]]
###########
# Tests
###########
def test_add_single_entry(self):
mm_csx = MachineModel('csx')
mm_tx2 = MachineModel('tx2')
mm_zen1 = MachineModel('zen1')
num_entries_csx = len(mm_csx['instruction_forms'])
num_entries_tx2 = len(mm_tx2['instruction_forms'])
num_entries_zen1 = len(mm_zen1['instruction_forms'])
mm_csx.set_instruction_entry(self.entry_csx)
mm_tx2.set_instruction_entry(self.entry_tx2)
mm_zen1.set_instruction_entry({'name': 'empty_operation'})
num_entries_csx = len(mm_csx['instruction_forms']) - num_entries_csx
num_entries_tx2 = len(mm_tx2['instruction_forms']) - num_entries_tx2
num_entries_zen1 = len(mm_zen1['instruction_forms']) - num_entries_zen1
self.assertEqual(num_entries_csx, 1)
self.assertEqual(num_entries_tx2, 1)
self.assertEqual(num_entries_zen1, 1)
def test_invalid_add(self):
entry = {}
with self.assertRaises(KeyError):
MachineModel('csx').set_instruction_entry(entry)
with self.assertRaises(TypeError):
MachineModel('csx').set_instruction()
def test_sanity_check(self):
# non-verbose
sanity_check('csx', verbose=False)
sanity_check('tx2', verbose=False)
sanity_check('zen1', verbose=False)
# verbose
sanity_check('csx', verbose=True)
sanity_check('tx2', verbose=True)
sanity_check('zen1', verbose=True)
##################
# Helper functions
##################
if __name__ == '__main__':
    # Run only this module's test case, with verbose per-test output.
    loader = unittest.TestLoader()
    db_interface_tests = loader.loadTestsFromTestCase(TestDBInterface)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(db_interface_tests)

View File

@@ -0,0 +1,539 @@
osaca_version: 0.3.1
micro_architecture: AMD Zen (family 17h)
arch_code: ZEN1
isa: x86
load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0}
load_throughput_multiplier: {gpr: 1.0, xmm: 1.0, ymm: 2.0}
load_throughput:
- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
hidden_loads: true
ports: ['0', '1', '2', '3', 3DV, '4', '5', '6', '7', '8', '9', 8D, 9D, ST]
port_model_scheme: |
┌--------------------------------------┐ ┌-----------------------------------------------┐
| 96 entries OoO scheduler | | 84 entries OoO scheduler |
└--------------------------------------┘ └-----------------------------------------------┘
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ ┌-----┐ ┌-----┐
|SSE ALU| |SSE ALU| |SSE ALU| |SSE ALU| | ALU | | ALU | | ALU | | ALU | | AGU | | AGU |
└-------┘ └-------┘ └-------┘ └-------┘ └------┘ └-----┘ └-----┘ └------┘ └-----┘ └-----┘
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ | |
|SSE MUL| |SSE MUL| |SSE ADD| |SSE ADD| |BRANCH| | MUL | | MUL | |BRANCH| ▼ ▼
└-------┘ └-------┘ └-------┘ └-------┘ └------┘ └-----┘ └-----┘ └------┘ ┌-------------┐
┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐ | LOAD |
|SSE FMA| |SSE FMA| | SSE | |SSE DIV| └-------------┘
└-------┘ └-------┘ | SHUF | └-------┘ ┌-------------┐
┌-------┐ └-------┘ | LOAD |
| SSE | └-------------┘
| SHUF | ┌-------------┐
└-------┘ | STORE |
└-------------┘
instruction_forms:
- name: add
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: add
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: 1 # 1*p4567
port_pressure: [[1, '4567']]
- name: addl
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: addq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: cmpl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p4567
port_pressure: [[1, '4567']]
- name: cmpq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: ~ # 1*p4567
port_pressure: [[1, '4567']]
- name: incq
operands:
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: ja
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: jb
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: jne
operands:
- class: identifier
throughput: 0.0
latency: ~
port_pressure: []
- name: leaq
operands:
- class: memory
base: gpr
offset: imd
index: ~
scale: 1
- class: register
name: gpr
throughput: 0.5
latency: ~ # 1*p89
port_pressure: [[1, '89']]
- name: movl
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.0
latency: 0.0
port_pressure: []
- name: mulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: mulss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p01
port_pressure: [[1, '01']]
- name: rcpss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #1.0
latency: 5.0
port_pressure: []
- name: sqrtsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #8.0
latency: 23.0
port_pressure: []
- name: sqrtss
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: ~ #5.0
latency: 17.0
port_pressure: []
- name: subq
operands:
- class: register
name: gpr
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: subq
operands:
- class: immediate
imd: int
- class: register
name: gpr
throughput: 0.25
latency: 1.0 # 1*p4567
port_pressure: [[1, '4567']]
- name: vaddpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 3.0 # 2*p23
port_pressure: [[2, '23']]
- name: vaddsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p23
port_pressure: [[1, '23']]
- name: vaddss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p23
port_pressure: [[1, '23']]
- name: vdivsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 4.0
latency: 13.0 # 1*p3+4*p3DV
port_pressure: [[1, '3'], [4.0, [3DV]]]
- name: vdivss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 3.0
latency: 10.0
port_pressure: [[1, '3'], [3.0, [3DV]]]
- name: vfmadd213pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vfmadd231pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vfmadd132pd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vmulsd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulss
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 3.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulpd
operands:
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01+1*p89+1*p8D9D
port_pressure: [[1, '01'], [1, '89'], [1, [8D, 9D]]]
- name: vmulpd
operands:
- class: register
name: xmm
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p01
port_pressure: [[1, '01']]
- name: vmulpd
operands:
- class: register
name: ymm
- class: register
name: ymm
- class: register
name: ymm
throughput: 1.0
latency: 4.0 # 2*p01
port_pressure: [[2, '01']]
- name: vmovapd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovapd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovapd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovaps
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovaps
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 2.0
latency: 3.0 # 2*p89+2*pST
port_pressure: [[2, '89'], [2, [ST]]]
- name: vmovupd
operands:
- class: register
name: ymm
- class: register
name: ymm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
- class: register
name: xmm
throughput: 0.5
latency: 4.0 # 1*p89+1*p8D9D
port_pressure: [[1, '89'], [1, [8D, 9D]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: register
name: xmm
throughput: 0.0
latency: 0.0
port_pressure: []
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: ~
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: ~
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: ~
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]
- name: vmovsd
operands:
- class: register
name: xmm
- class: memory
base: gpr
offset: imd
index: gpr
scale: 1
throughput: 1.0
latency: 4.0 # 1*p89+1*pST
port_pressure: [[1, '89'], [1, [ST]]]

View File

@@ -0,0 +1,27 @@
// mov x1, #111
// .byte 213,3,32,31
.LBB0_32:
ldp q4, q5, [x9, #-32]
ldp q6, q7, [x9], #64
ldp q16, q17, [x11, #-32]!
ldp q18, q19, [x11], #64
fmul v4.2d, v4.2d, v16.2d
fmul v5.2d, v5.2d, v17.2d
fmul v6.2d, v6.2d, v18.2d
fmul v7.2d, v7.2d, v19.2d
ldp q0, q1, [x8, #-32]
ldp q2, q3, [x8], #64
fadd v0.2d, v0.2d, v4.2d
fadd v1.2d, v1.2d, v5.2d
stp q0, q1, [x10, #-32]
fadd v2.2d, v2.2d, v6.2d
fadd v3.2d, v3.2d, v7.2d
stp q2, q3, [x10]
add x10, x10, #64 // =64
adds x12, x12, #1 // =1
fmov s0, -1.0e+0
fmov s1, #2.0e+2f
prfm pldl1keep, [x26, #2112]
b.ne .LBB0_32
// mov x1, #222
// .byte 213,3,32,31

View File

@@ -0,0 +1,13 @@
#movl $111,%ebx
#.byte 100,103,144
.L10:
vmovapd (%r15,%rax), %ymm0
vmovapd (%r12,%rax), %ymm3
addl $1, %ecx
vfmadd132pd 0(%r13,%rax), %ymm3, %ymm0
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %r10d
ja .L10
#movl $222,%ebx
#.byte 100,103,144

View File

@@ -0,0 +1,645 @@
.text
.file "triad.c"
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 // -- Begin function triad
.LCPI0_0:
.xword 4596373779694328218 // double 0.20000000000000001
.LCPI0_1:
.xword 4652007308841189376 // double 1000
.LCPI0_2:
.xword 4517329193108106637 // double 9.9999999999999995E-7
.LCPI0_3:
.xword 4629700416936869888 // double 32
.LCPI0_4:
.xword 4562146422526312448 // double 9.765625E-4
.text
.globl triad
.p2align 6
.type triad,@function
triad: // @triad
.cfi_startproc
// %bb.0:
sub sp, sp, #224 // =224
str d8, [sp, #112] // 8-byte Folded Spill
stp x28, x27, [sp, #128] // 16-byte Folded Spill
stp x26, x25, [sp, #144] // 16-byte Folded Spill
stp x24, x23, [sp, #160] // 16-byte Folded Spill
stp x22, x21, [sp, #176] // 16-byte Folded Spill
stp x20, x19, [sp, #192] // 16-byte Folded Spill
stp x29, x30, [sp, #208] // 16-byte Folded Spill
add x29, sp, #208 // =208
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_offset w19, -24
.cfi_offset w20, -32
.cfi_offset w21, -40
.cfi_offset w22, -48
.cfi_offset w23, -56
.cfi_offset w24, -64
.cfi_offset w25, -72
.cfi_offset w26, -80
.cfi_offset w27, -88
.cfi_offset w28, -96
.cfi_offset b8, -112
mov w19, w0
orr w0, wzr, #0x40
sbfiz x23, x19, #3, #32
mov x1, x23
bl aligned_alloc
mov x20, x0
orr w0, wzr, #0x40
mov x1, x23
bl aligned_alloc
str x0, [sp, #88] // 8-byte Folded Spill
orr w0, wzr, #0x40
mov x1, x23
bl aligned_alloc
mov x22, x0
orr w0, wzr, #0x40
mov x1, x23
bl aligned_alloc
mov x23, x0
cmp w19, #0 // =0
b.le .LBB0_3
// %bb.1:
mov w24, w19
cmp w19, #7 // =7
b.hi .LBB0_9
// %bb.2:
mov x8, xzr
b .LBB0_17
.LBB0_3:
adrp x8, .LCPI0_0
orr w25, wzr, #0x1
ldr d8, [x8, :lo12:.LCPI0_0]
.p2align 6
.LBB0_4: // =>This Loop Header: Depth=1
// Child Loop BB0_5 Depth 2
sub x0, x29, #88 // =88
add x1, sp, #96 // =96
bl timing
mov w21, w25
cbz w25, .LBB0_8
.p2align 6
.LBB0_5: // Parent Loop BB0_4 Depth=1
// => This Inner Loop Header: Depth=2
ldr d0, [x20]
fcmp d0, #0.0
b.le .LBB0_7
// %bb.6: // in Loop: Header=BB0_5 Depth=2
mov x0, x20
bl dummy
.LBB0_7: // in Loop: Header=BB0_5 Depth=2
subs w21, w21, #1 // =1
b.ne .LBB0_5
.LBB0_8: // in Loop: Header=BB0_4 Depth=1
add x0, sp, #104 // =104
add x1, sp, #96 // =96
bl timing
ldr d0, [sp, #104]
ldur d1, [x29, #-88]
fsub d1, d0, d1
lsl w25, w25, #1
fcmp d1, d8
b.mi .LBB0_4
b .LBB0_38
.LBB0_9:
and x8, x24, #0xfffffff8
sub x10, x8, #8 // =8
lsr x11, x10, #3
add w9, w11, #1 // =1
and x9, x9, #0x3
cmp x10, #24 // =24
b.hs .LBB0_11
// %bb.10:
orr w13, wzr, #0x20
cbnz x9, .LBB0_14
b .LBB0_16
.LBB0_11:
mov x16, #28286
movk x16, #29109, lsl #16
ldr x15, [sp, #88] // 8-byte Folded Reload
movk x16, #34426, lsl #32
movk x16, #16000, lsl #48
dup v0.2d, x16
mvn x11, x11
mov x10, xzr
add x11, x9, x11
add x12, x23, #128 // =128
add x13, x20, #128 // =128
add x14, x22, #128 // =128
add x15, x15, #128 // =128
.p2align 6
.LBB0_12: // =>This Inner Loop Header: Depth=1
stp q0, q0, [x12]
stp q0, q0, [x12, #-128]
stp q0, q0, [x12, #32]
stp q0, q0, [x12, #-96]
stp q0, q0, [x14]
add x10, x10, #32 // =32
stp q0, q0, [x14, #-128]
stp q0, q0, [x14, #32]
stp q0, q0, [x14, #-96]
stp q0, q0, [x15]
stp q0, q0, [x15, #-128]
stp q0, q0, [x15, #32]
stp q0, q0, [x15, #-96]
stp q0, q0, [x13]
stp q0, q0, [x13, #-128]
stp q0, q0, [x13, #32]
stp q0, q0, [x13, #-96]
stp q0, q0, [x12, #64]
stp q0, q0, [x12, #-64]
stp q0, q0, [x12, #96]
stp q0, q0, [x12, #-32]
add x12, x12, #256 // =256
stp q0, q0, [x14, #64]
stp q0, q0, [x14, #-64]
stp q0, q0, [x14, #96]
stp q0, q0, [x14, #-32]
add x14, x14, #256 // =256
stp q0, q0, [x15, #64]
stp q0, q0, [x15, #-64]
stp q0, q0, [x15, #96]
stp q0, q0, [x15, #-32]
add x15, x15, #256 // =256
stp q0, q0, [x13, #64]
stp q0, q0, [x13, #-64]
stp q0, q0, [x13, #96]
stp q0, q0, [x13, #-32]
add x13, x13, #256 // =256
adds x11, x11, #4 // =4
b.ne .LBB0_12
// %bb.13:
lsl x10, x10, #3
orr x13, x10, #0x20
cbz x9, .LBB0_16
.LBB0_14:
ldr x14, [sp, #88] // 8-byte Folded Reload
add x10, x23, x13
add x11, x22, x13
add x12, x20, x13
add x13, x14, x13
mov x14, #28286
movk x14, #29109, lsl #16
movk x14, #34426, lsl #32
movk x14, #16000, lsl #48
dup v0.2d, x14
neg x9, x9
.p2align 6
.LBB0_15: // =>This Inner Loop Header: Depth=1
stp q0, q0, [x10]
stp q0, q0, [x11]
stp q0, q0, [x10, #-32]
stp q0, q0, [x13]
stp q0, q0, [x11, #-32]
add x10, x10, #64 // =64
stp q0, q0, [x12]
stp q0, q0, [x13, #-32]
add x11, x11, #64 // =64
stp q0, q0, [x12, #-32]
add x12, x12, #64 // =64
add x13, x13, #64 // =64
adds x9, x9, #1 // =1
b.ne .LBB0_15
.LBB0_16:
cmp x8, x24
b.eq .LBB0_19
.LBB0_17:
ldr x10, [sp, #88] // 8-byte Folded Reload
mov x13, #28286
movk x13, #29109, lsl #16
lsl x12, x8, #3
movk x13, #34426, lsl #32
add x9, x20, x12
movk x13, #16000, lsl #48
add x10, x10, x12
add x11, x22, x12
add x12, x23, x12
sub x8, x24, x8
.p2align 6
.LBB0_18: // =>This Inner Loop Header: Depth=1
str x13, [x12], #8
str x13, [x11], #8
str x13, [x10], #8
str x13, [x9], #8
subs x8, x8, #1 // =1
b.ne .LBB0_18
.LBB0_19:
ldr x10, [sp, #88] // 8-byte Folded Reload
add x8, x20, #256 // =256
and x26, x24, #0xfffffff8
str x8, [sp, #40] // 8-byte Folded Spill
add x8, x23, #256 // =256
sub x27, x26, #8 // =8
str x8, [sp, #32] // 8-byte Folded Spill
add x8, x22, #256 // =256
orr w25, wzr, #0x1
str x8, [sp, #24] // 8-byte Folded Spill
add x8, x10, #256 // =256
str x8, [sp, #16] // 8-byte Folded Spill
lsr x8, x27, #3
add w9, w8, #1 // =1
mvn x8, x8
and x28, x9, #0x7
add x8, x28, x8
str x8, [sp, #8] // 8-byte Folded Spill
neg x8, x28
str x8, [sp, #80] // 8-byte Folded Spill
add x8, x10, #32 // =32
str x8, [sp, #72] // 8-byte Folded Spill
add x8, x22, #32 // =32
str x8, [sp, #64] // 8-byte Folded Spill
add x8, x20, #32 // =32
str x8, [sp, #56] // 8-byte Folded Spill
add x8, x23, #32 // =32
str x8, [sp, #48] // 8-byte Folded Spill
adrp x8, .LCPI0_0
ldr d8, [x8, :lo12:.LCPI0_0]
.p2align 6
.LBB0_20: // =>This Loop Header: Depth=1
// Child Loop BB0_22 Depth 2
// Child Loop BB0_29 Depth 3
// Child Loop BB0_32 Depth 3
// Child Loop BB0_35 Depth 3
sub x0, x29, #88 // =88
add x1, sp, #96 // =96
bl timing
cbz w25, .LBB0_37
// %bb.21: // in Loop: Header=BB0_20 Depth=1
mov w21, wzr
.p2align 6
.LBB0_22: // Parent Loop BB0_20 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_29 Depth 3
// Child Loop BB0_32 Depth 3
// Child Loop BB0_35 Depth 3
ldr d0, [x20]
fcmp d0, #0.0
b.le .LBB0_24
// %bb.23: // in Loop: Header=BB0_22 Depth=2
mov x0, x20
bl dummy
.LBB0_24: // in Loop: Header=BB0_22 Depth=2
cmp w19, #7 // =7
b.hi .LBB0_26
// %bb.25: // in Loop: Header=BB0_22 Depth=2
mov x12, xzr
b .LBB0_34
.p2align 6
.LBB0_26: // in Loop: Header=BB0_22 Depth=2
cmp x27, #56 // =56
b.hs .LBB0_28
// %bb.27: // in Loop: Header=BB0_22 Depth=2
mov x8, xzr
cbnz x28, .LBB0_31
b .LBB0_33
.p2align 6
.LBB0_28: // in Loop: Header=BB0_22 Depth=2
ldp x9, x10, [sp, #16] // 8-byte Folded Reload
ldp x11, x12, [sp, #32] // 8-byte Folded Reload
ldr x13, [sp, #8] // 8-byte Folded Reload
mov x8, xzr
.p2align 6
mov x1, #111 // OSACA START
.byte 213,3,32,31 // OSACA START
.LBB0_29: // Parent Loop BB0_20 Depth=1
// Parent Loop BB0_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q2, q5, [x10, #-256]
ldp q6, q7, [x10, #-224]
ldp q16, q17, [x11, #-256]
ldp q18, q19, [x11, #-224]
fmul v2.2d, v2.2d, v16.2d
fmul v5.2d, v5.2d, v17.2d
fmul v6.2d, v6.2d, v18.2d
ldp q0, q1, [x9, #-256]
ldp q3, q4, [x9, #-224]
fmul v7.2d, v7.2d, v19.2d
fadd v0.2d, v0.2d, v2.2d
fadd v2.2d, v1.2d, v5.2d
stp q0, q2, [x12, #-256]
fadd v1.2d, v3.2d, v6.2d
ldp q6, q17, [x10, #-192]
ldp q18, q19, [x10, #-160]
ldp q20, q21, [x11, #-192]
ldp q22, q23, [x11, #-160]
fmul v6.2d, v6.2d, v20.2d
fmul v17.2d, v17.2d, v21.2d
fmul v18.2d, v18.2d, v22.2d
fadd v3.2d, v4.2d, v7.2d
stp q1, q3, [x12, #-224]
ldp q4, q5, [x9, #-192]
ldp q7, q16, [x9, #-160]
fmul v19.2d, v19.2d, v23.2d
fadd v4.2d, v4.2d, v6.2d
fadd v6.2d, v5.2d, v17.2d
stp q4, q6, [x12, #-192]
fadd v5.2d, v7.2d, v18.2d
ldp q18, q21, [x10, #-128]
ldp q22, q23, [x10, #-96]
ldp q24, q25, [x11, #-128]
ldp q26, q27, [x11, #-96]
fmul v18.2d, v18.2d, v24.2d
fmul v21.2d, v21.2d, v25.2d
fmul v22.2d, v22.2d, v26.2d
fadd v7.2d, v16.2d, v19.2d
stp q5, q7, [x12, #-160]
ldp q16, q17, [x9, #-128]
ldp q19, q20, [x9, #-96]
fadd v16.2d, v16.2d, v18.2d
fadd v18.2d, v17.2d, v21.2d
stp q16, q18, [x12, #-128]
fadd v17.2d, v19.2d, v22.2d
ldp q22, q25, [x10, #-64]
ldp q28, q29, [x11, #-64]
fmul v23.2d, v23.2d, v27.2d
ldp q26, q27, [x10, #-32]
fmul v22.2d, v22.2d, v28.2d
fmul v25.2d, v25.2d, v29.2d
ldp q28, q29, [x11, #-32]
fmul v26.2d, v26.2d, v28.2d
fmul v27.2d, v27.2d, v29.2d
fadd v19.2d, v20.2d, v23.2d
stp q17, q19, [x12, #-96]
ldp q20, q21, [x9, #-64]
ldp q23, q24, [x9, #-32]
fadd v20.2d, v20.2d, v22.2d
fadd v22.2d, v21.2d, v25.2d
stp q20, q22, [x12, #-64]
fadd v21.2d, v23.2d, v26.2d
fadd v23.2d, v24.2d, v27.2d
stp q21, q23, [x12, #-32]
ldp q24, q25, [x10]
ldp q28, q29, [x11]
ldp q26, q27, [x10, #32]
fmul v24.2d, v24.2d, v28.2d
fmul v25.2d, v25.2d, v29.2d
ldp q28, q29, [x11, #32]
fmul v26.2d, v26.2d, v28.2d
fmul v27.2d, v27.2d, v29.2d
ldp q28, q29, [x9]
fadd v24.2d, v28.2d, v24.2d
fadd v25.2d, v29.2d, v25.2d
stp q24, q25, [x12]
ldp q28, q29, [x9, #32]
fadd v26.2d, v28.2d, v26.2d
fadd v27.2d, v29.2d, v27.2d
stp q26, q27, [x12, #32]
ldp q24, q25, [x10, #64]
ldp q28, q29, [x11, #64]
ldp q26, q27, [x10, #96]
fmul v24.2d, v24.2d, v28.2d
fmul v25.2d, v25.2d, v29.2d
ldp q28, q29, [x11, #96]
fmul v26.2d, v26.2d, v28.2d
fmul v27.2d, v27.2d, v29.2d
ldp q28, q29, [x9, #64]
fadd v24.2d, v28.2d, v24.2d
fadd v25.2d, v29.2d, v25.2d
stp q24, q25, [x12, #64]
ldp q28, q29, [x9, #96]
fadd v26.2d, v28.2d, v26.2d
fadd v27.2d, v29.2d, v27.2d
stp q26, q27, [x12, #96]
ldp q24, q25, [x10, #128]
ldp q28, q29, [x11, #128]
ldp q26, q27, [x10, #160]
fmul v24.2d, v24.2d, v28.2d
fmul v25.2d, v25.2d, v29.2d
ldp q28, q29, [x11, #160]
fmul v26.2d, v26.2d, v28.2d
fmul v27.2d, v27.2d, v29.2d
ldp q28, q29, [x9, #128]
fadd v24.2d, v28.2d, v24.2d
fadd v25.2d, v29.2d, v25.2d
stp q24, q25, [x12, #128]
ldp q28, q29, [x9, #160]
fadd v26.2d, v28.2d, v26.2d
fadd v27.2d, v29.2d, v27.2d
stp q26, q27, [x12, #160]
ldp q24, q25, [x10, #192]
ldp q26, q27, [x11, #192]
fmul v24.2d, v24.2d, v26.2d
ldp q26, q28, [x10, #224]
fmul v25.2d, v25.2d, v27.2d
ldp q27, q0, [x11, #224]
fmul v2.2d, v26.2d, v27.2d
fmul v0.2d, v28.2d, v0.2d
ldp q1, q3, [x9, #192]
ldp q4, q5, [x9, #224]
fadd v1.2d, v1.2d, v24.2d
fadd v3.2d, v3.2d, v25.2d
stp q1, q3, [x12, #192]
fadd v2.2d, v4.2d, v2.2d
fadd v0.2d, v5.2d, v0.2d
stp q2, q0, [x12, #224]
add x8, x8, #64 // =64
add x12, x12, #512 // =512
add x11, x11, #512 // =512
add x10, x10, #512 // =512
add x9, x9, #512 // =512
adds x13, x13, #8 // =8
b.ne .LBB0_29
mov x1, #222 // OSACA END
.byte 213,3,32,31 // OSACA END
// %bb.30: // in Loop: Header=BB0_22 Depth=2
cbz x28, .LBB0_33
.LBB0_31: // in Loop: Header=BB0_22 Depth=2
lsl x11, x8, #3
ldp x9, x8, [sp, #64] // 8-byte Folded Reload
ldp x12, x10, [sp, #48] // 8-byte Folded Reload
add x8, x8, x11
add x9, x9, x11
add x10, x10, x11
add x11, x12, x11
ldr x12, [sp, #80] // 8-byte Folded Reload
.p2align 6
.LBB0_32: // Parent Loop BB0_20 Depth=1
// Parent Loop BB0_22 Depth=2
// => This Inner Loop Header: Depth=3
ldp q4, q5, [x9, #-32]
ldp q6, q7, [x9], #64
ldp q16, q17, [x11, #-32]
ldp q18, q19, [x11], #64
fmul v4.2d, v4.2d, v16.2d
fmul v5.2d, v5.2d, v17.2d
fmul v6.2d, v6.2d, v18.2d
fmul v7.2d, v7.2d, v19.2d
ldp q0, q1, [x8, #-32]
ldp q2, q3, [x8], #64
fadd v0.2d, v0.2d, v4.2d
fadd v1.2d, v1.2d, v5.2d
stp q0, q1, [x10, #-32]
fadd v2.2d, v2.2d, v6.2d
fadd v3.2d, v3.2d, v7.2d
stp q2, q3, [x10]
add x10, x10, #64 // =64
adds x12, x12, #1 // =1
b.ne .LBB0_32
.LBB0_33: // in Loop: Header=BB0_22 Depth=2
mov x12, x26
cmp x26, x24
b.eq .LBB0_36
.LBB0_34: // in Loop: Header=BB0_22 Depth=2
ldr x8, [sp, #88] // 8-byte Folded Reload
lsl x11, x12, #3
sub x12, x24, x12
add x8, x8, x11
add x9, x22, x11
add x10, x23, x11
add x11, x20, x11
.p2align 6
.LBB0_35: // Parent Loop BB0_20 Depth=1
// Parent Loop BB0_22 Depth=2
// => This Inner Loop Header: Depth=3
ldr d0, [x8], #8
ldr d1, [x9], #8
ldr d2, [x10], #8
fmul d1, d1, d2
fadd d0, d0, d1
str d0, [x11], #8
subs x12, x12, #1 // =1
b.ne .LBB0_35
.LBB0_36: // in Loop: Header=BB0_22 Depth=2
add w21, w21, #1 // =1
cmp w21, w25
b.ne .LBB0_22
.LBB0_37: // in Loop: Header=BB0_20 Depth=1
add x0, sp, #104 // =104
add x1, sp, #96 // =96
bl timing
ldr d0, [sp, #104]
ldur d1, [x29, #-88]
fsub d1, d0, d1
lsl w25, w25, #1
fcmp d1, d8
b.mi .LBB0_20
.LBB0_38:
scvtf d4, w19
lsr w1, w25, #1
adrp x8, .LCPI0_1
scvtf d6, w1
fadd d2, d4, d4
ldr d5, [x8, :lo12:.LCPI0_1]
adrp x8, .LCPI0_2
fmov d0, #8.00000000
fmul d2, d2, d6
ldr d3, [x8, :lo12:.LCPI0_2]
adrp x8, .LCPI0_3
adrp x0, .L.str
fmul d2, d2, d3
ldr d3, [x8, :lo12:.LCPI0_3]
adrp x8, .LCPI0_4
add x0, x0, :lo12:.L.str
fmul d3, d6, d3
fmul d0, d4, d0
fmul d3, d3, d4
fmul d4, d4, d6
fdiv d3, d3, d1
fdiv d4, d4, d1
fdiv d4, d4, d5
fdiv d0, d0, d5
fdiv d2, d2, d1
ldr d7, [x8, :lo12:.LCPI0_4]
fmul d3, d3, d7
fdiv d4, d4, d5
fmul d3, d3, d7
mov w2, w19
bl printf
mov x0, x20
bl free
ldr x0, [sp, #88] // 8-byte Folded Reload
bl free
mov x0, x22
bl free
mov x0, x23
bl free
ldp x29, x30, [sp, #208] // 16-byte Folded Reload
ldp x20, x19, [sp, #192] // 16-byte Folded Reload
ldp x22, x21, [sp, #176] // 16-byte Folded Reload
ldp x24, x23, [sp, #160] // 16-byte Folded Reload
ldp x26, x25, [sp, #144] // 16-byte Folded Reload
ldp x28, x27, [sp, #128] // 16-byte Folded Reload
ldr d8, [sp, #112] // 8-byte Folded Reload
add sp, sp, #224 // =224
ret
.Lfunc_end0:
.size triad, .Lfunc_end0-triad
.cfi_endproc
// -- End function
.globl main // -- Begin function main
.p2align 6 // align the function entry to 2^6 = 64 bytes
.type main,@function
main: // @main
.cfi_startproc
// %bb.0:
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp // frame pointer = current stack pointer
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
adrp x0, .Lstr // x0 = page address of .Lstr
add x0, x0, :lo12:.Lstr // x0 = full address of .Lstr
bl puts // print the string at .Lstr
adrp x0, .Lstr.3 // x0 = page address of .Lstr.3
add x0, x0, :lo12:.Lstr.3 // x0 = full address of .Lstr.3
bl puts // print the string at .Lstr.3
mov w0, #190 // sixteen triad calls follow, each with a larger immediate in w0
bl triad // call triad with w0 = 190
mov w0, #247
bl triad // call triad with w0 = 247
mov w0, #321
bl triad // call triad with w0 = 321
mov w0, #417
bl triad // call triad with w0 = 417
mov w0, #542
bl triad // call triad with w0 = 542
mov w0, #705
bl triad // call triad with w0 = 705
mov w0, #917
bl triad // call triad with w0 = 917
mov w0, #1192
bl triad // call triad with w0 = 1192
mov w0, #1550
bl triad // call triad with w0 = 1550
mov w0, #2015
bl triad // call triad with w0 = 2015
mov w0, #2619
bl triad // call triad with w0 = 2619
mov w0, #3405
bl triad // call triad with w0 = 3405
mov w0, #4427
bl triad // call triad with w0 = 4427
mov w0, #5756
bl triad // call triad with w0 = 5756
mov w0, #7482
bl triad // call triad with w0 = 7482
mov w0, #9727
bl triad // call triad with w0 = 9727
mov w0, wzr // w0 = 0 before returning
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.Lfunc_end1:
.size main, .Lfunc_end1-main
.cfi_endproc
.type .L.str,@object // @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str: // address is loaded into x0 right before the printf call in triad
.asciz "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
.size .L.str, 53 // object size in bytes
.type .Lstr,@object // @str
.section .rodata.str1.16,"aMS",@progbits,1
.p2align 4 // 2^4 = 16-byte alignment
.Lstr: // first string main passes to puts
.asciz "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
.size .Lstr, 51 // object size in bytes
.type .Lstr.3,@object // @str.3
.p2align 4 // 2^4 = 16-byte alignment
.Lstr.3: // second string main passes to puts
.asciz "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
.size .Lstr.3, 74 // object size in bytes
.ident "Arm C/C++/Fortran Compiler version 19.0 (build number 69) (based on LLVM 7.0.2)"
.section ".note.GNU-stack","",@progbits
.addrsig

View File

@@ -0,0 +1,353 @@
.file "triad.c"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC9: # format string loaded into %edi before the printf call in triad
.string "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
.text
.p2align 4,,15
.globl triad
.type triad, @function
triad: # the value arriving in %edi is used as the element count N below
.LFB24:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
movslq %edi, %rax # rax = sign-extended N
movl $64, %edi # edi = 64, first argument of the aligned_alloc call below
leaq 16(%rsp), %r13
.cfi_def_cfa 13, 0
andq $-32, %rsp # round the stack pointer down to a multiple of 32
pushq -8(%r13)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r15
.cfi_escape 0x10,0xf,0x2,0x76,0x78
leaq 0(,%rax,8), %r15 # r15 = N * 8
pushq %r14
movq %r15, %rsi # rsi = N * 8, second argument of aligned_alloc
pushq %r13
.cfi_escape 0xf,0x3,0x76,0x68,0x6
.cfi_escape 0x10,0xe,0x2,0x76,0x70
pushq %r12
pushq %rbx
.cfi_escape 0x10,0xc,0x2,0x76,0x60
.cfi_escape 0x10,0x3,0x2,0x76,0x58
movq %rax, %rbx # rbx = N for the rest of the function
subq $72, %rsp
call aligned_alloc # first of four allocations of N * 8 bytes
movq %r15, %rsi
movl $64, %edi
movq %rax, %r14 # r14 = first buffer (the one the kernel stores into)
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r12 # r12 = second buffer
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r13 # r13 = third buffer
call aligned_alloc
movq %rax, %r15 # r15 = fourth buffer (replaces the byte count)
leal -1(%rbx), %eax
movl %eax, -96(%rbp) # spill N - 1
testl %ebx, %ebx
jle .L2 # N <= 0: skip the initialization
cmpl $2, %eax
jbe .L14 # N - 1 <= 2 (unsigned): scalar initialization only
movl %ebx, %esi
vmovapd .LC0(%rip), %ymm0 # ymm0 = the 32-byte constant at .LC0
xorl %eax, %eax
xorl %ecx, %ecx
shrl $2, %esi # esi = N / 4 iterations of the vector loop
.p2align 4,,10
.p2align 3
.L4: # vector initialization: 32 bytes stored to each buffer per iteration
addl $1, %ecx
vmovapd %ymm0, (%r15,%rax)
vmovapd %ymm0, 0(%r13,%rax)
vmovapd %ymm0, (%r12,%rax)
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %esi
ja .L4
movl %ebx, %eax
andl $-4, %eax # eax = N rounded down to a multiple of 4
cmpl %eax, %ebx
je .L26 # N is a multiple of 4: nothing left to initialize
vzeroupper
.L3: # scalar initialization of at most three remaining elements, starting at index eax
vmovsd .LC1(%rip), %xmm0
movslq %eax, %rcx
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ecx, %ebx
jle .L2
movslq %ecx, %rcx
addl $2, %eax
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L2
cltq
vmovsd %xmm0, (%r15,%rax,8)
vmovsd %xmm0, 0(%r13,%rax,8)
vmovsd %xmm0, (%r12,%rax,8)
vmovsd %xmm0, (%r14,%rax,8)
.L2: # initialization done
movl %ebx, %eax
movl $1, -84(%rbp) # repeat count at -84(%rbp) starts at 1
movl %ebx, %r10d
andl $-4, %eax
shrl $2, %r10d # r10d = N / 4, bound of the marked vector loop
movl %eax, -100(%rbp) # spill N & -4, the start index of the scalar remainder
.p2align 4,,10
.p2align 3
.L13: # one measurement with the current repeat count
leaq -56(%rbp), %rsi
leaq -72(%rbp), %rdi
movl %r10d, -88(%rbp)
call timing # first argument points at -72(%rbp), which is subtracted as the start value below
movl -88(%rbp), %r10d
xorl %r11d, %r11d # r11d = repetition counter
.p2align 4,,10
.p2align 3
.L12: # one repetition
vmovsd (%r14), %xmm0
vxorpd %xmm7, %xmm7, %xmm7
vucomisd %xmm7, %xmm0
jbe .L6 # call dummy only when the first element of the r14 buffer is > 0.0
movq %r14, %rdi
movl %r11d, -92(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call dummy
movl -92(%rbp), %r11d
movl -88(%rbp), %r10d
.L6:
testl %ebx, %ebx
jle .L8 # N <= 0: nothing to compute
cmpl $2, -96(%rbp)
jbe .L15 # N - 1 <= 2 (unsigned): scalar path only
xorl %eax, %eax
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.L10:
vmovapd (%r15,%rax), %ymm0
vmovapd (%r12,%rax), %ymm3
addl $1, %ecx
vfmadd132pd 0(%r13,%rax), %ymm3, %ymm0
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %r10d
ja .L10
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
movl -100(%rbp), %eax # eax = N & -4
cmpl %ebx, %eax # NOTE(review): %ebx held N but was overwritten by the marker moves above
je .L8
.L9: # scalar remainder: at most three elements, starting at index eax
movslq %eax, %rcx
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm5
vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ebx, %ecx
jge .L8
movslq %ecx, %rcx
addl $2, %eax
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm6
vfmadd132sd (%r15,%rcx,8), %xmm6, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L8
cltq
vmovsd (%r15,%rax,8), %xmm0
vmovsd (%r12,%rax,8), %xmm4
vfmadd132sd 0(%r13,%rax,8), %xmm4, %xmm0
vmovsd %xmm0, (%r14,%rax,8)
.L8: # end of one repetition
addl $1, %r11d
cmpl -84(%rbp), %r11d
jne .L12 # loop until the counter equals the repeat count
leaq -56(%rbp), %rsi
leaq -64(%rbp), %rdi
movl %r11d, -84(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call timing # first argument points at -64(%rbp), read as the end value below
vmovsd -64(%rbp), %xmm1
vsubsd -72(%rbp), %xmm1, %xmm1 # xmm1 = value at -64(%rbp) minus value at -72(%rbp)
vmovsd .LC3(%rip), %xmm2 # xmm2 = constant at .LC3
movl -84(%rbp), %r11d
movl -88(%rbp), %r10d
vucomisd %xmm1, %xmm2
leal (%r11,%r11), %eax
movl %eax, -84(%rbp) # repeat count doubled
ja .L13 # measure again while the .LC3 constant is greater than xmm1
movl %eax, %esi
vxorpd %xmm6, %xmm6, %xmm6
vxorpd %xmm0, %xmm0, %xmm0
movl %ebx, %edx # edx = N
sarl %esi # esi = doubled repeat count shifted right by one
vcvtsi2sd %ebx, %xmm0, %xmm0 # xmm0 = (double) N
movl $.LC9, %edi # edi = address of the format string
movl $5, %eax # al = 5 before the variadic printf call
vcvtsi2sd %esi, %xmm6, %xmm6 # xmm6 = (double) esi
vmulsd .LC5(%rip), %xmm6, %xmm2
vmovsd .LC4(%rip), %xmm5
vmovsd .LC6(%rip), %xmm7
vmulsd %xmm0, %xmm6, %xmm4
vmulsd %xmm0, %xmm2, %xmm2
vdivsd %xmm1, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
vdivsd %xmm5, %xmm4, %xmm4
vmulsd %xmm7, %xmm2, %xmm3
vaddsd %xmm0, %xmm0, %xmm2
vmulsd .LC8(%rip), %xmm0, %xmm0
vmulsd %xmm6, %xmm2, %xmm2
vmulsd .LC7(%rip), %xmm2, %xmm2
vmulsd %xmm7, %xmm3, %xmm3
vdivsd %xmm5, %xmm0, %xmm0
vdivsd %xmm5, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
call printf
movq %r14, %rdi
call free # release the r14 buffer
movq %r12, %rdi
call free # release the r12 buffer
movq %r13, %rdi
call free # release the r13 buffer
addq $72, %rsp
movq %r15, %rdi # rdi = r15 buffer, released by the tail call below
popq %rbx
popq %r12
popq %r13
.cfi_remember_state
.cfi_def_cfa 13, 0
popq %r14
popq %r15
popq %rbp
leaq -16(%r13), %rsp
.cfi_def_cfa 7, 16
popq %r13
.cfi_def_cfa_offset 8
jmp free # tail call: free returns directly to the caller of triad
.p2align 4,,10
.p2align 3
.L15: # short-array entry to the scalar compute path: start at index 0
.cfi_restore_state
xorl %eax, %eax
jmp .L9
.L26: # vector initialization covered every element
vzeroupper
jmp .L2
.L14: # short-array entry to the scalar initialization: start at index 0
xorl %eax, %eax
jmp .L3
.cfi_endproc
.LFE24:
.size triad, .-triad
.section .rodata.str1.8
.align 8
.LC10: # first string main passes to puts
.string "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
.align 8
.LC11: # second string main passes to puts
.string "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB25:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movl $.LC10, %edi
movl $20, %ebx # ebx = loop counter, starts at 20
call puts
movl $.LC11, %edi
call puts
.p2align 4,,10
.p2align 3
.L28: # one iteration per counter value 20..35 (16 calls to triad)
vxorpd %xmm1, %xmm1, %xmm1
movq .LC12(%rip), %rax # rax = bit pattern of the double at .LC12
vcvtsi2sd %ebx, %xmm1, %xmm1 # xmm1 = (double) counter, second argument of pow
addl $1, %ebx
vmovq %rax, %xmm0 # xmm0 = .LC12 constant, first argument of pow
call pow
vcvttsd2si %xmm0, %edi # edi = pow result truncated to a 32-bit integer
call triad
cmpl $36, %ebx
jne .L28 # stop once the counter reaches 36
xorl %eax, %eax # return value 0
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE25:
.size main, .-main
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0: # four copies of the 64-bit pattern also stored at .LC1; loaded into ymm0 by the vector initialization
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC1: # scalar initialization value; bits 0x3E80867A71B56E7E (roughly 1.23e-7 as a double, TODO confirm)
.long 1907715710
.long 1048610426
.align 8
.LC3: # bits 0x3FC999999999999A = 0.2 as a double; compared against the measured time difference
.long 2576980378
.long 1070176665
.align 8
.LC4: # bits 0x408F400000000000 = 1000.0 as a double
.long 0
.long 1083129856
.align 8
.LC5: # bits 0x4040000000000000 = 32.0 as a double
.long 0
.long 1077936128
.align 8
.LC6: # bits 0x3F50000000000000 = 2^-10 (1/1024) as a double
.long 0
.long 1062207488
.align 8
.LC7: # bits 0x3EB0C6F7A0B5ED8D = 1e-6 as a double
.long 2696277389
.long 1051772663
.align 8
.LC8: # bits 0x4020000000000000 = 8.0 as a double
.long 0
.long 1075838976
.align 8
.LC12: # bits 0x3FF4CCCCCCCCCCCD = 1.3 as a double; base of the pow call in main
.long 3435973837
.long 1073007820
.ident "GCC: (GNU) 7.2.0"
.section .note.GNU-stack,"",@progbits

94
tests/test_frontend.py Executable file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Unit tests for OSACA Frontend
"""
import os
import unittest
from osaca.frontend import Frontend
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.semantics.hw_model import MachineModel
from osaca.semantics.kernel_dg import KernelDG
from osaca.semantics.semantics_appender import SemanticsAppender
class TestFrontend(unittest.TestCase):
    """Smoke tests that build a ``Frontend`` and print analyses for two sample kernels."""

    # <parent of the directory holding this file>/osaca/data/
    MODULE_DATA_DIR = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'osaca/data/'
    )

    @classmethod
    def setUpClass(cls):
        """Parse both kernel files and run the semantics appenders over them once."""
        data_dir = cls.MODULE_DATA_DIR

        # parsers and kernel sources
        cls.parser_x86 = ParserX86ATT()
        cls.parser_AArch64 = ParserAArch64v81()
        with open(cls._find_file('kernel-x86.s')) as x86_file:
            x86_source = x86_file.read()
        with open(cls._find_file('kernel-AArch64.s')) as aarch64_file:
            aarch64_source = aarch64_file.read()
        cls.kernel_x86 = cls.parser_x86.parse_file(x86_source)
        cls.kernel_AArch64 = cls.parser_AArch64.parse_file(aarch64_source)

        # machine models and the semantics appenders built on top of them
        cls.machine_model_csx = MachineModel(path_to_yaml=os.path.join(data_dir, 'csx.yml'))
        cls.machine_model_tx2 = MachineModel(path_to_yaml=os.path.join(data_dir, 'tx2.yml'))
        cls.semantics_csx = SemanticsAppender(
            cls.machine_model_csx, path_to_yaml=os.path.join(data_dir, 'isa/x86.yml')
        )
        cls.semantics_tx2 = SemanticsAppender(
            cls.machine_model_tx2, path_to_yaml=os.path.join(data_dir, 'isa/aarch64.yml')
        )

        # annotate every parsed line: x86 kernel first, then the AArch64 kernel
        annotation_jobs = (
            (cls.semantics_csx, cls.kernel_x86),
            (cls.semantics_tx2, cls.kernel_AArch64),
        )
        for semantics, kernel in annotation_jobs:
            for instruction_form in kernel:
                semantics.assign_src_dst(instruction_form)
                semantics.assign_tp_lt(instruction_form)

    ###########
    # Tests
    ###########

    def test_frontend_creation(self):
        """Invalid constructor arguments raise; a known architecture name is accepted."""
        data_dir = self.MODULE_DATA_DIR
        # neither arch nor path given
        self.assertRaises(ValueError, Frontend)
        # both arch and path given
        self.assertRaises(
            ValueError, Frontend, arch='csx', path_to_yaml=os.path.join(data_dir, 'csx.yml')
        )
        # model file that does not exist, addressed by path and by name
        self.assertRaises(
            FileNotFoundError, Frontend, path_to_yaml=os.path.join(data_dir, 'THE_MACHINE.yml')
        )
        self.assertRaises(FileNotFoundError, Frontend, arch='THE_MACHINE')
        # must not raise
        Frontend(arch='zen1')

    def test_frontend_x86(self):
        """Throughput and latency reports print without error for the x86 kernel."""
        dependency_graph = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
        frontend = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml'))
        frontend.print_throughput_analysis(self.kernel_x86, show_cmnts=False)
        frontend.print_latency_analysis(dependency_graph.get_critical_path())

    def test_frontend_AArch64(self):
        """The full verbose report prints without error for the AArch64 kernel."""
        dependency_graph = KernelDG(
            self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2
        )
        frontend = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml'))
        frontend.print_full_analysis(self.kernel_AArch64, dependency_graph, verbose=True)

    ##################
    # Helper functions
    ##################

    @staticmethod
    def _find_file(name):
        """Return the path of ``name`` inside ``test_files`` next to this module.

        Fails with ``AssertionError`` if the path does not exist.
        """
        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
        assert os.path.exists(path)
        return path
if __name__ == '__main__':
    # Run only this module's test case, with one line of output per test.
    frontend_tests = unittest.TestLoader().loadTestsFromTestCase(TestFrontend)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(frontend_tests)

90
tests/test_kerncraftAPI.py Executable file
View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Unit tests for OSACA Kerncraft API
"""
import os
import unittest
from collections import OrderedDict
from osaca.api import KerncraftAPI
from osaca.parser import ParserAArch64v81, ParserX86ATT
class TestKerncraftAPI(unittest.TestCase):
    """Checks ``KerncraftAPI`` results for the marked x86 and AArch64 triad files."""

    @classmethod
    def setUpClass(cls):
        """Create both parsers and read the two marked assembly files once."""
        cls.parser_x86 = ParserX86ATT()
        cls.parser_AArch64 = ParserAArch64v81()
        with open(cls._find_file('triad-x86-iaca.s')) as x86_file:
            cls.code_x86 = x86_file.read()
        with open(cls._find_file('triad-arm-iaca.s')) as aarch64_file:
            cls.code_AArch64 = aarch64_file.read()

    ###########
    # Tests
    ###########

    def test_kerncraft_API_x86(self):
        """Port occupation, throughput and latency for the x86 triad on 'csx'."""
        api = KerncraftAPI('csx', self.code_x86)
        api.create_output()
        self.assertEqual(api.get_unmatched_instruction_ratio(), 0.0)

        expected_ports = OrderedDict()
        for port, cycles in (
            ('0', 1.25),
            ('0DV', 0.0),
            ('1', 1.25),
            ('2', 2.0),
            ('2D', 1.5),
            ('3', 2.0),
            ('3D', 1.5),
            ('4', 1.0),
            ('5', 0.75),
            ('6', 0.75),
            ('7', 0.0),
        ):
            expected_ports[port] = cycles

        self.assertEqual(api.get_port_occupation_cycles(), expected_ports)
        self.assertEqual(api.get_total_throughput(), 2.0)
        self.assertEqual(api.get_latency(), (1.0, 13.0))

    def test_kerncraft_API_AArch64(self):
        """Port occupation and throughput for the AArch64 triad on 'tx2'."""
        api = KerncraftAPI('tx2', self.code_AArch64)
        api.create_output()
        self.assertEqual(api.get_unmatched_instruction_ratio(), 0.0)

        expected_ports = OrderedDict()
        for port, cycles in (
            ('0', 34.0),
            ('0DV', 0.0),
            ('1', 34.0),
            ('1DV', 0.0),
            ('2', 2.0),
            ('3', 64.0),
            ('4', 64.0),
            ('5', 32.0),
        ):
            expected_ports[port] = cycles

        self.assertEqual(api.get_port_occupation_cycles(), expected_ports)
        self.assertEqual(api.get_total_throughput(), 64.0)
        # TODO add missing latency values
        # self.assertEqual(kapi.get_latency(kernel), 20.0)

    ##################
    # Helper functions
    ##################

    @staticmethod
    def _find_file(name):
        """Return the path of ``name`` inside ``test_files`` next to this module.

        Fails with ``AssertionError`` if the path does not exist.
        """
        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
        assert os.path.exists(path)
        return path
if __name__ == '__main__':
    # Run only this module's test case, with one line of output per test.
    kerncraft_api_tests = unittest.TestLoader().loadTestsFromTestCase(TestKerncraftAPI)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(kerncraft_api_tests)

308
tests/test_marker_utils.py Executable file
View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""
Unit tests for IACA/OSACA marker utilities
"""
import os
import unittest
from osaca.semantics import reduce_to_section
from osaca.parser import ParserAArch64v81, ParserX86ATT
class TestMarkerUtils(unittest.TestCase):
    """Tests for ``reduce_to_section``: extracting the code between IACA/OSACA markers.

    A marker is a ``mov`` of 111 (start) or 222 (end) into a fixed register followed by
    a fixed byte sequence, which may be spread over several ``.byte`` directives.
    """

    @classmethod
    def setUpClass(cls):
        """Create both parsers and parse the two marked triad files once."""
        cls.parser_AArch = ParserAArch64v81()
        cls.parser_x86 = ParserX86ATT()
        with open(cls._find_file('triad-arm-iaca.s')) as f:
            triad_code_arm = f.read()
        with open(cls._find_file('triad-x86-iaca.s')) as f:
            triad_code_x86 = f.read()
        cls.parsed_AArch = cls.parser_AArch.parse_file(triad_code_arm)
        cls.parsed_x86 = cls.parser_x86.parse_file(triad_code_x86)

    #################
    # Test
    #################

    def test_marker_detection_AArch64(self):
        """The marked section of the AArch64 triad file spans file lines 307-444."""
        kernel = reduce_to_section(self.parsed_AArch, 'AArch64')
        self.assertEqual(len(kernel), 138)
        self.assertEqual(kernel[0].line_number, 307)
        self.assertEqual(kernel[-1].line_number, 444)

    def test_marker_detection_x86(self):
        """The marked section of the x86 triad file spans file lines 146-154."""
        kernel = reduce_to_section(self.parsed_x86, 'x86')
        self.assertEqual(len(kernel), 9)
        self.assertEqual(kernel[0].line_number, 146)
        self.assertEqual(kernel[-1].line_number, 154)

    def test_marker_matching_AArch64(self):
        """Every way of splitting the AArch64 marker bytes over lines is recognised."""
        # preparation
        bytes_1_line = '.byte 213,3,32,31\n'
        bytes_2_lines_1 = '.byte 213,3,32\n' + '.byte 31\n'
        bytes_2_lines_2 = '.byte 213,3\n' + '.byte 32,31\n'
        bytes_2_lines_3 = '.byte 213\n' + '.byte 3,32,31\n'
        bytes_3_lines_1 = '.byte 213,3\n' + '.byte 32\n' + '.byte 31\n'
        bytes_3_lines_2 = '.byte 213\n' + '.byte 3,32\n' + '.byte 31\n'
        bytes_3_lines_3 = '.byte 213\n' + '.byte 3\n' + '.byte 32,31\n'
        bytes_4_lines = '.byte 213\n' + '.byte 3\n' + '.byte 32\n' + '.byte 31\n'
        mov_start_1 = 'mov x1, #111\n'
        mov_start_2 = 'mov x1, 111 // should work as well\n'
        mov_end_1 = 'mov x1, #222 // preferred way\n'
        mov_end_2 = 'mov x1, 222\n'
        prologue = (
            'mov x12, xzr\n'
            + '\tldp x9, x10, [sp, #16] // 8-byte Folded Reload\n'
            + ' .p2align 6\n'
        )
        kernel = (
            '.LBB0_28:\n'
            + 'fmul v7.2d, v7.2d, v19.2d\n'
            + 'stp q0, q1, [x10, #-32]\n'
            + 'b.ne .LBB0_28\n'
        )
        epilogue = '.LBB0_29: // Parent Loop BB0_20 Depth=1\n' + 'bl dummy\n'
        bytes_variations = [
            bytes_1_line,
            bytes_2_lines_1,
            bytes_2_lines_2,
            bytes_2_lines_3,
            bytes_3_lines_1,
            bytes_3_lines_2,
            bytes_3_lines_3,
            bytes_4_lines,
        ]
        mov_start_variations = [mov_start_1, mov_start_2]
        mov_end_variations = [mov_end_1, mov_end_2]
        # actual tests
        self._check_marker_variations(
            self.parser_AArch,
            'AArch64',
            prologue,
            kernel,
            epilogue,
            mov_start_variations,
            mov_end_variations,
            bytes_variations,
        )

    def test_marker_matching_x86(self):
        """Every way of splitting the x86 marker bytes over lines is recognised."""
        # preparation
        bytes_1_line = '.byte 100,103,144\n'
        bytes_2_lines_1 = '.byte 100,103\n' + '.byte 144\n'
        bytes_2_lines_2 = '.byte 100\n' + '.byte 103,144\n'
        bytes_3_lines = (
            '.byte 100 # IACA MARKER UTILITY\n'
            + '.byte 103 # IACA MARKER UTILITY\n'
            + '.byte 144 # IACA MARKER UTILITY\n'
        )
        mov_start_1 = 'movl $111, %ebx # IACA START\n'
        mov_start_2 = 'mov $111, %ebx # IACA START\n'
        mov_end_1 = 'movl $222, %ebx # IACA END\n'
        mov_end_2 = 'mov $222, %ebx # IACA END\n'
        # the prologue ends with a lone marker-style mov that is NOT followed by the bytes
        prologue = 'movl -92(%rbp), %r11d\n' + 'movl $111, %ebx\n'
        kernel = (
            'vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0\n'
            + 'vmovsd %xmm0, (%r14,%rcx,8)\n'
            + 'cmpl %ebx, %ecx\n'
            + 'jge .L8\n'
        )
        epilogue = '.LE9:\t\t#12.2\n' + 'call dummy\n'
        bytes_variations = [bytes_1_line, bytes_2_lines_1, bytes_2_lines_2, bytes_3_lines]
        mov_start_variations = [mov_start_1, mov_start_2]
        mov_end_variations = [mov_end_1, mov_end_2]
        # actual tests
        self._check_marker_variations(
            self.parser_x86,
            'x86',
            prologue,
            kernel,
            epilogue,
            mov_start_variations,
            mov_end_variations,
            bytes_variations,
        )

    def test_marker_special_cases_AArch(self):
        """Marker placement corner cases and missing markers for AArch64."""
        bytes_line = '.byte 213,3,32,31\n'
        mov_start = 'mov x1, #111\n'
        mov_end = 'mov x1, #222\n'
        prologue = 'dup v0.2d, x14\n' + ' neg x9, x9\n' + ' .p2align 6\n'
        kernel = (
            '.LBB0_28:\n'
            + 'fmul v7.2d, v7.2d, v19.2d\n'
            + 'stp q0, q1, [x10, #-32]\n'
            + 'b.ne .LBB0_28\n'
        )
        epilogue = '.LBB0_29: // Parent Loop BB0_20 Depth=1\n' + 'bl dummy\n'
        self._check_special_cases(
            self.parser_AArch,
            'AArch64',
            prologue,
            kernel,
            epilogue,
            mov_start,
            mov_end,
            bytes_line,
        )

    def test_marker_special_cases_x86(self):
        """Marker placement corner cases and missing markers for x86."""
        bytes_line = '.byte 100\n.byte 103\n.byte 144\n'
        mov_start = 'movl $111, %ebx\n'
        mov_end = 'movl $222, %ebx\n'
        prologue = 'movl -88(%rbp), %r10d\n' + 'xorl %r11d, %r11d\n' + '.p2align 4,,10\n'
        kernel = (
            '.L3: #L3\n'
            + 'vmovsd .LC1(%rip), %xmm0\n'
            + 'vmovsd %xmm0, (%r15,%rcx,8)\n'
            + 'cmpl %ecx, %ebx\n'
            + 'jle .L3\n'
        )
        epilogue = 'leaq -56(%rbp), %rsi\n' + 'movl %r10d, -88(%rbp)\n' + 'call timing\n'
        self._check_special_cases(
            self.parser_x86, 'x86', prologue, kernel, epilogue, mov_start, mov_end, bytes_line
        )

    ##################
    # Helper functions
    ##################

    @staticmethod
    def _line_count(code):
        """Return the number of non-empty lines in ``code``."""
        return len(list(filter(None, code.split('\n'))))

    def _check_marker_variations(
        self, parser, isa, prologue, kernel, epilogue, mov_starts, mov_ends, bytes_variations
    ):
        """Check kernel extraction for every combination of marker spellings.

        For each start ``mov``, start byte layout, end ``mov`` and end byte layout, the
        section returned by ``reduce_to_section`` must equal ``kernel`` parsed on its own
        with ``start_line`` set to the number of non-empty lines preceding it.
        """
        kernel_length = self._line_count(kernel)
        for mov_start_var in mov_starts:
            for bytes_var_1 in bytes_variations:
                for mov_end_var in mov_ends:
                    for bytes_var_2 in bytes_variations:
                        preamble = prologue + mov_start_var + bytes_var_1
                        sample_code = preamble + kernel + mov_end_var + bytes_var_2 + epilogue
                        with self.subTest(
                            mov_start=mov_start_var,
                            bytes_start=bytes_var_1,
                            mov_end=mov_end_var,
                            bytes_end=bytes_var_2,
                        ):
                            sample_parsed = parser.parse_file(sample_code)
                            sample_kernel = reduce_to_section(sample_parsed, isa)
                            self.assertEqual(len(sample_kernel), kernel_length)
                            parsed_kernel = parser.parse_file(
                                kernel, start_line=self._line_count(preamble)
                            )
                            self.assertEqual(sample_kernel, parsed_kernel)

    def _check_special_cases(
        self, parser, isa, prologue, kernel, epilogue, mov_start, mov_end, bytes_line
    ):
        """Check marker placement corner cases and the error raised for missing markers."""
        kernel_length = self._line_count(kernel)
        start_marker = mov_start + bytes_line
        end_marker = mov_end + bytes_line

        # start marker is the very first thing in the code
        code_beginning = start_marker + kernel + end_marker + epilogue
        beginning_parsed = parser.parse_file(code_beginning)
        test_kernel = reduce_to_section(beginning_parsed, isa)
        self.assertEqual(len(test_kernel), kernel_length)
        parsed_kernel = parser.parse_file(kernel, start_line=self._line_count(start_marker))
        self.assertEqual(test_kernel, parsed_kernel)

        # markers preceded by a prologue
        # NOTE(review): originally labelled "marker at the end", but the epilogue still
        # follows the end marker here; a marker at the very end of the code is not tested.
        code_end = prologue + start_marker + kernel + end_marker + epilogue
        end_parsed = parser.parse_file(code_end)
        test_kernel = reduce_to_section(end_parsed, isa)
        self.assertEqual(len(test_kernel), kernel_length)
        parsed_kernel = parser.parse_file(
            kernel, start_line=self._line_count(prologue + start_marker)
        )
        self.assertEqual(test_kernel, parsed_kernel)

        # no kernel between the markers
        code_empty = prologue + start_marker + end_marker + epilogue
        empty_parsed = parser.parse_file(code_empty)
        test_kernel = reduce_to_section(empty_parsed, isa)
        self.assertEqual(len(test_kernel), 0)
        self.assertEqual(test_kernel, [])

        # Parsing stays outside the assertRaises blocks so that only reduce_to_section
        # is allowed to raise LookupError.

        # no start marker (bytes without the preceding mov)
        code_no_start = prologue + bytes_line + kernel + end_marker + epilogue
        no_start_parsed = parser.parse_file(code_no_start)
        with self.assertRaises(LookupError):
            reduce_to_section(no_start_parsed, isa)

        # no end marker (mov without the following bytes)
        code_no_end = prologue + start_marker + kernel + mov_end + epilogue
        no_end_parsed = parser.parse_file(code_no_end)
        with self.assertRaises(LookupError):
            reduce_to_section(no_end_parsed, isa)

        # no marker at all
        code_no_marker = prologue + kernel + epilogue
        no_marker_parsed = parser.parse_file(code_no_marker)
        with self.assertRaises(LookupError):
            reduce_to_section(no_marker_parsed, isa)

    @staticmethod
    def _find_file(name):
        """Return the path of ``name`` inside ``test_files`` next to this module.

        Fails with ``AssertionError`` if the path does not exist.
        """
        testdir = os.path.dirname(__file__)
        name = os.path.join(testdir, 'test_files', name)
        assert os.path.exists(name)
        return name
if __name__ == '__main__':
    # Run only this module's test case, with one line of output per test.
    marker_tests = unittest.TestLoader().loadTestsFromTestCase(TestMarkerUtils)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(marker_tests)

View File

@@ -1,69 +0,0 @@
#!/usr/bin/env python3
import sys
from io import StringIO
import os
import unittest
sys.path.insert(0, '..')
from osaca import osaca
class TestOsaca(unittest.TestCase):
    """Compares OSACA's text report and port occupation against stored references."""

    maxDiff = None

    def setUp(self):
        # Directory containing this test module. os.path.dirname is used instead of
        # splitting on '/', which yields an empty string when the path separator differs.
        self.curr_dir = os.path.dirname(os.path.realpath(__file__))

    def _assert_report_matches(self, assembly, reference_name):
        """Analyse ``assembly`` for 'IVB' and compare the report with a reference file.

        Only the part of the report starting at the port-binding header is compared,
        and spaces are ignored on both sides.
        """
        osa = osaca.OSACA('IVB', assembly)
        result = osa.generate_text_output()
        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
        with open(os.path.join(self.curr_dir, reference_name), encoding='utf-8') as f:
            assertion = f.read()
        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))

    @unittest.skip("Binary analysis is error prone and currently not working with FSF's objdump")
    def testIACABinary(self):
        assembly = osaca.get_assembly_from_binary(
            os.path.join(self.curr_dir, 'testfiles', 'taxCalc-ivb-iaca')
        )
        self._assert_report_matches(assembly, 'test_osaca_iaca.out')

    # Test ASM file with IACA marker in two lines
    def testIACAasm1(self):
        with open(os.path.join(self.curr_dir, 'testfiles', 'taxCalc-ivb-iaca.S')) as f:
            assembly = f.read()
        self._assert_report_matches(assembly, 'test_osaca_iaca_asm.out')

    # Test ASM file with IACA marker in four lines
    def testIACAasm2(self):
        with open(os.path.join(self.curr_dir, 'testfiles', 'taxCalc-ivb-iaca2.S')) as f:
            assembly = f.read()
        self._assert_report_matches(assembly, 'test_osaca_iaca_asm.out')

    #@unittest.skip("Skip until required instructions are supported.")
    def test_asm_API(self):
        marked_file = os.path.join(
            self.curr_dir, 'testfiles', '3d-7pt.icc.skx.avx512.iaca_marked.s'
        )
        with open(marked_file) as f:
            osa = osaca.OSACA('SKX', f.read())
        text_output = osa.create_output()
        print(text_output)
        # Derived from IACA (and manually considering OSACAs equal distribution to ports)
        self.assertEqual(
            dict(osa.get_port_occupation_cycles()),
            {
                '0': 4.0,
                '0DV': 0.0,
                '1': 3.5,
                '2': 3.5,
                '3': 3.5,
                '4': 1.0,
                '5': 4.5,
                '6': 3.5,
                '7': 0.0,
            },
        )
        # TODO consider frontend bottleneck -> 6.25 cy
        self.assertEqual(osa.get_total_throughput(), 4.5)

View File

@@ -1,26 +0,0 @@
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 3.67 | 5.67 | 1.0 | 1.0 | 2.0 | 3.67 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| 0.50 | 0.50 | | | | | lea 1(%rax,%rax),%edx
| | 1.00 | | | | 1.00 | vcvtsi2ss %edx,%xmm2,%xmm2
| 1.00 | | | | | | vmulss %xmm2,%xmm0,%xmm3
| 0.50 | 0.50 | | | | | lea 2(%rax,%rax),%ecx
| | 1.00 | | | | | vaddss %xmm3,%xmm1,%xmm4
| | | | | | 1.00 | vxorps %xmm1,%xmm1,%xmm1
| | 1.00 | | | | 1.00 | vcvtsi2ss %ecx,%xmm1,%xmm1
| 1.00 | | | | | | vmulss %xmm1,%xmm0,%xmm5
| | | 0.50 | 0.50 | 1.00 | | vmovss %xmm4,4(%rsp,%rax,8)
| | 1.00 | | | | | vaddss %xmm5,%xmm4,%xmm1
| | | 0.50 | 0.50 | 1.00 | | vmovss %xmm1,8(%rsp,%rax,8)
| 0.33 | 0.33 | | | | 0.33 | inc %rax
| 0.33 | 0.33 | | | | 0.33 | cmp $499,%rax
| | | | | | | X jb main_98
Total number of estimated throughput: 5.67

View File

@@ -1,26 +0,0 @@
Port Binding in Cycles Per Iteration:
-------------------------------------------------
| Port | 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------------
| Cycles | 3.67 | 5.67 | 1.0 | 1.0 | 2.0 | 3.67 |
-------------------------------------------------
Ports Pressure in cycles
| 0 | 1 | 2 | 3 | 4 | 5 |
-------------------------------------------
| 0.50 | 0.50 | | | | | lea 1(%rax,%rax), %edx
| | 1.00 | | | | 1.00 | vcvtsi2ss %edx, %xmm2, %xmm2
| 1.00 | | | | | | vmulss %xmm2, %xmm0, %xmm3
| 0.50 | 0.50 | | | | | lea 2(%rax,%rax), %ecx
| | 1.00 | | | | | vaddss %xmm3, %xmm1, %xmm4
| | | | | | 1.00 | vxorps %xmm1, %xmm1, %xmm1
| | 1.00 | | | | 1.00 | vcvtsi2ss %ecx, %xmm1, %xmm1
| 1.00 | | | | | | vmulss %xmm1, %xmm0, %xmm5
| | | 0.50 | 0.50 | 1.00 | | vmovss %xmm4, 4(%rsp,%rax,8)
| | 1.00 | | | | | vaddss %xmm5, %xmm4, %xmm1
| | | 0.50 | 0.50 | 1.00 | | vmovss %xmm1, 8(%rsp,%rax,8)
| 0.33 | 0.33 | | | | 0.33 | incq %rax
| 0.33 | 0.33 | | | | 0.33 | cmpq $499, %rax
| | | | | | | jb ..B1.4
Total number of estimated throughput: 5.67

413
tests/test_parser_AArch64v81.py Executable file
View File

@@ -0,0 +1,413 @@
#!/usr/bin/env python3
"""
Unit tests for ARMv8 AArch64 assembly parser
"""
import os
import unittest
from pyparsing import ParseException
from osaca.parser import AttrDict, ParserAArch64v81
class TestParserAArch64v81(unittest.TestCase):
@classmethod
def setUpClass(cls):
    """Create the shared parser and read the marked AArch64 triad file once."""
    cls.parser = ParserAArch64v81()
    with open(cls._find_file('triad-arm-iaca.s')) as triad_file:
        cls.triad_code = triad_file.read()
##################
# Test
##################
def test_comment_parser(self):
    """The comment helper returns the text that follows the first ``//``."""
    cases = (
        ('// some comments', 'some comments'),
        ('\t\t//AA BB CC \t end \t', 'AA BB CC end'),
        ('\t//// comment //// comment', '// comment //// comment'),
    )
    for source_line, expected_comment in cases:
        self.assertEqual(self._get_comment(self.parser, source_line), expected_comment)
def test_label_parser(self):
    """Label names are parsed without the colon; a directive is not a label."""
    name_cases = (
        ('main:', 'main'),
        ('..B1.10:', '..B1.10'),
        ('.2.3_2_pack.3:', '.2.3_2_pack.3'),
        ('.L1:\t\t\t//label1', '.L1'),
    )
    for source_line, expected_name in name_cases:
        self.assertEqual(self._get_label(self.parser, source_line).name, expected_name)

    # a trailing comment is kept as a sequence of words
    commented_label = self._get_label(self.parser, '.L1:\t\t\t//label1')
    self.assertEqual(' '.join(commented_label.comment), 'label1')

    self.assertRaises(ParseException, self._get_label, self.parser, '\t.cfi_startproc')
def test_directive_parser(self):
self.assertEqual(self._get_directive(self.parser, '\t.text').name, 'text')
self.assertEqual(len(self._get_directive(self.parser, '\t.text').parameters), 0)
self.assertEqual(self._get_directive(self.parser, '\t.align\t16,0x90').name, 'align')
self.assertEqual(len(self._get_directive(self.parser, '\t.align\t16,0x90').parameters), 2)
self.assertEqual(
self._get_directive(self.parser, '\t.align\t16,0x90').parameters[1], '0x90'
)
self.assertEqual(
self._get_directive(self.parser, ' .byte 100,103,144 //IACA START')[
'name'
],
'byte',
)
self.assertEqual(
self._get_directive(self.parser, ' .byte 100,103,144 //IACA START')[
'parameters'
][2],
'144',
)
self.assertEqual(
' '.join(
self._get_directive(self.parser, ' .byte 100,103,144 //IACA START')[
'comment'
]
),
'IACA START',
)
def test_parse_instruction(self):
instr1 = '\t\tvcvt.F32.S32 w1, w2\t\t\t//12.27'
instr2 = 'b.lo ..B1.4 \t'
instr3 = ' mov x2,#0x222 //NOT IACA END'
instr4 = 'str x28, [sp, x1, lsl #4] //12.9'
instr5 = 'ldr x0, [x0, #:got_lo12:q2c]'
instr6 = 'adrp x0, :got:visited'
instr7 = 'fadd v17.2d, v16.2d, v1.2d'
parsed_1 = self.parser.parse_instruction(instr1)
parsed_2 = self.parser.parse_instruction(instr2)
parsed_3 = self.parser.parse_instruction(instr3)
parsed_4 = self.parser.parse_instruction(instr4)
parsed_5 = self.parser.parse_instruction(instr5)
parsed_6 = self.parser.parse_instruction(instr6)
parsed_7 = self.parser.parse_instruction(instr7)
self.assertEqual(parsed_1.instruction, 'vcvt.F32.S32')
self.assertEqual(parsed_1.operands[0].register.name, '1')
self.assertEqual(parsed_1.operands[0].register.prefix, 'w')
self.assertEqual(parsed_1.operands[1].register.name, '2')
self.assertEqual(parsed_1.operands[1].register.prefix, 'w')
self.assertEqual(parsed_1.comment, '12.27')
self.assertEqual(parsed_2.instruction, 'b.lo')
self.assertEqual(parsed_2.operands[0].identifier.name, '..B1.4')
self.assertEqual(len(parsed_2.operands), 1)
self.assertIsNone(parsed_2.comment)
self.assertEqual(parsed_3.instruction, 'mov')
self.assertEqual(parsed_3.operands[0].register.name, '2')
self.assertEqual(parsed_3.operands[0].register.prefix, 'x')
self.assertEqual(parsed_3.operands[1].immediate.value, '0x222')
self.assertEqual(parsed_3.comment, 'NOT IACA END')
self.assertEqual(parsed_4.instruction, 'str')
self.assertIsNone(parsed_4.operands[1].memory.offset)
self.assertEqual(parsed_4.operands[1].memory.base.name, 'sp')
self.assertEqual(parsed_4.operands[1].memory.base.prefix, 'x')
self.assertEqual(parsed_4.operands[1].memory.index.name, '1')
self.assertEqual(parsed_4.operands[1].memory.index.prefix, 'x')
self.assertEqual(parsed_4.operands[1].memory.scale, 16)
self.assertEqual(parsed_4.operands[0].register.name, '28')
self.assertEqual(parsed_4.operands[0].register.prefix, 'x')
self.assertEqual(parsed_4.comment, '12.9')
self.assertEqual(parsed_5.instruction, 'ldr')
self.assertEqual(parsed_5.operands[0].register.name, '0')
self.assertEqual(parsed_5.operands[0].register.prefix, 'x')
self.assertEqual(parsed_5.operands[1].memory.offset.identifier.name, 'q2c')
self.assertEqual(parsed_5.operands[1].memory.offset.identifier.relocation, ':got_lo12:')
self.assertEqual(parsed_5.operands[1].memory.base.name, '0')
self.assertEqual(parsed_5.operands[1].memory.base.prefix, 'x')
self.assertIsNone(parsed_5.operands[1].memory.index)
self.assertEqual(parsed_5.operands[1].memory.scale, 1)
self.assertEqual(parsed_6.instruction, 'adrp')
self.assertEqual(parsed_6.operands[0].register.name, '0')
self.assertEqual(parsed_6.operands[0].register.prefix, 'x')
self.assertEqual(parsed_6.operands[1].identifier.relocation, ':got:')
self.assertEqual(parsed_6.operands[1].identifier.name, 'visited')
self.assertEqual(parsed_7.instruction, 'fadd')
self.assertEqual(parsed_7.operands[0].register.name, '17')
self.assertEqual(parsed_7.operands[0].register.prefix, 'v')
self.assertEqual(parsed_7.operands[0].register.lanes, '2')
self.assertEqual(parsed_7.operands[0].register.shape, 'd')
self.assertEqual(self.parser.get_full_reg_name(parsed_7.operands[2].register), 'v1.2d')
def test_parse_line(self):
line_comment = '// -- Begin main'
line_label = '.LBB0_1: // =>This Inner Loop Header: Depth=1'
line_directive = '\t.cfi_def_cfa w29, -16'
line_instruction = '\tldr s0, [x11, w10, sxtw #2]\t\t// = <<2'
line_prefetch = 'prfm pldl1keep, [x26, #2048] //HPL'
line_preindexed = 'stp x29, x30, [sp, #-16]!'
line_postindexed = 'ldp q2, q3, [x11], #64'
instruction_form_1 = {
'instruction': None,
'operands': None,
'directive': None,
'comment': '-- Begin main',
'label': None,
'line': '// -- Begin main',
'line_number': 1,
}
instruction_form_2 = {
'instruction': None,
'operands': None,
'directive': None,
'comment': '=>This Inner Loop Header: Depth=1',
'label': '.LBB0_1',
'line': '.LBB0_1: // =>This Inner Loop Header: Depth=1',
'line_number': 2,
}
instruction_form_3 = {
'instruction': None,
'operands': None,
'directive': {'name': 'cfi_def_cfa', 'parameters': ['w29', '-16']},
'comment': None,
'label': None,
'line': '.cfi_def_cfa w29, -16',
'line_number': 3,
}
instruction_form_4 = {
'instruction': 'ldr',
'operands': [
{'register': {'prefix': 's', 'name': '0'}},
{
'memory': {
'offset': None,
'base': {'prefix': 'x', 'name': '11'},
'index': {
'prefix': 'w',
'name': '10',
'shift_op': 'sxtw',
'shift': {'value': '2'},
},
'scale': 4,
}
},
],
'directive': None,
'comment': '= <<2',
'label': None,
'line': 'ldr s0, [x11, w10, sxtw #2]\t\t// = <<2',
'line_number': 4,
}
instruction_form_5 = {
'instruction': 'prfm',
'operands': [
{'prfop': {'type': ['PLD'], 'target': ['L1'], 'policy': ['KEEP']}},
{
'memory': {
'offset': {'value': '2048'},
'base': {'prefix': 'x', 'name': '26'},
'index': None,
'scale': 1,
}
},
],
'directive': None,
'comment': 'HPL',
'label': None,
'line': 'prfm pldl1keep, [x26, #2048] //HPL',
'line_number': 5,
}
instruction_form_6 = {
'instruction': 'stp',
'operands': [
{'register': {'prefix': 'x', 'name': '29'}},
{'register': {'prefix': 'x', 'name': '30'}},
{
'memory': {
'offset': {'value': '-16'},
'base': {'name': 'sp', 'prefix': 'x'},
'index': None,
'scale': 1,
'pre_indexed': True,
}
},
],
'directive': None,
'comment': None,
'label': None,
'line': 'stp x29, x30, [sp, #-16]!',
'line_number': 6,
}
instruction_form_7 = {
'instruction': 'ldp',
'operands': [
{'register': {'prefix': 'q', 'name': '2'}},
{'register': {'prefix': 'q', 'name': '3'}},
{
'memory': {
'offset': None,
'base': {'prefix': 'x', 'name': '11'},
'index': None,
'scale': 1,
'post_indexed': {'value': '64'},
}
},
],
'directive': None,
'comment': None,
'label': None,
'line': 'ldp q2, q3, [x11], #64',
'line_number': 7,
}
parsed_1 = self.parser.parse_line(line_comment, 1)
parsed_2 = self.parser.parse_line(line_label, 2)
parsed_3 = self.parser.parse_line(line_directive, 3)
parsed_4 = self.parser.parse_line(line_instruction, 4)
parsed_5 = self.parser.parse_line(line_prefetch, 5)
parsed_6 = self.parser.parse_line(line_preindexed, 6)
parsed_7 = self.parser.parse_line(line_postindexed, 7)
self.assertEqual(parsed_1, instruction_form_1)
self.assertEqual(parsed_2, instruction_form_2)
self.assertEqual(parsed_3, instruction_form_3)
self.assertEqual(parsed_4, instruction_form_4)
self.assertEqual(parsed_5, instruction_form_5)
self.assertEqual(parsed_6, instruction_form_6)
self.assertEqual(parsed_7, instruction_form_7)
def test_parse_file(self):
parsed = self.parser.parse_file(self.triad_code)
self.assertEqual(parsed[0].line_number, 1)
self.assertEqual(len(parsed), 645)
def test_normalize_imd(self):
imd_decimal_1 = {'value': '79'}
imd_hex_1 = {'value': '0x4f'}
imd_decimal_2 = {'value': '8'}
imd_hex_2 = {'value': '0x8'}
imd_float_11 = {'float': {'mantissa': '0.79', 'e_sign': '+', 'exponent': '2'}}
imd_float_12 = {'float': {'mantissa': '790.0', 'e_sign': '-', 'exponent': '1'}}
imd_double_11 = {'double': {'mantissa': '0.79', 'e_sign': '+', 'exponent': '2'}}
imd_double_12 = {'double': {'mantissa': '790.0', 'e_sign': '-', 'exponent': '1'}}
identifier = {'identifier': {'name': '..B1.4'}}
value1 = self.parser.normalize_imd(imd_decimal_1)
self.assertEqual(value1, self.parser.normalize_imd(imd_hex_1))
self.assertEqual(
self.parser.normalize_imd(imd_decimal_2), self.parser.normalize_imd(imd_hex_2)
)
self.assertEqual(self.parser.normalize_imd(imd_float_11), value1)
self.assertEqual(self.parser.normalize_imd(imd_float_12), value1)
self.assertEqual(self.parser.normalize_imd(imd_double_11), value1)
self.assertEqual(self.parser.normalize_imd(imd_double_12), value1)
self.assertEqual(self.parser.normalize_imd(identifier), identifier)
def test_multiple_regs(self):
instr_range = 'PUSH {r5-r7}'
reg_range = AttrDict({
'register': {
'range': [
{'prefix': 'r', 'name': '5'},
{'prefix': 'r', 'name': '7'}
],
'index': None
}
})
instr_list = 'POP {r5, r7, r9}'
reg_list = AttrDict({
'register': {
'list': [
{'prefix': 'r', 'name': '5'},
{'prefix': 'r', 'name': '7'},
{'prefix': 'r', 'name': '9'}
],
'index': None
}
})
prange = self.parser.parse_line(instr_range)
plist = self.parser.parse_line(instr_list)
self.assertEqual(prange.operands[0], reg_range)
self.assertEqual(plist.operands[0], reg_list)
def test_reg_dependency(self):
reg_1_1 = AttrDict({'prefix': 'b', 'name': '1'})
reg_1_2 = AttrDict({'prefix': 'h', 'name': '1'})
reg_1_3 = AttrDict({'prefix': 's', 'name': '1'})
reg_1_4 = AttrDict({'prefix': 'd', 'name': '1'})
reg_1_4 = AttrDict({'prefix': 'q', 'name': '1'})
reg_2_1 = AttrDict({'prefix': 'w', 'name': '2'})
reg_2_2 = AttrDict({'prefix': 'x', 'name': '2'})
reg_v1_1 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '16', 'shape': 'b'})
reg_v1_2 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '8', 'shape': 'h'})
reg_v1_3 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '4', 'shape': 's'})
reg_v1_4 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '2', 'shape': 'd'})
reg_b5 = AttrDict({'prefix': 'b', 'name': '5'})
reg_q15 = AttrDict({'prefix': 'q', 'name': '15'})
reg_v10 = AttrDict({'prefix': 'v', 'name': '10', 'lanes': '2', 'shape': 's'})
reg_v20 = AttrDict({'prefix': 'v', 'name': '20', 'lanes': '2', 'shape': 'd'})
reg_1 = [reg_1_1, reg_1_2, reg_1_3, reg_1_4]
reg_2 = [reg_2_1, reg_2_2]
reg_v = [reg_v1_1, reg_v1_2, reg_v1_3, reg_v1_4]
reg_others = [reg_b5, reg_q15, reg_v10, reg_v20]
regs = reg_1 + reg_2 + reg_v + reg_others
# test each register against each other
for ri in reg_1:
for rj in regs:
assert_value = True if rj in reg_1 else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_2:
for rj in regs:
assert_value = True if rj in reg_2 else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_v:
for rj in regs:
assert_value = True if rj in reg_v else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_others:
for rj in regs:
assert_value = True if rj == ri else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
##################
# Helper functions
##################
def _get_comment(self, parser, comment):
return ' '.join(
AttrDict.convert_dict(
parser.process_operand(parser.comment.parseString(comment, parseAll=True).asDict())
).comment
)
def _get_label(self, parser, label):
return AttrDict.convert_dict(
parser.process_operand(parser.label.parseString(label, parseAll=True).asDict())
).label
def _get_directive(self, parser, directive):
return AttrDict.convert_dict(
parser.process_operand(parser.directive.parseString(directive, parseAll=True).asDict())
).directive
@staticmethod
def _find_file(name):
testdir = os.path.dirname(__file__)
name = os.path.join(testdir, 'test_files', name)
assert os.path.exists(name)
return name
if __name__ == '__main__':
    # Run only this module's test case, printing one line per test.
    aarch64_suite = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64v81)
    verbose_runner = unittest.TextTestRunner(verbosity=2)
    verbose_runner.run(aarch64_suite)

306
tests/test_parser_x86att.py Executable file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Unit tests for x86 AT&T assembly parser
"""
import os
import unittest
from pyparsing import ParseException
from osaca.parser import AttrDict, ParserX86ATT
class TestParserX86ATT(unittest.TestCase):
@classmethod
def setUpClass(self):
self.parser = ParserX86ATT()
with open(self._find_file('triad-x86-iaca.s')) as f:
self.triad_code = f.read()
##################
# Test
##################
def test_comment_parser(self):
self.assertEqual(self._get_comment(self.parser, '# some comments'), 'some comments')
self.assertEqual(self._get_comment(self.parser, '\t\t#AA BB CC \t end \t'), 'AA BB CC end')
self.assertEqual(
self._get_comment(self.parser, '\t## comment ## comment'), '# comment ## comment'
)
def test_label_parser(self):
self.assertEqual(self._get_label(self.parser, 'main:').name, 'main')
self.assertEqual(self._get_label(self.parser, '..B1.10:').name, '..B1.10')
self.assertEqual(self._get_label(self.parser, '.2.3_2_pack.3:').name, '.2.3_2_pack.3')
self.assertEqual(self._get_label(self.parser, '.L1:\t\t\t#label1').name, '.L1')
self.assertEqual(
' '.join(self._get_label(self.parser, '.L1:\t\t\t#label1').comment), 'label1'
)
with self.assertRaises(ParseException):
self._get_label(self.parser, '\t.cfi_startproc')
def test_directive_parser(self):
self.assertEqual(self._get_directive(self.parser, '\t.text').name, 'text')
self.assertEqual(len(self._get_directive(self.parser, '\t.text').parameters), 0)
self.assertEqual(self._get_directive(self.parser, '\t.align\t16,0x90').name, 'align')
self.assertEqual(len(self._get_directive(self.parser, '\t.align\t16,0x90').parameters), 2)
self.assertEqual(
self._get_directive(self.parser, '\t.align\t16,0x90').parameters[1], '0x90'
)
self.assertEqual(
self._get_directive(self.parser, ' .byte 100,103,144 #IACA START')[
'name'
],
'byte',
)
self.assertEqual(
self._get_directive(self.parser, ' .byte 100,103,144 #IACA START')[
'parameters'
][2],
'144',
)
self.assertEqual(
' '.join(
self._get_directive(self.parser, ' .byte 100,103,144 #IACA START')[
'comment'
]
),
'IACA START',
)
def test_parse_instruction(self):
instr1 = '\t\tvcvtsi2ss %edx, %xmm2, %xmm2\t\t\t#12.27'
instr2 = 'jb ..B1.4 \t'
instr3 = ' movl $222,%ebx #IACA END'
instr4 = 'vmovss %xmm4, -4(%rsp,%rax,8) #12.9'
instr5 = 'mov %ebx,var(,1)'
instr6 = 'lea (,%rax,8),%rbx'
instr7 = 'vinsertf128 $0x1, %xmm0, %ymm1, %ymm1'
parsed_1 = self.parser.parse_instruction(instr1)
parsed_2 = self.parser.parse_instruction(instr2)
parsed_3 = self.parser.parse_instruction(instr3)
parsed_4 = self.parser.parse_instruction(instr4)
parsed_5 = self.parser.parse_instruction(instr5)
parsed_6 = self.parser.parse_instruction(instr6)
parsed_7 = self.parser.parse_instruction(instr7)
self.assertEqual(parsed_1.instruction, 'vcvtsi2ss')
self.assertEqual(parsed_1.operands[0].register.name, 'edx')
self.assertEqual(parsed_1.operands[1].register.name, 'xmm2')
self.assertEqual(parsed_1.comment, '12.27')
self.assertEqual(parsed_2.instruction, 'jb')
self.assertEqual(parsed_2.operands[0].identifier.name, '..B1.4')
self.assertEqual(len(parsed_2.operands), 1)
self.assertIsNone(parsed_2.comment)
self.assertEqual(parsed_3.instruction, 'movl')
self.assertEqual(parsed_3.operands[0].immediate.value, '222')
self.assertEqual(parsed_3.operands[1].register.name, 'ebx')
self.assertEqual(parsed_3.comment, 'IACA END')
self.assertEqual(parsed_4.instruction, 'vmovss')
self.assertEqual(parsed_4.operands[1].memory.offset.value, '-4')
self.assertEqual(parsed_4.operands[1].memory.base.name, 'rsp')
self.assertEqual(parsed_4.operands[1].memory.index.name, 'rax')
self.assertEqual(parsed_4.operands[1].memory.scale, 8)
self.assertEqual(parsed_4.operands[0].register.name, 'xmm4')
self.assertEqual(parsed_4.comment, '12.9')
self.assertEqual(parsed_5.instruction, 'mov')
self.assertEqual(parsed_5.operands[1].memory.offset.identifier.name, 'var')
self.assertIsNone(parsed_5.operands[1].memory.base)
self.assertIsNone(parsed_5.operands[1].memory.index)
self.assertEqual(parsed_5.operands[1].memory.scale, 1)
self.assertEqual(parsed_5.operands[0].register.name, 'ebx')
self.assertEqual(parsed_6.instruction, 'lea')
self.assertIsNone(parsed_6.operands[0].memory.offset)
self.assertIsNone(parsed_6.operands[0].memory.base)
self.assertEqual(parsed_6.operands[0].memory.index.name, 'rax')
self.assertEqual(parsed_6.operands[0].memory.scale, 8)
self.assertEqual(parsed_6.operands[1].register.name, 'rbx')
self.assertEqual(parsed_7.operands[0].immediate.value, '0x1')
self.assertEqual(parsed_7.operands[1].register.name, 'xmm0')
self.assertEqual(parsed_7.operands[2].register.name, 'ymm1')
self.assertEqual(parsed_7.operands[3].register.name, 'ymm1')
def test_parse_line(self):
line_comment = '# -- Begin main'
line_label = '..B1.7: # Preds ..B1.6'
line_directive = '\t\t.quad .2.3_2__kmpc_loc_pack.2 #qed'
line_instruction = '\t\tlea 2(%rax,%rax), %ecx #12.9'
instruction_form_1 = {
'instruction': None,
'operands': None,
'directive': None,
'comment': '-- Begin main',
'label': None,
'line': '# -- Begin main',
'line_number': 1,
}
instruction_form_2 = {
'instruction': None,
'operands': None,
'directive': None,
'comment': 'Preds ..B1.6',
'label': '..B1.7',
'line': '..B1.7: # Preds ..B1.6',
'line_number': 2,
}
instruction_form_3 = {
'instruction': None,
'operands': None,
'directive': {'name': 'quad', 'parameters': ['.2.3_2__kmpc_loc_pack.2']},
'comment': 'qed',
'label': None,
'line': '.quad .2.3_2__kmpc_loc_pack.2 #qed',
'line_number': 3,
}
instruction_form_4 = {
'instruction': 'lea',
'operands': [
{
'memory': {
'offset': {'value': '2'},
'base': {'name': 'rax'},
'index': {'name': 'rax'},
'scale': 1,
}
},
{
'register': {'name': 'ecx'}
}
],
'directive': None,
'comment': '12.9',
'label': None,
'line': 'lea 2(%rax,%rax), %ecx #12.9',
'line_number': 4,
}
parsed_1 = self.parser.parse_line(line_comment, 1)
parsed_2 = self.parser.parse_line(line_label, 2)
parsed_3 = self.parser.parse_line(line_directive, 3)
parsed_4 = self.parser.parse_line(line_instruction, 4)
self.assertEqual(parsed_1, instruction_form_1)
self.assertEqual(parsed_2, instruction_form_2)
self.assertEqual(parsed_3, instruction_form_3)
self.assertEqual(parsed_4, instruction_form_4)
def test_parse_file(self):
parsed = self.parser.parse_file(self.triad_code)
self.assertEqual(parsed[0].line_number, 1)
self.assertEqual(len(parsed), 353)
def test_parse_register(self):
register_str_1 = '%rax'
register_str_2 = '%r9'
register_str_3 = '%xmm1'
register_str_4 = '%rip'
parsed_reg_1 = {'register': {'name': 'rax'}}
parsed_reg_2 = {'register': {'name': 'r9'}}
parsed_reg_3 = {'register': {'name': 'xmm1'}}
parsed_reg_4 = {'register': {'name': 'rip'}}
self.assertEqual(self.parser.parse_register(register_str_1), parsed_reg_1)
self.assertEqual(self.parser.parse_register(register_str_2), parsed_reg_2)
self.assertEqual(self.parser.parse_register(register_str_3), parsed_reg_3)
self.assertEqual(self.parser.parse_register(register_str_4), parsed_reg_4)
self.assertIsNone(self.parser.parse_register('rax'))
def test_normalize_imd(self):
imd_decimal_1 = {'value': '79'}
imd_hex_1 = {'value': '0x4f'}
imd_decimal_2 = {'value': '8'}
imd_hex_2 = {'value': '0x8'}
self.assertEqual(
self.parser.normalize_imd(imd_decimal_1), self.parser.normalize_imd(imd_hex_1)
)
self.assertEqual(
self.parser.normalize_imd(imd_decimal_2), self.parser.normalize_imd(imd_hex_2)
)
def test_reg_dependency(self):
reg_a1 = AttrDict({'name': 'rax'})
reg_a2 = AttrDict({'name': 'eax'})
reg_a3 = AttrDict({'name': 'ax'})
reg_a4 = AttrDict({'name': 'al'})
reg_r11 = AttrDict({'name': 'r11'})
reg_r11b = AttrDict({'name': 'r11b'})
reg_r11d = AttrDict({'name': 'r11d'})
reg_r11w = AttrDict({'name': 'r11w'})
reg_xmm1 = AttrDict({'name': 'xmm1'})
reg_ymm1 = AttrDict({'name': 'ymm1'})
reg_zmm1 = AttrDict({'name': 'zmm1'})
reg_b1 = AttrDict({'name': 'rbx'})
reg_r15 = AttrDict({'name': 'r15'})
reg_xmm2 = AttrDict({'name': 'xmm2'})
reg_ymm3 = AttrDict({'name': 'ymm3'})
reg_a = [reg_a1, reg_a2, reg_a3, reg_a4]
reg_r = [reg_r11, reg_r11b, reg_r11d, reg_r11w]
reg_vec_1 = [reg_xmm1, reg_ymm1, reg_zmm1]
reg_others = [reg_b1, reg_r15, reg_xmm2, reg_ymm3]
regs = reg_a + reg_r + reg_vec_1 + reg_others
# test each register against each other
for ri in reg_a:
for rj in regs:
assert_value = True if rj in reg_a else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_r:
for rj in regs:
assert_value = True if rj in reg_r else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_vec_1:
for rj in regs:
assert_value = True if rj in reg_vec_1 else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
for ri in reg_others:
for rj in regs:
assert_value = True if rj == ri else False
with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
##################
# Helper functions
##################
def _get_comment(self, parser, comment):
return ' '.join(
AttrDict.convert_dict(
parser.process_operand(parser.comment.parseString(comment, parseAll=True).asDict())
).comment
)
def _get_label(self, parser, label):
return AttrDict.convert_dict(
parser.process_operand(parser.label.parseString(label, parseAll=True).asDict())
).label
def _get_directive(self, parser, directive):
return AttrDict.convert_dict(
parser.process_operand(parser.directive.parseString(directive, parseAll=True).asDict())
).directive
@staticmethod
def _find_file(name):
testdir = os.path.dirname(__file__)
name = os.path.join(testdir, 'test_files', name)
assert os.path.exists(name)
return name
if __name__ == '__main__':
    # Run only this module's test case, printing one line per test.
    x86_suite = unittest.TestLoader().loadTestsFromTestCase(TestParserX86ATT)
    verbose_runner = unittest.TextTestRunner(verbosity=2)
    verbose_runner.run(x86_suite)

359
tests/test_semantics.py Executable file
View File

@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Unit tests for Semantic Analysis
"""
import os
import unittest
from subprocess import call
import networkx as nx
from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
SemanticsAppender)
class TestSemanticTools(unittest.TestCase):
# 'osaca/data/' below the parent of the directory that contains this test file.
MODULE_DATA_DIR = os.path.join(
os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
)
# '.osaca/' inside the current user's home directory.
USER_DATA_DIR = os.path.join(os.path.expanduser('~'), '.osaca/')
@classmethod
def setUpClass(self):
# Shared fixture for all tests of this class: copies the data files, parses the
# x86 and AArch64 test kernels, loads the machine models and runs the semantic
# assignment once on every parsed instruction form.
# NOTE: this is a classmethod, so `self` is the class itself and every
# attribute set below is a class attribute.
# copy db files in user directory
if not os.path.isdir(os.path.join(self.USER_DATA_DIR, 'data')):
os.makedirs(os.path.join(self.USER_DATA_DIR, 'data'))
# shells out to `cp -r`, i.e. this needs a `cp` executable on the PATH
call(['cp', '-r', self.MODULE_DATA_DIR, self.USER_DATA_DIR])
# set up parser and kernels
self.parser_x86 = ParserX86ATT()
self.parser_AArch64 = ParserAArch64v81()
with open(self._find_file('kernel-x86.s')) as f:
self.code_x86 = f.read()
with open(self._find_file('kernel-AArch64.s')) as f:
self.code_AArch64 = f.read()
self.kernel_x86 = self.parser_x86.parse_file(self.code_x86)
self.kernel_AArch64 = self.parser_AArch64.parse_file(self.code_AArch64)
# set up machine models
# csx is paired with the x86 ISA description, tx2 with the AArch64 one
self.machine_model_csx = MachineModel(
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
)
self.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml')
)
self.semantics_csx = SemanticsAppender(
self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/x86.yml')
)
self.semantics_tx2 = SemanticsAppender(
self.machine_model_tx2,
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/aarch64.yml'),
)
# this model is created from an architecture name instead of a YAML path
self.machine_model_zen = MachineModel(arch='zen1')
# run both assignment passes on every instruction form; the tests below check
# the keys they are expected to leave behind (source/destination/src_dst,
# throughput/latency/port_pressure)
for i in range(len(self.kernel_x86)):
self.semantics_csx.assign_src_dst(self.kernel_x86[i])
self.semantics_csx.assign_tp_lt(self.kernel_x86[i])
for i in range(len(self.kernel_AArch64)):
self.semantics_tx2.assign_src_dst(self.kernel_AArch64[i])
self.semantics_tx2.assign_tp_lt(self.kernel_AArch64[i])
###########
# Tests
###########
def test_creation_by_name(self):
    """A machine model created via ``arch='CSX'`` must be usable without ``ValueError``."""
    try:
        model_by_name = MachineModel(arch='CSX')
        SemanticsAppender(model_by_name)
    except ValueError:
        # creation by architecture name is expected to work, so this is a failure
        self.fail()
def test_src_dst_assignment_x86(self):
for instruction_form in self.kernel_x86:
with self.subTest(instruction_form=instruction_form):
if instruction_form['operands'] is not None:
self.assertTrue('source' in instruction_form['operands'])
self.assertTrue('destination' in instruction_form['operands'])
self.assertTrue('src_dst' in instruction_form['operands'])
def test_src_dst_assignment_AArch64(self):
for instruction_form in self.kernel_AArch64:
with self.subTest(instruction_form=instruction_form):
if instruction_form['operands'] is not None:
self.assertTrue('source' in instruction_form['operands'])
self.assertTrue('destination' in instruction_form['operands'])
self.assertTrue('src_dst' in instruction_form['operands'])
def test_tp_lt_assignment_x86(self):
self.assertTrue('ports' in self.machine_model_csx)
port_num = len(self.machine_model_csx['ports'])
for instruction_form in self.kernel_x86:
with self.subTest(instruction_form=instruction_form):
self.assertTrue('throughput' in instruction_form)
self.assertTrue('latency' in instruction_form)
self.assertIsInstance(instruction_form['port_pressure'], list)
self.assertEqual(len(instruction_form['port_pressure']), port_num)
def test_tp_lt_assignment_AArch64(self):
self.assertTrue('ports' in self.machine_model_tx2)
port_num = len(self.machine_model_tx2['ports'])
for instruction_form in self.kernel_AArch64:
with self.subTest(instruction_form=instruction_form):
self.assertTrue('throughput' in instruction_form)
self.assertTrue('latency' in instruction_form)
self.assertIsInstance(instruction_form['port_pressure'], list)
self.assertEqual(len(instruction_form['port_pressure']), port_num)
def test_kernelDG_x86(self):
    """Check the dependency graph built for the x86 kernel.

    Dependencies asserted below (by line number):

        5
         \\___>7__>8
         /
        4
        6_______>10

    Lines 8 and 9 must have no dependent instruction forms, and calling
    ``get_dependent_instruction_forms`` without arguments must raise ``ValueError``.
    """
    dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
    self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
    # lines with exactly one dependent instruction form: (line, dependent line)
    for line_no, dependent in ((4, 7), (5, 7), (6, 10), (7, 8)):
        self.assertEqual(
            len(list(dg.get_dependent_instruction_forms(line_number=line_no))), 1
        )
        self.assertEqual(
            next(dg.get_dependent_instruction_forms(line_number=line_no)), dependent
        )
    # lines without any dependent instruction form
    for line_no in (8, 9):
        self.assertEqual(
            len(list(dg.get_dependent_instruction_forms(line_number=line_no))), 0
        )
    with self.assertRaises(ValueError):
        dg.get_dependent_instruction_forms()
def test_kernelDG_AArch64(self):
    """Check the dependency graph built for the AArch64 kernel line by line.

    The graph must be acyclic, lines 4-21 must have the dependents listed
    below, and calling ``get_dependent_instruction_forms`` without arguments
    must raise ``ValueError``.
    """
    dg = KernelDG(self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2)
    self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))

    def dependents(line_no):
        # fresh iterator over the instruction forms depending on `line_no`
        return dg.get_dependent_instruction_forms(line_number=line_no)

    self.assertEqual(set(dependents(4)), {8, 9})
    self.assertEqual(set(dependents(5)), {10, 11})
    self.assertEqual(set(dependents(6)), {7, 8, 9})
    self.assertEqual(set(dependents(7)), {10, 11})
    self.assertEqual(next(dependents(8)), 14)
    self.assertEqual(next(dependents(9)), 15)
    self.assertEqual(next(dependents(10)), 17)
    self.assertEqual(next(dependents(11)), 18)
    self.assertEqual(set(dependents(12)), {14, 15})
    self.assertEqual(set(dependents(13)), {17, 18})
    self.assertEqual(next(dependents(14)), 16)
    self.assertEqual(next(dependents(15)), 16)
    self.assertEqual(len(list(dependents(16))), 0)
    self.assertEqual(next(dependents(17)), 19)
    self.assertEqual(next(dependents(18)), 19)
    self.assertEqual(len(list(dependents(19))), 0)
    self.assertEqual(len(list(dependents(20))), 0)
    self.assertEqual(len(list(dependents(21))), 0)
    with self.assertRaises(ValueError):
        dg.get_dependent_instruction_forms()
def test_hidden_load(self):
    """Count instruction forms flagged ``HIDDEN_LD`` for three x86 kernel slices.

    Uses the machine model from ``hidden_load_machine_model.yml``. Expected:
    one hidden load in the full kernel, none in its last three lines, one in
    lines ``[5:8]``.
    """
    machine_model_hld = MachineModel(
        path_to_yaml=self._find_file('hidden_load_machine_model.yml')
    )
    self.assertTrue(machine_model_hld.has_hidden_loads())
    semantics_hld = SemanticsAppender(machine_model_hld)
    # each kernel is parsed separately so every slice has its own instruction forms
    # (the former extra full parse assigned to kernel_hld_2 was immediately
    # overwritten and has been dropped)
    kernel_hld = self.parser_x86.parse_file(self.code_x86)
    kernel_hld_2 = self.parser_x86.parse_file(self.code_x86)[-3:]
    kernel_hld_3 = self.parser_x86.parse_file(self.code_x86)[5:8]

    semantics_hld.add_semantics(kernel_hld)
    semantics_hld.add_semantics(kernel_hld_2)
    semantics_hld.add_semantics(kernel_hld_3)

    num_hidden_loads = len([x for x in kernel_hld if INSTR_FLAGS.HIDDEN_LD in x['flags']])
    num_hidden_loads_2 = len([x for x in kernel_hld_2 if INSTR_FLAGS.HIDDEN_LD in x['flags']])
    num_hidden_loads_3 = len([x for x in kernel_hld_3 if INSTR_FLAGS.HIDDEN_LD in x['flags']])
    self.assertEqual(num_hidden_loads, 1)
    self.assertEqual(num_hidden_loads_2, 0)
    self.assertEqual(num_hidden_loads_3, 1)
def test_cyclic_dag(self):
    """A graph with a cycle must make both path analyses raise ``NotImplementedError``."""
    dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
    # close a cycle 100 -> 101 -> 102 -> 100
    cycle_edges = ((100, 101, 1.0), (101, 102, 2.0), (102, 100, 3.0))
    for src, dst, edge_latency in cycle_edges:
        dg.dg.add_edge(src, dst, latency=edge_latency)
    with self.assertRaises(NotImplementedError):
        dg.get_critical_path()
    with self.assertRaises(NotImplementedError):
        dg.get_loopcarried_dependencies()
def test_loop_carried_dependency_x86(self):
    """Check the two loop-carried dependencies expected for the x86 kernel.

    For IDs 9 and 6 the root and the single dependency must both be the
    instruction form stored on the graph node with that ID.
    """
    dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
    lc_deps = dg.get_loopcarried_dependencies()
    self.assertEqual(len(lc_deps), 2)
    # checked in the original order: ID 9 first, then ID 6
    for node_id in (9, 6):
        self.assertEqual(
            lc_deps[node_id]['root'], dg.dg.nodes(data=True)[node_id]['instruction_form']
        )
        self.assertEqual(len(lc_deps[node_id]['dependencies']), 1)
        self.assertEqual(
            lc_deps[node_id]['dependencies'][0],
            dg.dg.nodes(data=True)[node_id]['instruction_form'],
        )
def test_is_read_is_written_x86(self):
    """Check ``is_read`` / ``is_written`` for ``rcx`` and ``ymm1`` on single x86 lines."""
    # independent from HW model
    dag = KernelDG(self.kernel_x86, self.parser_x86, None)
    reg_rcx = AttrDict({'name': 'rcx'})
    reg_ymm1 = AttrDict({'name': 'ymm1'})

    def analysed(asm_line):
        # parse one line and attach its source/destination operand assignment
        form = self.parser_x86.parse_line(asm_line)
        self.semantics_csx.assign_src_dst(form)
        return form

    instr_form_r_c = analysed('vmovsd %xmm0, (%r15,%rcx,8)')
    instr_form_non_r_c = analysed('movl %xmm0, (%r15,%rax,8)')
    instr_form_w_c = analysed('movi $0x05ACA, %rcx')
    instr_form_rw_ymm_1 = analysed('vinsertf128 $0x1, %xmm1, %ymm0, %ymm1')
    instr_form_rw_ymm_2 = analysed('vinsertf128 $0x1, %xmm0, %ymm1, %ymm1')
    instr_form_r_ymm = analysed('vmovapd %ymm1, %ymm0')

    # rcx
    self.assertTrue(dag.is_read(reg_rcx, instr_form_r_c))
    self.assertFalse(dag.is_read(reg_rcx, instr_form_non_r_c))
    self.assertFalse(dag.is_read(reg_rcx, instr_form_w_c))
    self.assertTrue(dag.is_written(reg_rcx, instr_form_w_c))
    self.assertFalse(dag.is_written(reg_rcx, instr_form_r_c))
    # ymm1
    self.assertTrue(dag.is_read(reg_ymm1, instr_form_rw_ymm_1))
    self.assertTrue(dag.is_read(reg_ymm1, instr_form_rw_ymm_2))
    self.assertTrue(dag.is_read(reg_ymm1, instr_form_r_ymm))
    self.assertTrue(dag.is_written(reg_ymm1, instr_form_rw_ymm_1))
    self.assertTrue(dag.is_written(reg_ymm1, instr_form_rw_ymm_2))
    self.assertFalse(dag.is_written(reg_ymm1, instr_form_r_ymm))
def test_is_read_is_written_AArch64(self):
    """Exercise ``KernelDG.is_read`` and ``KernelDG.is_written`` on AArch64 lines.

    Two register groups, all with name ``'1'``, are queried against the same
    set of instruction forms: ``regs`` (prefixes ``d``, ``q``, ``v``) and
    ``regs_gp`` (prefixes ``w``, ``x``). Every expectation for ``regs`` is
    mirrored by the opposite expectation for ``regs_gp``.
    """
    # independent from HW model
    dag = KernelDG(self.kernel_AArch64, self.parser_AArch64, None)
    reg_x1 = AttrDict({'prefix': 'x', 'name': '1'})
    reg_w1 = AttrDict({'prefix': 'w', 'name': '1'})
    reg_d1 = AttrDict({'prefix': 'd', 'name': '1'})
    reg_q1 = AttrDict({'prefix': 'q', 'name': '1'})
    reg_v1 = AttrDict({'prefix': 'v', 'name': '1', 'lanes': '2', 'shape': 'd'})
    regs = [reg_d1, reg_q1, reg_v1]
    regs_gp = [reg_w1, reg_x1]

    # Every line is parsed and then handed to assign_src_dst before querying.
    instr_form_r_1 = self.parser_AArch64.parse_line('stp q1, q3, [x12, #192]')
    self.semantics_tx2.assign_src_dst(instr_form_r_1)
    instr_form_r_2 = self.parser_AArch64.parse_line('fadd v2.2d, v1.2d, v0.2d')
    self.semantics_tx2.assign_src_dst(instr_form_r_2)
    instr_form_w_1 = self.parser_AArch64.parse_line('ldr d1, [x1, #:got_lo12:q2c]')
    self.semantics_tx2.assign_src_dst(instr_form_w_1)
    instr_form_non_w_1 = self.parser_AArch64.parse_line('ldr x1, [x1, #:got_lo12:q2c]')
    self.semantics_tx2.assign_src_dst(instr_form_non_w_1)
    instr_form_rw_1 = self.parser_AArch64.parse_line('fmul v1.2d, v1.2d, v0.2d')
    self.semantics_tx2.assign_src_dst(instr_form_rw_1)
    instr_form_rw_2 = self.parser_AArch64.parse_line('ldp q2, q4, [x1, #64]!')
    self.semantics_tx2.assign_src_dst(instr_form_rw_2)
    instr_form_rw_3 = self.parser_AArch64.parse_line('str x4, [x1], #64')
    self.semantics_tx2.assign_src_dst(instr_form_rw_3)
    instr_form_non_rw_1 = self.parser_AArch64.parse_line('adds x1, x11')
    self.semantics_tx2.assign_src_dst(instr_form_non_rw_1)

    # d1/q1/v1 group
    for reg in regs:
        with self.subTest(reg=reg):
            self.assertTrue(dag.is_read(reg, instr_form_r_1))
            self.assertTrue(dag.is_read(reg, instr_form_r_2))
            self.assertTrue(dag.is_read(reg, instr_form_rw_1))
            self.assertFalse(dag.is_read(reg, instr_form_rw_2))
            self.assertFalse(dag.is_read(reg, instr_form_rw_3))
            self.assertFalse(dag.is_read(reg, instr_form_w_1))
            self.assertTrue(dag.is_written(reg, instr_form_w_1))
            self.assertTrue(dag.is_written(reg, instr_form_rw_1))
            self.assertFalse(dag.is_written(reg, instr_form_non_w_1))
            self.assertFalse(dag.is_written(reg, instr_form_rw_2))
            self.assertFalse(dag.is_written(reg, instr_form_rw_3))
            # This check used to be repeated verbatim; once is sufficient.
            self.assertFalse(dag.is_written(reg, instr_form_non_rw_1))

    # w1/x1 group: the mirror image of the expectations above
    for reg in regs_gp:
        with self.subTest(reg=reg):
            self.assertFalse(dag.is_read(reg, instr_form_r_1))
            self.assertFalse(dag.is_read(reg, instr_form_r_2))
            self.assertFalse(dag.is_read(reg, instr_form_rw_1))
            self.assertTrue(dag.is_read(reg, instr_form_rw_2))
            self.assertTrue(dag.is_read(reg, instr_form_rw_3))
            self.assertTrue(dag.is_read(reg, instr_form_w_1))
            self.assertFalse(dag.is_written(reg, instr_form_w_1))
            self.assertFalse(dag.is_written(reg, instr_form_rw_1))
            self.assertTrue(dag.is_written(reg, instr_form_non_w_1))
            self.assertTrue(dag.is_written(reg, instr_form_rw_2))
            self.assertTrue(dag.is_written(reg, instr_form_rw_3))
            # This check used to be repeated verbatim; once is sufficient.
            self.assertTrue(dag.is_written(reg, instr_form_non_rw_1))
def test_invalid_MachineModel(self):
    """Constructing ``MachineModel`` with unusable arguments must raise.

    Expected failures:
    - no arguments at all                        -> ``ValueError``
    - both ``arch`` and ``path_to_yaml`` given   -> ``ValueError``
    - ``arch`` naming an unknown machine         -> ``FileNotFoundError``
    - ``path_to_yaml`` pointing to a missing file -> ``FileNotFoundError``
    """
    existing_yaml = os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
    missing_yaml = os.path.join(self.MODULE_DATA_DIR, 'THE_MACHINE.yml')

    # Callable form of assertRaises: the constructor is invoked by unittest.
    self.assertRaises(ValueError, MachineModel)
    self.assertRaises(ValueError, MachineModel, arch='CSX', path_to_yaml=existing_yaml)
    self.assertRaises(FileNotFoundError, MachineModel, arch='THE_MACHINE')
    self.assertRaises(FileNotFoundError, MachineModel, path_to_yaml=missing_yaml)
def test_MachineModel_getter(self):
    """Check the getter methods of the CSX and TX2 machine models.

    Covers ``get_instruction`` for an unknown mnemonic, ``get_arch``,
    ``get_ISA``, ``get_ports``, ``get_data_ports``, ``has_hidden_loads`` and
    the ``MachineModel.get_isa_for_arch`` lookup, including its error path.
    """
    # A single memory operand; only used to query a non-existent instruction.
    sample_operands = [
        {
            'memory': {
                'offset': None,
                'base': {'name': 'r12'},
                'index': {'name': 'rcx'},
                'scale': 8,
            }
        }
    ]
    # An unknown mnemonic yields None on both models.
    self.assertIsNone(self.machine_model_csx.get_instruction('GETRESULT', sample_operands))
    self.assertIsNone(self.machine_model_tx2.get_instruction('GETRESULT', sample_operands))

    self.assertEqual(self.machine_model_csx.get_arch(), 'csx')
    self.assertEqual(self.machine_model_tx2.get_arch(), 'tx2')

    self.assertEqual(self.machine_model_csx.get_ISA(), 'x86')
    self.assertEqual(self.machine_model_tx2.get_ISA(), 'aarch64')

    ports_csx = ['0', '0DV', '1', '2', '2D', '3', '3D', '4', '5', '6', '7']
    data_ports_csx = ['2D', '3D']
    self.assertEqual(self.machine_model_csx.get_ports(), ports_csx)
    self.assertEqual(self.machine_model_csx.get_data_ports(), data_ports_csx)

    self.assertFalse(self.machine_model_tx2.has_hidden_loads())

    # The arch name is matched regardless of letter case in these lookups.
    self.assertEqual(MachineModel.get_isa_for_arch('CSX'), 'x86')
    self.assertEqual(MachineModel.get_isa_for_arch('tX2'), 'aarch64')
    # The call itself must raise; wrapping it in assertIsNone (as before) was
    # contradictory, because no return value is ever expected here.
    with self.assertRaises(ValueError):
        MachineModel.get_isa_for_arch('THE_MACHINE')
##################
# Helper functions
##################
@staticmethod
def _find_file(name):
    """Return the path of ``name`` inside the ``test_files`` directory.

    The directory is looked up next to this test module. An ``AssertionError``
    is raised (via ``assert``) when the resulting path does not exist.
    """
    located = os.path.join(os.path.dirname(__file__), 'test_files', name)
    assert os.path.exists(located)
    return located
if __name__ == '__main__':
    # Direct execution: run only the TestSemanticTools case with verbose output.
    loader = unittest.TestLoader()
    cases = loader.loadTestsFromTestCase(TestSemanticTools)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(cases)

View File

@@ -1,653 +0,0 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 14
.globl _main ## -- Begin function main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $408, %rsp ## imm = 0x198
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
movq %rsi, %rbx
movq 16(%rsi), %rdi
callq _atoi
movl %eax, %r14d
movq 24(%rbx), %rdi
callq _atoi
## kill: def $eax killed $eax def $rax
movq %r14, -96(%rbp) ## 8-byte Spill
movl %r14d, %ecx
imull %r14d, %ecx
movl %ecx, -88(%rbp) ## 4-byte Spill
movq %rax, -72(%rbp) ## 8-byte Spill
imull %eax, %ecx
movslq %ecx, %r13
shlq $3, %r13
leaq -56(%rbp), %rdi
movl $32, %esi
movq %r13, %rdx
callq _posix_memalign
testl %eax, %eax
je LBB0_2
## %bb.1:
movq $0, -56(%rbp)
xorl %ebx, %ebx
jmp LBB0_3
LBB0_2:
movq -56(%rbp), %rbx
LBB0_3:
leaq -56(%rbp), %rdi
movl $32, %esi
movq %r13, %rdx
callq _posix_memalign
testl %eax, %eax
je LBB0_5
## %bb.4:
movq $0, -56(%rbp)
xorl %eax, %eax
jmp LBB0_6
LBB0_5:
movq -56(%rbp), %rax
LBB0_6:
movq %rax, -80(%rbp) ## 8-byte Spill
movq -96(%rbp), %r9 ## 8-byte Reload
movabsq $4602641980904887326, %rax ## imm = 0x3FDFDE7EEC22D41E
movq %rax, -56(%rbp)
cmpl $3, -72(%rbp) ## 4-byte Folded Reload
jl LBB0_15
## %bb.7:
movabsq $4294967296, %r12 ## imm = 0x100000000
leal -1(%r9), %ecx
movslq %r9d, %rax
movslq -88(%rbp), %rdx ## 4-byte Folded Reload
movq %rdx, -160(%rbp) ## 8-byte Spill
movq -72(%rbp), %rsi ## 8-byte Reload
leal -1(%rsi), %edx
leaq 8(%rbx,%rax,8), %rsi
movq %rsi, -152(%rbp) ## 8-byte Spill
movq -80(%rbp), %rsi ## 8-byte Reload
leaq 8(%rsi,%rax,8), %rsi
movq %rsi, -144(%rbp) ## 8-byte Spill
leaq (,%rax,8), %rsi
movq %rsi, -104(%rbp) ## 8-byte Spill
leaq 2(%rax), %rsi
movq %rsi, -136(%rbp) ## 8-byte Spill
shlq $32, %rax
movq %rax, -184(%rbp) ## 8-byte Spill
addq $-1, %rcx
movl %r9d, %eax
movq %rax, -176(%rbp) ## 8-byte Spill
movl $1, %eax
movabsq $4601149042440805838, %rdi ## imm = 0x3FDA90AD19501DCE
movq %rdx, -208(%rbp) ## 8-byte Spill
.p2align 4, 0x90
LBB0_8: ## =>This Loop Header: Depth=1
## Child Loop BB0_10 Depth 2
## Child Loop BB0_11 Depth 3
cmpl $2, %r9d
jle LBB0_14
## %bb.9: ## in Loop: Header=BB0_8 Depth=1
movl %eax, %r14d
imull -88(%rbp), %r14d ## 4-byte Folded Reload
leaq 1(%rax), %r8
movq -160(%rbp), %rdx ## 8-byte Reload
movq %rdx, %rsi
movq %r8, -168(%rbp) ## 8-byte Spill
imulq %r8, %rsi
movq -152(%rbp), %r10 ## 8-byte Reload
leaq (%r10,%rsi,8), %r8
leaq -1(%rax), %rsi
imulq %rdx, %rsi
leaq (%r10,%rsi,8), %r10
movq %rax, %rsi
imulq %rdx, %rsi
movq -144(%rbp), %rdx ## 8-byte Reload
leaq (%rdx,%rsi,8), %r11
addl -136(%rbp), %esi ## 4-byte Folded Reload
shlq $32, %rsi
movl %r9d, %r15d
imull %eax, %r15d
leal 2(%r15), %r13d
imull %r9d, %r13d
addl $1, %r13d
addq $1, %r14
addl $1, %r15d
imull %r9d, %r15d
movl $1, %eax
.p2align 4, 0x90
LBB0_10: ## Parent Loop BB0_8 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_11 Depth 3
movq %rax, -112(%rbp) ## 8-byte Spill
leaq 1(%rax), %rax
movq %rax, -192(%rbp) ## 8-byte Spill
movq %rsi, -120(%rbp) ## 8-byte Spill
xorl %edx, %edx
.p2align 4, 0x90
LBB0_11: ## Parent Loop BB0_8 Depth=1
## Parent Loop BB0_10 Depth=2
## => This Inner Loop Header: Depth=3
movq %rdi, (%r11,%rdx,8)
leal (%r15,%rdx), %r9d
movslq %r9d, %rax
movq %rdi, (%rbx,%rax,8)
movq %rsi, %rax
sarq $29, %rax
movq %rdi, (%rbx,%rax)
leal (%r14,%rdx), %eax
cltq
movq %rdi, (%rbx,%rax,8)
leal (%r13,%rdx), %eax
cltq
movq %rdi, (%rbx,%rax,8)
movq %rdi, (%r10,%rdx,8)
movq %rdi, (%r8,%rdx,8)
addq $1, %rdx
addq %r12, %rsi
cmpq %rdx, %rcx
jne LBB0_11
## %bb.12: ## in Loop: Header=BB0_10 Depth=2
movq -104(%rbp), %rax ## 8-byte Reload
addq %rax, %r8
addq %rax, %r10
addq %rax, %r11
movq -120(%rbp), %rsi ## 8-byte Reload
addq -184(%rbp), %rsi ## 8-byte Folded Reload
movq -176(%rbp), %rax ## 8-byte Reload
addq %rax, %r13
addq %rax, %r14
addq %rax, %r15
cmpq %rdx, -112(%rbp) ## 8-byte Folded Reload
movq -192(%rbp), %rax ## 8-byte Reload
jne LBB0_10
## %bb.13: ## in Loop: Header=BB0_8 Depth=1
movq -168(%rbp), %rsi ## 8-byte Reload
movq %rsi, %rax
movq -96(%rbp), %r9 ## 8-byte Reload
movq -208(%rbp), %rdx ## 8-byte Reload
cmpq %rdx, %rsi
jne LBB0_8
jmp LBB0_15
.p2align 4, 0x90
LBB0_14: ## in Loop: Header=BB0_8 Depth=1
addq $1, %rax
movq %rax, %rsi
cmpq %rdx, %rsi
jne LBB0_8
LBB0_15:
movq _var_false@GOTPCREL(%rip), %rax
cmpl $0, (%rax)
je LBB0_17
## %bb.16:
movq %rbx, %rdi
callq _dummy
movq -80(%rbp), %rdi ## 8-byte Reload
callq _dummy
leaq -56(%rbp), %rdi
callq _dummy
movq -96(%rbp), %r9 ## 8-byte Reload
LBB0_17:
cmpl $3, -72(%rbp) ## 4-byte Folded Reload
jl LBB0_59
## %bb.18:
movabsq $4294967296, %r14 ## imm = 0x100000000
leal -1(%r9), %ecx
movslq %r9d, %rsi
movslq -88(%rbp), %rax ## 4-byte Folded Reload
movq %rax, -312(%rbp) ## 8-byte Spill
movq -72(%rbp), %rax ## 8-byte Reload
addl $-1, %eax
movq %rax, -72(%rbp) ## 8-byte Spill
leaq -1(%rcx), %rax
leaq -2(%rcx), %rdi
movq %rdi, -424(%rbp) ## 8-byte Spill
leaq 1(%rsi), %rdi
movq %rdi, -224(%rbp) ## 8-byte Spill
leaq (%rsi,%rcx), %rdi
movq %rdi, -304(%rbp) ## 8-byte Spill
movl %r9d, %edi
movq %rdi, -256(%rbp) ## 8-byte Spill
movq %rcx, -264(%rbp) ## 8-byte Spill
leaq (%rbx,%rcx,8), %rcx
addq $-8, %rcx
movq %rcx, -352(%rbp) ## 8-byte Spill
leal 6(%r9), %ecx
andl $7, %ecx
movq %rax, -448(%rbp) ## 8-byte Spill
movq %rcx, -344(%rbp) ## 8-byte Spill
subq %rcx, %rax
movq %rsi, %rcx
shlq $32, %rcx
movq %rcx, -440(%rbp) ## 8-byte Spill
leaq 1(%rax), %rcx
movq %rcx, -328(%rbp) ## 8-byte Spill
movq %rax, -336(%rbp) ## 8-byte Spill
leal 1(%rax), %eax
movl %eax, -212(%rbp) ## 4-byte Spill
leaq 2(%rsi), %rax
movq %rax, -296(%rbp) ## 8-byte Spill
movq -80(%rbp), %rax ## 8-byte Reload
leaq 8(%rax,%rsi,8), %rax
movq %rax, -288(%rbp) ## 8-byte Spill
leaq (,%rsi,8), %rax
movq %rax, -432(%rbp) ## 8-byte Spill
movq %rsi, -200(%rbp) ## 8-byte Spill
leaq (%rbx,%rsi,8), %rax
addq $8, %rax
movq %rax, -280(%rbp) ## 8-byte Spill
movl $1, %eax
.p2align 4, 0x90
LBB0_19: ## =>This Loop Header: Depth=1
## Child Loop BB0_52 Depth 2
## Child Loop BB0_37 Depth 3
## Child Loop BB0_55 Depth 3
cmpl $2, %r9d
jle LBB0_58
## %bb.20: ## in Loop: Header=BB0_19 Depth=1
movq %rax, %rcx
movq %rax, %r12
movq -312(%rbp), %r15 ## 8-byte Reload
imulq %r15, %r12
leaq 1(%rax), %rax
movl %r9d, %edi
imull %ecx, %edi
leal 1(%rdi), %r8d
imull %r9d, %r8d
addl $2, %edi
imull %r9d, %edi
movq %rax, -320(%rbp) ## 8-byte Spill
movq %rax, %r13
imulq %r15, %r13
movq -224(%rbp), %rdx ## 8-byte Reload
leaq (%rdx,%r13), %rax
movq %rax, -408(%rbp) ## 8-byte Spill
movq -304(%rbp), %rsi ## 8-byte Reload
leaq (%rsi,%r13), %rax
movq %rax, -400(%rbp) ## 8-byte Spill
addq $-1, %rcx
imulq %r15, %rcx
leaq (%rdx,%rcx), %rax
movq %rax, -392(%rbp) ## 8-byte Spill
leaq (%rsi,%rcx), %rax
movq %rax, -384(%rbp) ## 8-byte Spill
movq -296(%rbp), %rax ## 8-byte Reload
leal (%rax,%r12), %eax
shlq $32, %rax
movq %rax, -104(%rbp) ## 8-byte Spill
movq -280(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r13,8), %r10
leaq (%rax,%rcx,8), %r11
movl %r12d, %edx
addq $1, %rdx
movq -200(%rbp), %rax ## 8-byte Reload
addq %rax, %r13
movq %r13, -144(%rbp) ## 8-byte Spill
addq %rax, %rcx
movq %rcx, -152(%rbp) ## 8-byte Spill
leal 2(%r8), %eax
movq %rax, -240(%rbp) ## 8-byte Spill
leal 1(%r12), %eax
movq %rax, -416(%rbp) ## 8-byte Spill
movq %rdi, %rax
movq %rdi, -112(%rbp) ## 8-byte Spill
leal 1(%rdi), %r15d
movq -224(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r12), %rcx
leaq (%rsi,%r12), %rax
movq %rax, -368(%rbp) ## 8-byte Spill
movq -288(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r12,8), %rsi
leaq -8(%rax,%r12,8), %rax
movq %rax, -136(%rbp) ## 8-byte Spill
movq %r12, -120(%rbp) ## 8-byte Spill
leaq 1(%r12), %rax
movq %rax, -360(%rbp) ## 8-byte Spill
leal -1(%r8), %eax
movl %eax, -124(%rbp) ## 4-byte Spill
movq %rcx, -376(%rbp) ## 8-byte Spill
movq %rcx, -272(%rbp) ## 8-byte Spill
movq %r8, -248(%rbp) ## 8-byte Spill
movq %r8, %rdi
movq %r15, -232(%rbp) ## 8-byte Spill
movq %r15, %r8
xorl %r12d, %r12d
movl $1, %eax
jmp LBB0_52
.p2align 4, 0x90
LBB0_21: ## in Loop: Header=BB0_52 Depth=2
movl %r9d, %edx
imull %r12d, %edx
movq -248(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %ecx
movq -424(%rbp), %rax ## 8-byte Reload
leal (%rcx,%rax), %esi
cmpl %ecx, %esi
jl LBB0_53
## %bb.22: ## in Loop: Header=BB0_52 Depth=2
movq %rax, %rcx
shrq $32, %rcx
jne LBB0_53
## %bb.23: ## in Loop: Header=BB0_52 Depth=2
movq -240(%rbp), %rsi ## 8-byte Reload
leal (%rsi,%rdx), %esi
leal (%rsi,%rax), %edi
cmpl %esi, %edi
jl LBB0_53
## %bb.24: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.25: ## in Loop: Header=BB0_52 Depth=2
movq -416(%rbp), %rsi ## 8-byte Reload
leal (%rsi,%rdx), %esi
leal (%rsi,%rax), %edi
cmpl %esi, %edi
jl LBB0_53
## %bb.26: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.27: ## in Loop: Header=BB0_52 Depth=2
addl -232(%rbp), %edx ## 4-byte Folded Reload
leal (%rdx,%rax), %esi
cmpl %edx, %esi
jl LBB0_53
## %bb.28: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.29: ## in Loop: Header=BB0_52 Depth=2
movq -192(%rbp), %rdx ## 8-byte Reload
movq %rdx, %rsi
imulq -200(%rbp), %rsi ## 8-byte Folded Reload
movq -376(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %rdi
movq -368(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r13
movq -408(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r11
movq -400(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %rcx
movq -392(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r10
addq -384(%rbp), %rsi ## 8-byte Folded Reload
## kill: def $edx killed $edx killed $rdx def $rdx
imull -256(%rbp), %edx ## 4-byte Folded Reload
movq -232(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %r12d
movq -360(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %r9d
movq -240(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %eax
movl %eax, -60(%rbp) ## 4-byte Spill
addl -248(%rbp), %edx ## 4-byte Folded Reload
movq -80(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rdi,8), %rdi
leaq (%rbx,%rcx,8), %rcx
cmpq %rcx, %rdi
leaq (%rax,%r13,8), %rcx
leaq (%rbx,%r11,8), %r11
setb -45(%rbp) ## 1-byte Folded Spill
cmpq %rcx, %r11
leaq (%rbx,%r10,8), %r10
leaq (%rbx,%rsi,8), %r11
movslq %r12d, %rsi
setb -44(%rbp) ## 1-byte Folded Spill
cmpq %r11, %rdi
setb %r12b
cmpq %rcx, %r10
leaq (%rbx,%rsi,8), %r10
movq -352(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi,8), %rsi
movslq %r9d, %r9
setb -43(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r11b
cmpq %rcx, %r10
leaq (%rbx,%r9,8), %r10
leaq (%rax,%r9,8), %rsi
movslq -60(%rbp), %r9 ## 4-byte Folded Reload
setb -60(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r13b
cmpq %rcx, %r10
leaq (%rbx,%r9,8), %r10
leaq (%rax,%r9,8), %rsi
movslq %edx, %rdx
setb -42(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r9b
cmpq %rcx, %r10
leaq (%rax,%rdx,8), %rsi
setb -41(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
leaq (%rbx,%rdx,8), %rdx
setb %r10b
cmpq %rcx, %rdx
setb %dl
leaq -55(%rbp), %rax
cmpq %rdi, %rax
seta %dil
leaq -56(%rbp), %rax
cmpq %rcx, %rax
setb %al
movb -44(%rbp), %cl ## 1-byte Reload
testb %cl, -45(%rbp) ## 1-byte Folded Reload
jne LBB0_53
## %bb.30: ## in Loop: Header=BB0_52 Depth=2
andb -43(%rbp), %r12b ## 1-byte Folded Reload
jne LBB0_53
## %bb.31: ## in Loop: Header=BB0_52 Depth=2
andb -60(%rbp), %r11b ## 1-byte Folded Reload
jne LBB0_53
## %bb.32: ## in Loop: Header=BB0_52 Depth=2
andb -42(%rbp), %r13b ## 1-byte Folded Reload
jne LBB0_53
## %bb.33: ## in Loop: Header=BB0_52 Depth=2
andb -41(%rbp), %r9b ## 1-byte Folded Reload
jne LBB0_53
## %bb.34: ## in Loop: Header=BB0_52 Depth=2
movl $1, %r9d
andb %dl, %r10b
jne LBB0_54
## %bb.35: ## in Loop: Header=BB0_52 Depth=2
andb %al, %dil
jne LBB0_54
## %bb.36: ## in Loop: Header=BB0_52 Depth=2
vbroadcastsd -56(%rbp), %zmm0
movq -104(%rbp), %rdx ## 8-byte Reload
xorl %esi, %esi
movq -336(%rbp), %r9 ## 8-byte Reload
movabsq $34359738368, %rdi ## imm = 0x800000000
movq %rdi, %r10
movq -184(%rbp), %r11 ## 8-byte Reload
movq -176(%rbp), %r15 ## 8-byte Reload
movq -168(%rbp), %r12 ## 8-byte Reload
movq -88(%rbp), %rdi ## 8-byte Reload
movq -160(%rbp), %rax ## 8-byte Reload
.p2align 4, 0x90
## ---------------------------------------------------------------------------
## Marked kernel: vectorized innermost loop (LBB0_37).
## The region is delimited by two marker sequences: "movl $111, %ebx" and
## "movl $222, %ebx", each followed by the bytes 100, 103, 144
## (0x64 0x67 0x90).
## NOTE(review): the markers overwrite %ebx while the loop uses %rbx as a base
## address, so this marked listing is presumably meant for static analysis
## only and not for execution -- confirm before running it.
## ---------------------------------------------------------------------------
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
LBB0_37: ## Parent Loop BB0_19 Depth=1
## Parent Loop BB0_52 Depth=2
## => This Inner Loop Header: Depth=3
## Per iteration: one 512-bit load plus five memory-operand adds (six packed
## double vectors summed in %zmm1), one multiply by %zmm0 and one store.
## %zmm0 is the value broadcast from -56(%rbp) before the loop.
## 1st operand: index = (int32)(%rax + %rsi), sign-extended, scaled by 8.
leal (%rax,%rsi), %ecx
movslq %ecx, %rcx
vmovupd (%rbx,%rcx,8), %zmm1
## 2nd operand: %rdx appears to carry an element index in its upper 32 bits;
## the arithmetic shift by 29 then yields a sign-extended byte offset
## (index * 8), which is why no scale factor is used in the address.
movq %rdx, %rcx
sarq $29, %rcx
vaddpd (%rbx,%rcx), %zmm1, %zmm1
## 3rd operand: index = (int32)(%r12 + %rsi).
leal (%r12,%rsi), %ecx
movslq %ecx, %rcx
vaddpd (%rbx,%rcx,8), %zmm1, %zmm1
## 4th operand: index = (int32)(%r8 + %rsi).
leal (%r8,%rsi), %ecx
movslq %ecx, %rcx
vaddpd (%rbx,%rcx,8), %zmm1, %zmm1
## 5th and 6th operands are addressed directly through %r15 and %r11.
vaddpd (%r15,%rsi,8), %zmm1, %zmm1
vaddpd (%r11,%rsi,8), %zmm1, %zmm1
## Scale the sum and store it to the output stream addressed through %rdi.
vmulpd %zmm0, %zmm1, %zmm1
vmovupd %zmm1, (%rdi,%rsi,8)
## Advance by 8 elements; %r10 (set to 0x800000000 before the loop) advances
## the index kept in the upper half of %rdx by the same 8 elements.
addq $8, %rsi
addq %r10, %rdx
## Loop until the element counter %rsi reaches the trip count in %r9.
cmpq %rsi, %r9
jne LBB0_37
## End-of-kernel marker (222 variant of the sequence above).
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
## %bb.38: ## in Loop: Header=BB0_52 Depth=2
movq -328(%rbp), %r9 ## 8-byte Reload
movl -212(%rbp), %eax ## 4-byte Reload
movl %eax, %r15d
cmpl $0, -344(%rbp) ## 4-byte Folded Reload
jne LBB0_54
jmp LBB0_56
.p2align 4, 0x90
LBB0_52: ## Parent Loop BB0_19 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_37 Depth 3
## Child Loop BB0_55 Depth 3
movq %rdx, -168(%rbp) ## 8-byte Spill
addq $1, %rax
movl $1, %r15d
cmpq $8, -448(%rbp) ## 8-byte Folded Reload
movq %r10, -184(%rbp) ## 8-byte Spill
movq %r11, -176(%rbp) ## 8-byte Spill
movq %rsi, -88(%rbp) ## 8-byte Spill
movq %rdi, -160(%rbp) ## 8-byte Spill
movq %r12, -192(%rbp) ## 8-byte Spill
movq %rax, -208(%rbp) ## 8-byte Spill
jae LBB0_21
LBB0_53: ## in Loop: Header=BB0_52 Depth=2
movl $1, %r9d
LBB0_54: ## in Loop: Header=BB0_52 Depth=2
movq -136(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r9,8), %rdx
movq -144(%rbp), %rax ## 8-byte Reload
leaq (%r9,%rax), %rcx
leaq (%rbx,%rcx,8), %r11
movq -152(%rbp), %rax ## 8-byte Reload
leaq (%r9,%rax), %rcx
leaq (%rbx,%rcx,8), %r10
movq -272(%rbp), %rax ## 8-byte Reload
leal (%r9,%rax), %r12d
shlq $32, %r12
movq -264(%rbp), %r13 ## 8-byte Reload
subq %r9, %r13
movq -112(%rbp), %rax ## 8-byte Reload
leal (%r15,%rax), %esi
movq -120(%rbp), %rax ## 8-byte Reload
leal (%r15,%rax), %edi
addl -124(%rbp), %r15d ## 4-byte Folded Reload
xorl %ecx, %ecx
.p2align 4, 0x90
## Scalar form of the innermost loop: the same six-term sum and scaling as in
## LBB0_37, but on one double per iteration (vmovsd/vaddsd/vmulsd).
## It is entered through LBB0_53/LBB0_54.
## NOTE(review): it looks like this serves both as the fallback when the
## vector path is rejected and as the remainder loop after LBB0_37 -- confirm.
LBB0_55: ## Parent Loop BB0_19 Depth=1
## Parent Loop BB0_52 Depth=2
## => This Inner Loop Header: Depth=3
## 1st operand: index = (int32)(%r15 + %rcx), sign-extended by cltq.
leal (%r15,%rcx), %eax
cltq
vmovsd (%rbx,%rax,8), %xmm0 ## xmm0 = mem[0],zero
## 2nd operand: %r12 appears to hold an element index in its upper 32 bits;
## shifting right arithmetically by 29 gives the byte offset (index * 8).
movq %r12, %rax
sarq $29, %rax
vaddsd (%rbx,%rax), %xmm0, %xmm0
## 3rd operand: index = (int32)(%rdi + %rcx).
leal (%rdi,%rcx), %eax
cltq
vaddsd (%rbx,%rax,8), %xmm0, %xmm0
## 4th operand: index = (int32)(%rsi + %rcx).
leal (%rsi,%rcx), %eax
cltq
vaddsd (%rbx,%rax,8), %xmm0, %xmm0
## 5th and 6th operands are addressed directly through %r10 and %r11.
vaddsd (%r10,%rcx,8), %xmm0, %xmm0
vaddsd (%r11,%rcx,8), %xmm0, %xmm0
## Multiply by the scalar kept in the stack slot -56(%rbp) and store the result.
vmulsd -56(%rbp), %xmm0, %xmm0
vmovsd %xmm0, (%rdx,%rcx,8)
## Advance by one element; %r14 (set to 0x100000000 earlier) advances the
## index kept in the upper half of %r12 by one.
addq $1, %rcx
addq %r14, %r12
## Loop until the element counter %rcx reaches the trip count in %r13.
cmpq %rcx, %r13
jne LBB0_55
LBB0_56: ## in Loop: Header=BB0_52 Depth=2
movq -192(%rbp), %r12 ## 8-byte Reload
addq $1, %r12
movq -104(%rbp), %rax ## 8-byte Reload
addq -440(%rbp), %rax ## 8-byte Folded Reload
movq %rax, -104(%rbp) ## 8-byte Spill
movq -432(%rbp), %rcx ## 8-byte Reload
movq -88(%rbp), %rsi ## 8-byte Reload
addq %rcx, %rsi
movq -184(%rbp), %r10 ## 8-byte Reload
addq %rcx, %r10
movq -176(%rbp), %r11 ## 8-byte Reload
addq %rcx, %r11
movq -256(%rbp), %rax ## 8-byte Reload
addq %rax, %r8
movq -168(%rbp), %rdx ## 8-byte Reload
addq %rax, %rdx
movq -160(%rbp), %rdi ## 8-byte Reload
addq %rax, %rdi
addq %rcx, -136(%rbp) ## 8-byte Folded Spill
movq -200(%rbp), %rax ## 8-byte Reload
addq %rax, -144(%rbp) ## 8-byte Folded Spill
addq %rax, -152(%rbp) ## 8-byte Folded Spill
addq %rax, -272(%rbp) ## 8-byte Folded Spill
movq -96(%rbp), %r9 ## 8-byte Reload
movq -112(%rbp), %rax ## 8-byte Reload
addl %r9d, %eax
movq %rax, -112(%rbp) ## 8-byte Spill
movq -120(%rbp), %rax ## 8-byte Reload
addl %r9d, %eax
movq %rax, -120(%rbp) ## 8-byte Spill
addl %r9d, -124(%rbp) ## 4-byte Folded Spill
movq -208(%rbp), %rax ## 8-byte Reload
cmpq -264(%rbp), %rax ## 8-byte Folded Reload
jne LBB0_52
## %bb.57: ## in Loop: Header=BB0_19 Depth=1
movq -320(%rbp), %rcx ## 8-byte Reload
movq %rcx, %rax
cmpq -72(%rbp), %rcx ## 8-byte Folded Reload
jne LBB0_19
jmp LBB0_59
.p2align 4, 0x90
LBB0_58: ## in Loop: Header=BB0_19 Depth=1
movq %rax, %rcx
addq $1, %rcx
movq %rcx, %rax
cmpq -72(%rbp), %rcx ## 8-byte Folded Reload
jne LBB0_19
LBB0_59:
movq _var_false@GOTPCREL(%rip), %rax
cmpl $0, (%rax)
je LBB0_61
## %bb.60:
movq %rbx, %rdi
vzeroupper
callq _dummy
movq -80(%rbp), %rdi ## 8-byte Reload
callq _dummy
leaq -56(%rbp), %rdi
callq _dummy
LBB0_61:
xorl %eax, %eax
addq $408, %rsp ## imm = 0x198
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.cfi_endproc
## -- End function
.subsections_via_symbols

Binary file not shown.

View File

@@ -1,196 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
.file "taxCalc.c"
.text
..TXTST0:
# -- Begin main
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main(void)
main:
..B1.1: # Preds ..B1.0
.cfi_startproc
..___tag_value_main.1:
..L2:
#4.15
pushq %rbp #4.15
.cfi_def_cfa_offset 16
movq %rsp, %rbp #4.15
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #4.15
subq $4096, %rsp #4.15
movl $104446, %esi #4.15
movl $3, %edi #4.15
call __intel_new_feature_proc_init #4.15
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
vstmxcsr (%rsp) #4.15
movl $.2.3_2_kmpc_loc_struct_pack.3, %edi #4.15
xorl %esi, %esi #4.15
orl $32832, (%rsp) #4.15
xorl %eax, %eax #4.15
vldmxcsr (%rsp) #4.15
..___tag_value_main.6:
call __kmpc_begin #4.15
..___tag_value_main.7:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
movl $il0_peep_printf_format_0, %edi #5.5
call puts #5.5
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
vmovss .L_2il0floatpacket.0(%rip), %xmm0 #8.15
xorl %eax, %eax #11.5
vxorps %xmm1, %xmm1, %xmm1 #9.5
vmovss %xmm1, (%rsp) #9.5
# Marked kernel: loop ..B1.4, delimited by "movl $111,%ebx" / "movl $222,%ebx",
# each followed by the bytes 100,103,144 (0x64 0x67 0x90) -- the IACA
# start/end marker sequence.
movl $111,%ebx
.byte 100,103,144
..B1.4: # Preds ..B1.4 ..B1.3
# Two array elements are produced per iteration. %xmm0 holds the constant
# loaded from .L_2il0floatpacket.0 before the loop; %xmm1 carries the running
# value from one iteration to the next (it is zeroed before the loop).
# First element: xmm4 = xmm1 + xmm0 * (float)(2*rax + 1)
lea 1(%rax,%rax), %edx #12.9
vcvtsi2ss %edx, %xmm2, %xmm2 #12.27
vmulss %xmm2, %xmm0, %xmm3 #12.29
lea 2(%rax,%rax), %ecx #12.9
vaddss %xmm3, %xmm1, %xmm4 #12.29
# xmm1 is cleared and reused as conversion target; the running value now
# lives in xmm4.
vxorps %xmm1, %xmm1, %xmm1 #12.27
# Second element: xmm1 = xmm4 + xmm0 * (float)(2*rax + 2)
vcvtsi2ss %ecx, %xmm1, %xmm1 #12.27
vmulss %xmm1, %xmm0, %xmm5 #12.29
vmovss %xmm4, 4(%rsp,%rax,8) #12.9
vaddss %xmm5, %xmm4, %xmm1 #12.29
vmovss %xmm1, 8(%rsp,%rax,8) #12.9
# Counter starts at 0 (cleared before the loop); repeat while rax < 499
# (unsigned compare).
incq %rax #11.5
cmpq $499, %rax #11.5
jb ..B1.4 # Prob 99% #11.5
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
# End-of-kernel marker (222 variant of the sequence above).
movl $222,%ebx
.byte 100,103,144
..B1.5: # Preds ..B1.4
vmovss 3992(%rsp), %xmm0 #12.18
movl $il0_peep_printf_format_1, %edi #15.5
vaddss .L_2il0floatpacket.1(%rip), %xmm0, %xmm1 #12.29
vmovss %xmm1, 3996(%rsp) #12.9
call puts #15.5
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
movl $.2.3_2_kmpc_loc_struct_pack.14, %edi #16.12
xorl %eax, %eax #16.12
..___tag_value_main.8:
call __kmpc_end #16.12
..___tag_value_main.9:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
xorl %eax, %eax #16.12
movq %rbp, %rsp #16.12
popq %rbp #16.12
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #16.12
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.3:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.2
.align 4
.2.3_2__kmpc_loc_pack.2:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 52
.byte 59
.byte 52
.byte 59
.byte 59
.space 1, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.14:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.13
.align 4
.2.3_2__kmpc_loc_pack.13:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 49
.byte 54
.byte 59
.byte 49
.byte 54
.byte 59
.byte 59
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
il0_peep_printf_format_0:
.long 1128354639
.long 1702109249
.long 1931506803
.long 1953653108
.byte 0
.space 3, 0x00 # pad
.align 4
il0_peep_printf_format_1:
.long 1128354639
.long 1702109249
.long 1696625779
.word 25710
.byte 0
.data
# -- End main
.section .rodata, "a"
.align 4
.align 4
.L_2il0floatpacket.0:
.long 0x3e428f5c
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.align 4
.L_2il0floatpacket.1:
.long 0x433dcf5c
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,4
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,201 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
.file "taxCalc.c"
.text
..TXTST0:
# -- Begin main
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main(void)
main:
..B1.1: # Preds ..B1.0
.cfi_startproc
..___tag_value_main.1:
..L2:
#4.15
pushq %rbp #4.15
.cfi_def_cfa_offset 16
movq %rsp, %rbp #4.15
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #4.15
subq $4096, %rsp #4.15
movl $104446, %esi #4.15
movl $3, %edi #4.15
call __intel_new_feature_proc_init #4.15
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
vstmxcsr (%rsp) #4.15
movl $.2.3_2_kmpc_loc_struct_pack.3, %edi #4.15
xorl %esi, %esi #4.15
orl $32832, (%rsp) #4.15
xorl %eax, %eax #4.15
vldmxcsr (%rsp) #4.15
..___tag_value_main.6:
call __kmpc_begin #4.15
..___tag_value_main.7:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
movl $il0_peep_printf_format_0, %edi #5.5
call puts #5.5
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
vmovss .L_2il0floatpacket.0(%rip), %xmm0 #8.15
xorl %eax, %eax #11.5
vxorps %xmm1, %xmm1, %xmm1 #9.5
vmovss %xmm1, (%rsp) #9.5
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
# Marked kernel: loop ..B1.4, delimited by "movl $111, %ebx" / "movl $222, %ebx",
# each followed by the bytes 100, 103, 144 (0x64 0x67 0x90), here emitted one
# byte per directive.
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100
.byte 103
.byte 144
..B1.4: # Preds ..B1.4 ..B1.3
# Two array elements are produced per iteration. %xmm0 holds the constant
# loaded from .L_2il0floatpacket.0 before the loop; %xmm1 carries the running
# value from one iteration to the next (it is zeroed before the loop).
# First element: xmm4 = xmm1 + xmm0 * (float)(2*rax + 1)
lea 1(%rax,%rax), %edx #12.9
vcvtsi2ss %edx, %xmm2, %xmm2 #12.27
vmulss %xmm2, %xmm0, %xmm3 #12.29
lea 2(%rax,%rax), %ecx #12.9
vaddss %xmm3, %xmm1, %xmm4 #12.29
# xmm1 is cleared and reused as conversion target; the running value now
# lives in xmm4.
vxorps %xmm1, %xmm1, %xmm1 #12.27
# Second element: xmm1 = xmm4 + xmm0 * (float)(2*rax + 2)
vcvtsi2ss %ecx, %xmm1, %xmm1 #12.27
vmulss %xmm1, %xmm0, %xmm5 #12.29
vmovss %xmm4, 4(%rsp,%rax,8) #12.9
vaddss %xmm5, %xmm4, %xmm1 #12.29
vmovss %xmm1, 8(%rsp,%rax,8) #12.9
# Counter starts at 0 (cleared before the loop); repeat while rax < 499
# (unsigned compare).
incq %rax #11.5
cmpq $499, %rax #11.5
jb ..B1.4 # Prob 99% #11.5
# End-of-kernel marker (222 variant of the sequence above).
movl $222, %ebx
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
# LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
..B1.5: # Preds ..B1.4
vmovss 3992(%rsp), %xmm0 #12.18
movl $il0_peep_printf_format_1, %edi #15.5
vaddss .L_2il0floatpacket.1(%rip), %xmm0, %xmm1 #12.29
vmovss %xmm1, 3996(%rsp) #12.9
call puts #15.5
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
movl $.2.3_2_kmpc_loc_struct_pack.14, %edi #16.12
xorl %eax, %eax #16.12
..___tag_value_main.8:
call __kmpc_end #16.12
..___tag_value_main.9:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
xorl %eax, %eax #16.12
movq %rbp, %rsp #16.12
popq %rbp #16.12
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #16.12
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.3:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.2
.align 4
.2.3_2__kmpc_loc_pack.2:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 52
.byte 59
.byte 52
.byte 59
.byte 59
.space 1, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.14:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.13
.align 4
.2.3_2__kmpc_loc_pack.13:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 109
.byte 97
.byte 105
.byte 110
.byte 59
.byte 49
.byte 54
.byte 59
.byte 49
.byte 54
.byte 59
.byte 59
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
il0_peep_printf_format_0:
.long 1128354639
.long 1702109249
.long 1931506803
.long 1953653108
.byte 0
.space 3, 0x00 # pad
.align 4
il0_peep_printf_format_1:
.long 1128354639
.long 1702109249
.long 1696625779
.word 25710
.byte 0
.data
# -- End main
.section .rodata, "a"
.align 4
.align 4
.L_2il0floatpacket.0:
.long 0x3e428f5c
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.align 4
.L_2il0floatpacket.1:
.long 0x433dcf5c
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,4
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End

View File

@@ -1,8 +1,5 @@
[tox]
envlist = py35
envlist = py35,py36
[testenv]
commands=
python tests/all_tests.py
# osaca --arch ivb --iaca examples/taxCalc-ivb-iaca
# osaca --arch ivb --iaca examples/taxCalc-ivb-iaca.S
# osaca --arch ivb examples/taxCalc-ivb