Resolved merge conflicts

2026-01-11 13:37:07 +01:00 · 2019-10-16 10:59:03 +02:00
parent 1c896a9f5f 1b2f00e06b
commit aa48803549
76 changed files with 8913 additions and 44043 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,6 +3,28 @@ language: python
 python:
    - "3.5"
    - "3.6"
-    - "3.7"
-install: pip install tox-travis
-script: tox
+# Python 3.7 not working yet
+#    - "3.7"
+before_install: 
+#  - pip install tox-travis
+  - pip install codecov
+install:
+  - pip install -e .
+cache: pip
+script: 
+#  - tox
+  - coverage run -p tests/all_tests.py
+after_success:
+  - coverage combine
+  - codecov
+deploy:
+  provider: pypi
+  user: "__token__"
+  password:
+    secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
+  distributions: sdist
+  skip_existing: true
+  skip_cleanup: true
+  on:
+    repo: RRZE-HPC/OSACA
+    tag: true
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +1,7 @@
 include README.rst
 include LICENSE
 include tox.ini
-recursive-include osaca/data/ *.csv
+recursive-include osaca/data/ *.yml
 include examples/*
 recursive-include tests *.py *.out
 recursive-include tests/testfiles/ *
--- a/README.rst
+++ b/README.rst
@@ -16,9 +16,15 @@ analysis and throughput prediction for a innermost loop kernel.
 .. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?token=393L6z2HEXNiGLtZ43s6&branch=master
    :target: https://travis-ci.com/RRZE-HPC/OSACA

-.. image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
-   :target: https://landscape.io/github/RRZE-HPC/OSACA/master
-   :alt: Code Health
+.. ..image:: https://landscape.io/github/RRZE-HPC/OSACA/master/landscape.svg?style=flat&badge_auth_token=c95f01b247f94bc79c09d21c5c827697
+..   :target: https://landscape.io/github/RRZE-HPC/OSACA/master
+..   :alt: Code Health
+
+.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
+    :target: https://codecov.io/github/RRZE-HPC/OSACA?branch=master
+
+.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
+    :target: https://github.com/ambv/black

 Getting started
 ===============
@@ -46,8 +52,7 @@ Dependencies:
 Additional requirements are:

 -  `Python3 <https://www.python.org/>`_
-  `pandas <http://pandas.pydata.org/>`_
-  `NumPy <http://www.numpy.org/>`_
+-  `Graphviz <https://www.graphviz.org/>`_ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
 -  `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`_ for marker insertion
 -   `ibench <https://github.com/hofm/ibench>`_ for throughput/latency measurements

@@ -66,213 +71,169 @@ The usage of OSACA can be listed as:

 .. code:: bash

-    osaca [-h] [-V] [--arch ARCH] [--tp-list] [-i | --iaca | -m] FILEPATH
+    osaca [-h] [-V] [--arch ARCH] [--export-graph GRAPHNAME] FILEPATH

- ``-h`` or ``--help`` prints out the help message.
- ``-V`` or ``--version`` shows the program’s version number.
- ``ARCH`` needs to be replaced with the wished architecture abbreviation. This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``). Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW`` and ``SKL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN`` for AMD Zen (17h family) architecture .
- While in the throughput analysis mode, one can add ``--tp-list`` for printing the additional throughput list of the kernel or ``--iaca`` for letting OSACA to know it has to search for IACA binary markers.
- ``-i`` or ``--include-ibench`` starts the integration of ibench output into the CSV data file determined by ``ARCH``.
- With the flag ``-m`` or ``--insert-marker`` OSACA calls the Kerncraft module for the interactively insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ marker in suggested assembly blocks.
- ``FILEPATH`` describes the filepath to the file to work with and is always necessary
+-h, --help
+  prints out the help message.
+-V, --version
+  shows the program’s version number.
+--arch ARCH
+  needs to be replaced with the wished architecture abbreviation.
+  This flag is necessary for the throughput analysis (default function) and the inclusion of an ibench output (``-i``).
+  Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX`` and ``CSX`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1`` for AMD Zen (17h family) architecture.
+  Furthermore, ``VULCAN`` for Marvell's ARM-based ThunderX2 architecture is available.
+--insert-marker
+  OSACA calls the Kerncraft module for the interactive insertion of `IACA <https://software.intel.com/en-us/articles/intel-architecture-code-analyzer>`_ markers in suggested assembly blocks.
+--db-check
+  Run a sanity check on the database specified by ``--arch``.
+  The output depends on the verbosity level.
+  Keep in mind that you have to provide a (dummy) filename anyway.
+--export-graph EXPORT_PATH
+  Output path for .dot file export. If "." is given, the file will be stored as "./osaca_dg.dot".
+  After the file has been created, you can convert it to a PDF file using dot: ``dot -Tpdf osaca_dg.dot -o osaca_dependency_graph.pdf``
+
+The **FILEPATH** describes the path to the file to work with and is always necessary.
+
+______________________

 Hereinafter OSACA's scope of function will be described.

-Throughput analysis
-~~~~~~~~~~~~~~~~~~~
-As main functionality of OSACA this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKL`` or ``ZEN``.
+Throughput & Latency analysis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+As the main functionality of OSACA, this process starts by default. It is always necessary to specify the core architecture by the flag ``--arch ARCH``, where ``ARCH`` can stand for ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ZEN1`` or ``VULCAN``.

-For extracting the right kernel, one has to mark it beforehand. For this there are two different approaches:
+For extracting the right kernel, one has to mark it beforehand.
+Currently, only the detection of markers in the assembly code and therefore the analysis of assembly files is supported by OSACA.

-| **High level code**
+**Assembly code**

-The OSACA marker is ``//STARTLOOP`` and must be put in one line in front of the loop head, and the loop code must be indented consistently. This means the marker and the head must have the same indentation level while the whole loop body needs to be more indented than the code before and after. For instance, this is a valid OSACA marker:
-
-.. code-block:: c
-
-    int i = 0;
-    //STARTLOOP
-    while(i < N){
-        // do something...
-        i++;
-    }
-
-| **Assembly code**
-
-Another way for marking a kernel is to insert the IACA byte markers in the assembly file in before and after the loop.
+Marking a kernel means inserting the byte markers in the assembly file before and after the loop.
 For this, the start marker has to be inserted right in front of the loop label and the end marker directly after the jump instruction.
-Start and end marker can be seen in the example below:
+For the convenience of the user, IACA byte markers are used in x86 assembly.
+
+**x86 Byte Markers**

 .. code-block:: gas

-    movl    $111,%ebx       ;IACA START MARKER
-    .byte   100,103,144     ;IACA START MARKER
-    ; LABEL
-        ; do something
-        ; ...
-        ; conditional jump to LABEL
-    movl    $222,%ebx       ;IACA END MARKER
-    .byte   100,103,144     ;IACA END MARKER
+    movl    $111,%ebx       #IACA/OSACA START MARKER
+    .byte   100,103,144     #IACA/OSACA START MARKER
+    Loop:
+      # ...
+    movl    $222,%ebx       #IACA/OSACA END MARKER
+    .byte   100,103,144     #IACA/OSACA END MARKER

-The optional flag ``--iaca`` defines if OSACA needs to search for the IACA byte markers or the OSACA marker in the chosen file.
+**AArch64 Byte Markers**

-With an additional, optional ``--tp-list``, OSACA adds a simple list of all kernel instruction forms together with their reciprocal throughput to the output. This is helpful in case of no further information about the port binding of the single instruction forms.
+.. code-block:: asm

-Include new measurements into the data file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it
-takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value.
-The handling of ibench is shortly described in the example section below.
+    mov x1, #111            // OSACA START
+    .byte 213,3,32,31       // OSACA START
+      // ...
+    mov x1, #222            // OSACA END
+    .byte 213,3,32,31       // OSACA END
+
+.. Include new measurements into the data file
+.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. Running OSACA with the flag ``-i`` or ``--include-ibench`` and a specified micro architecture ``ARCH``, it takes the values given in an ibench output file and checks them for reasonability. If a value is not in the data file already, it will be added, otherwise OSACA prints out a warning message and keeps the old value in the data file. If a value does not pass the validation, a warning message is shown, however, OSACA will keep working with the new value. The handling of ibench is shortly described in the example section below.

 Insert IACA markers
 ~~~~~~~~~~~~~~~~~~~
-Using the ``-m`` or ``--insert-marker`` flags for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
+Using the ``--insert-marker`` flag for a given file, OSACA calls the implemented Kerncraft module for identifying and marking the inner-loop block in *manual mode*. More information about how this is done can be found in the `Kerncraft repository <https://github.com/RRZE-HPC/kerncraft>`_.
+Note that this currently only works for x86 loop kernels.

 Example
 =======
-For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel IVB core hereafter:
+For clarifying the functionality of OSACA a sample kernel is analyzed for an Intel CSX core hereafter:

 .. code-block:: c

    double a[N], double b[N];
    double s;
    
-    //STARTLOOP
+    // loop
    for(int i = 0; i < N; ++i)
        a[i] = s * b[i];
        
-The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``. The result is
-written in vector ``a``.
-After including the OSACA marker ``//STARTLOOP`` and compiling the source, one can
-start the analysis typing 
+The code shows a simple scalar multiplication of a vector ``b`` and a floating-point number ``s``.
+The result is written in vector ``a``.
+After inserting the OSACA byte markers into the assembly, one can start the analysis by typing

 .. code:: bash

-    osaca --arch IVB PATH/TO/FILE
+    osaca --arch CSX PATH/TO/FILE

-in the command line. Optionally, one can create the assembly code out of the file, identify and mark the kernel of interest and run OSACA with the additional ``--iaca`` flag.
+in the command line.

 The output is:

 .. code-block::

+    Open Source Architecture Code Analyzer (OSACA) - v0.3
+    Analyzed file:      scale.s.csx.O3.s
+    Architecture:       csx
+    Timestamp:          2019-10-03 23:36:21
+
+     P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
+     * - Instruction micro-ops not bound to a port
+     X - No throughput/latency information for this instruction in data file
+
+
    Throughput Analysis Report
    --------------------------
-    P - Load operation can be hidden behind a past or future store instruction
-    X - No information for this instruction in database
-    * - Instruction micro-ops not bound to a port
-    
-    Port Binding in Cycles Per Iteration:
-    -------------------------------------------------
-    |  Port  |   0  |   1  |  2  |  3  |  4  |   5  |
-    -------------------------------------------------
-    | Cycles | 2.33 | 1.33 | 5.0 | 5.0 | 2.0 | 1.33 |
-    -------------------------------------------------
-             
-             Ports Pressure in cycles          
-    |  0   |  1   |  2   |  3   |  4   |  5   |
-    -------------------------------------------
-    |      |      | 0.50 | 0.50 | 1.00 |      | movl   $0x0,-0x24(%rbp)
-    |      |      |      |      |      |      | jmp    10b <scale+0x10b>
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x48(%rbp),%rax
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%edx
-    | 0.33 | 0.33 |      |      |      | 0.33 | movslq %edx,%rdx
-    |      |      | 0.50 | 0.50 |      |      | vmovsd (%rax,%rdx,8),%xmm0
-    | 1.00 |      | 0.50 | 0.50 |      |      | vmulsd -0x50(%rbp),%xmm0,%xmm0
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x38(%rbp),%rax
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%edx
-    | 0.33 | 0.33 |      |      |      | 0.33 | movslq %edx,%rdx
-    |      |      | 0.50 | 0.50 | 1.00 |      | vmovsd %xmm0,(%rax,%rdx,8)
-    |      |      |      |      |      |      | X addl   $0x1,-0x24(%rbp)
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%eax
-    | 0.33 | 0.33 | 0.50 | 0.50 |      | 0.33 | cmp    -0x54(%rbp),%eax
-    |      |      |      |      |      |      | jl     e4 <scale+0xe4>
-    | 0.33 | 0.33 |      |      |      | 0.33 | mov    %rcx,%rsp
-    Total number of estimated throughput: 5.0
+                                  Port pressure in cycles                              
+         |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   |
+    -----------------------------------------------------------------------------------
+     170 |             |      |             |             |      |      |      |      |   .L22:
+     171 | 0.50        | 0.50 | 0.50   0.50 | 0.50   0.50 |      |      |      |      |   vmulpd	(%r12,%rax), %ymm1, %ymm0
+     172 |             |      | 0.50        | 0.50        | 1.00 |      |      |      |   vmovapd	%ymm0, 0(%r13,%rax)
+     173 | 0.25        | 0.25 |             |             |      | 0.25 | 0.25 |      |   addq	$32, %rax
+     174 | 0.25        | 0.25 |             |             |      | 0.25 | 0.25 |      |   cmpq	%rax, %r14
+     175 |             |      |             |             |      |      |      |      | * jne	.L22
+
+           1.00          1.00   1.00   0.50   1.00   0.50   1.00   0.50   0.50         
+
+
+    Latency Analysis Report
+    -----------------------
+     171 |  8.0 | | vmulpd	(%r12,%rax), %ymm1, %ymm0
+     172 |  5.0 | | vmovapd	%ymm0, 0(%r13,%rax)
+
+           13.0
+
+
+    Loop-Carried Dependencies Analysis Report
+    -----------------------------------------
+    173 |  1.0 | addq	$32, %rax                      | [173]

 It shows the whole kernel together with the average port pressure of each instruction form and the overall port binding.
-In the fifth to last line containing ``addl $0x1, -0x24(%rbp)`` one can see an ``X`` in front of the instruction form and no port occupation.
-This means either there are no measured values for this instruction form or no port binding is provided in the
-data file.
-In the first case, OSACA automatically creates two benchmark assembly files (``add-mem_imd.S`` for latency and ``add-mem_imd-TP.S`` for throughput) in the benchmark folder, if it not already exists there.
+Furthermore, it shows the critical path of the loop kernel and all loop-carried dependencies, each with a list of the line numbers that are part of this dependency chain on the right.

-One can now run ibench to get the throughput value for addl with the given file. Mind that the assembly
-file, which is used for ibench, is implemented in Intel syntax. So for a valid run instruction ``addl`` must be
-changed to ``add`` manually.
+.. For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node, so there is no other workload falsifying the results. For the correct function of ibench the benchmark files from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the subdirectory’s name and the shared objects. For running the tests the frequencies of all cores must set to a constant value and this has to be given as an argument together with the directory of the shared objects to ibench, e.g.:

-For measuring the instruction forms with ibench we highly recommend to use an exclusively allocated node,
-so there is no other workload falsifying the results. For the correct function of ibench the benchmark files
-from OSACA need to be placed in a subdirectory of src in root so ibench can create the a folder with the
-subdirectory’s name and the shared objects. For running the tests the frequencies of all cores must set to a
-constant value and this has to be given as an argument together with the directory of the shared objects to
-ibench, e.g.:
-
-.. code:: bash
+.. .. code:: bash

    ./ibench ./AVX 2.2
    
-for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz.
-We get an output like:
+.. for running ibench in the directory ``AVX`` with a core frequency of 2.2 GHz. We get an output like:

-.. code:: bash
+.. .. code:: bash

    Using frequency 2.20GHz.
    add-mem_imd-TP: 1.023 (clock cycles) [DEBUG - result: 1.000000]
    add-mem_imd: 6.050 (clock cycles) [DEBUG - result: 1.000000]
    
-The debug output as resulting value of register ``xmm0`` is additional validation information depending on
-the executed instruction form meant for the user and is not considered by OSACA.
-The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just
-``-i`` and the specify micro architecture:
+.. The debug output as resulting value of register ``xmm0`` is additional validation information depending on the executed instruction form meant for the user and is not considered by OSACA. The ibench output information can be included by OSACA running the program with the flag ``--include-ibench`` or just ``-i`` and the specify micro architecture:

-.. code-block:: bash
+.. .. code-block:: bash

    osaca --arch IVB -i PATH/TO/IBENCH-OUTPUTFILE

-For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand.
-We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:
+.. For now no automatic allocation of ports for a instruction form is implemented, so for getting an output in the Ports Pressure table, one must add the port occupation by hand. We know that the inserted instruction form must be assigned always to Port 2, 3 and 4 and additionally to either 0, 1 or 5, a valid data file therefore would look like this:

-.. code:: bash
+.. .. code:: bash

    addl-mem_imd,1.0,6.0,"(0.33,0.33,1.00,1.00,1.00,0.33)"
    
-Another throughput analysis with OSACA now returns all information for the kernel:
-
-.. code-block::
-
-    Throughput Analysis Report
-    --------------------------
-    P - Load operation can be hidden behind a past or future store instruction
-    X - No information for this instruction in database
-    * - Instruction micro-ops not bound to a port
-    
-    Port Binding in Cycles Per Iteration:
-    -------------------------------------------------
-    |  Port  |   0  |   1  |  2  |  3  |  4  |   5  |
-    -------------------------------------------------
-    | Cycles | 2.67 | 1.67 | 6.0 | 6.0 | 3.0 | 1.67 |
-    -------------------------------------------------
-             
-             Ports Pressure in cycles          
-    |  0   |  1   |  2   |  3   |  4   |  5   |
-    -------------------------------------------
-    |      |      | 0.50 | 0.50 | 1.00 |      | movl   $0x0,-0x24(%rbp)
-    |      |      |      |      |      |      | jmp    10b <scale+0x10b>
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x48(%rbp),%rax
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%edx
-    | 0.33 | 0.33 |      |      |      | 0.33 | movslq %edx,%rdx
-    |      |      | 0.50 | 0.50 |      |      | vmovsd (%rax,%rdx,8),%xmm0
-    | 1.00 |      | 0.50 | 0.50 |      |      | vmulsd -0x50(%rbp),%xmm0,%xmm0
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x38(%rbp),%rax
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%edx
-    | 0.33 | 0.33 |      |      |      | 0.33 | movslq %edx,%rdx
-    |      |      | 0.50 | 0.50 | 1.00 |      | vmovsd %xmm0,(%rax,%rdx,8)
-    | 0.33 | 0.33 | 1.00 | 1.00 | 1.00 | 0.33 | addl   $0x1,-0x24(%rbp)
-    |      |      | 0.50 | 0.50 |      |      | mov    -0x24(%rbp),%eax
-    | 0.33 | 0.33 | 0.50 | 0.50 |      | 0.33 | cmp    -0x54(%rbp),%eax
-    |      |      |      |      |      |      | jl     e4 <scale+0xe4>
-    | 0.33 | 0.33 |      |      |      | 0.33 | mov    %rcx,%rsp
-    Total number of estimated throughput: 6.0

 Credits
 =======
--- a/doc/osaca-workflow.png
+++ b/doc/osaca-workflow.png
--- a/examples/2d-5pt-ivb-iaca.S
+++ b/examples/2d-5pt-ivb-iaca.S
@@ -1,286 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 17.0.5.239 Build 20170817";
-# mark_description "-fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o 2d.S";
-	.file "2d-5pt.c"
-	.text
-..TXTST0:
-# -- Begin  jacobi2D5pt
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl jacobi2D5pt
-# --- jacobi2D5pt(int, int)
-jacobi2D5pt:
-# parameter 1: %edi
-# parameter 2: %esi
-..B1.1:                         # Preds ..B1.0
-                                # Execution count [1.00e+00]
-	.cfi_startproc
-..___tag_value_jacobi2D5pt.1:
-..L2:
-                                                          #2.31
-        pushq     %rbx                                          #2.31
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbx                                    #2.31
-	.cfi_def_cfa 3, 16
-	.cfi_offset 3, -16
-        andq      $-32, %rsp                                    #2.31
-        pushq     %rbp                                          #2.31
-        pushq     %rbp                                          #2.31
-        movq      8(%rbx), %rbp                                 #2.31
-        movq      %rbp, 8(%rsp)                                 #2.31
-        movq      %rsp, %rbp                                    #2.31
-	.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
-        pushq     %r13                                          #2.31
-        pushq     %r14                                          #2.31
-        pushq     %r15                                          #2.31
-        subq      $88, %rsp                                     #2.31
-        movslq    %esi, %rsi                                    #2.31
-        movslq    %edi, %rcx                                    #2.31
-	.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
-	.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
-	.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
-        movq      %rsi, %r13                                    #4.17
-        imulq     %rcx, %r13                                    #4.17
-        shlq      $3, %r13                                      #4.12
-        movq      %r13, %rax                                    #4.12
-        addq      $31, %rax                                     #4.12
-        andq      $-32, %rax                                    #4.12
-        subq      %rax, %rsp                                    #4.12
-        movq      %rsp, %rax                                    #4.12
-                                # LOE rax rcx rsi r12 r13 edi
-..B1.29:                        # Preds ..B1.1
-                                # Execution count [1.00e+00]
-        movq      %rax, %r14                                    #4.12
-                                # LOE rcx rsi r12 r13 r14 edi
-..B1.2:                         # Preds ..B1.29
-                                # Execution count [1.00e+00]
-        movq      %r13, %rax                                    #5.12
-        addq      $31, %rax                                     #5.12
-        andq      $-32, %rax                                    #5.12
-        subq      %rax, %rsp                                    #5.12
-        movq      %rsp, %rax                                    #5.12
-                                # LOE rax rcx rsi r12 r13 r14 edi
-..B1.30:                        # Preds ..B1.2
-                                # Execution count [1.00e+00]
-        movq      %rax, %r15                                    #5.12
-                                # LOE rcx rsi r12 r13 r14 r15 edi
-..B1.3:                         # Preds ..B1.30
-                                # Execution count [1.00e+00]
-        xorl      %r10d, %r10d                                  #9.5
-        lea       (%r15,%rcx,8), %r11                           #13.13
-        vxorpd    %xmm1, %xmm1, %xmm1                           #6.5
-        lea       (%r14,%rcx,8), %rdx                           #13.37
-        cmpq      $2, %rsi                                      #9.18
-        jle       ..B1.21       # Prob 9%                       #9.18
-                                # LOE rdx rcx rsi r10 r11 r12 r13 r14 r15 edi xmm1
-..B1.4:                         # Preds ..B1.3
-                                # Execution count [9.00e-01]
-        addl      $-2, %edi                                     #12.9
-        movq      %rcx, %r9                                     #13.61
-        movl      %edi, %eax                                    #12.9
-        addq      $-2, %rsi                                     #9.18
-        andl      $-16, %eax                                    #12.9
-        xorl      %r8d, %r8d                                    #9.5
-        shlq      $4, %r9                                       #13.61
-        movslq    %eax, %rax                                    #12.9
-        addq      %r14, %r9                                     #13.61
-        movslq    %edi, %rdi                                    #12.9
-        vxorps    %ymm0, %ymm0, %ymm0                           #6.5
-        movq      %rax, -80(%rbp)                               #12.9[spill]
-        movq      %rdi, -88(%rbp)                               #12.9[spill]
-        movl      %eax, -72(%rbp)                               #9.5[spill]
-        movq      %rsi, -48(%rbp)                               #9.5[spill]
-        movq      %rdx, -64(%rbp)                               #9.5[spill]
-        movq      %r15, -96(%rbp)                               #9.5[spill]
-        movq      %r14, -56(%rbp)                               #9.5[spill]
-        movq      %r13, -104(%rbp)                              #9.5[spill]
-        movq      %r12, -112(%rbp)                              #9.5[spill]
-	.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
-                                # LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
-..B1.5:                         # Preds ..B1.19 ..B1.4
-                                # Execution count [5.00e+00]
-        cmpq      $2, %rcx                                      #12.22
-        jle       ..B1.19       # Prob 50%                      #12.22
-                                # LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
-..B1.6:                         # Preds ..B1.5
-                                # Execution count [4.50e+00]
-        cmpl      $16, %edi                                     #12.9
-        jl        ..B1.26       # Prob 10%                      #12.9
-                                # LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
-..B1.7:                         # Preds ..B1.6
-                                # Execution count [4.50e+00]
-        movl      -72(%rbp), %r14d                              #12.9[spill]
-        xorl      %edx, %edx                                    #12.9
-        movq      -80(%rbp), %r12                               #13.13[spill]
-        lea       (%r11,%r8), %rax                              #13.13
-                                # LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
-..B1.8:                         # Preds ..B1.8 ..B1.7
-                                # Execution count [2.50e+01]
-        vmovupd   %ymm0, 8(%rax,%rdx,8)                         #13.13
-        vmovupd   %ymm0, 40(%rax,%rdx,8)                        #13.13
-        vmovupd   %ymm0, 72(%rax,%rdx,8)                        #13.13
-        vmovupd   %ymm0, 104(%rax,%rdx,8)                       #13.13
-        addq      $16, %rdx                                     #12.9
-        cmpq      %r12, %rdx                                    #12.9
-        jb        ..B1.8        # Prob 82%                      #12.9
-                                # LOE rax rdx rcx r8 r9 r10 r11 r12 edi r14d xmm1 ymm0
-..B1.10:                        # Preds ..B1.8 ..B1.26
-                                # Execution count [5.00e+00]
-        lea       1(%r14), %eax                                 #12.9
-        cmpl      %edi, %eax                                    #12.9
-        ja        ..B1.19       # Prob 50%                      #12.9
-                                # LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
-..B1.11:                        # Preds ..B1.10
-                                # Execution count [4.50e+00]
-        movslq    %r14d, %r14                                   #12.9
-        movq      -88(%rbp), %r13                               #12.9[spill]
-        subq      %r14, %r13                                    #12.9
-        cmpq      $4, %r13                                      #12.9
-        jl        ..B1.25       # Prob 10%                      #12.9
-                                # LOE rcx r8 r9 r10 r11 r13 r14 edi xmm1 ymm0
-..B1.12:                        # Preds ..B1.11
-                                # Execution count [4.50e+00]
-        movl      %r13d, %r15d                                  #12.9
-        lea       (%r11,%r8), %rax                              #13.13
-        andl      $-4, %r15d                                    #12.9
-        xorl      %edx, %edx                                    #12.9
-        movslq    %r15d, %r15                                   #12.9
-        lea       (%rax,%r14,8), %rax                           #13.13
-                                # LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
-..B1.13:                        # Preds ..B1.13 ..B1.12
-                                # Execution count [2.50e+01]
-        vmovupd   %ymm0, 8(%rax,%rdx,8)                         #13.13
-        addq      $4, %rdx                                      #12.9
-        cmpq      %r15, %rdx                                    #12.9
-        jb        ..B1.13       # Prob 82%                      #12.9
-                                # LOE rax rdx rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
-..B1.15:                        # Preds ..B1.13 ..B1.25
-                                # Execution count [5.00e+00]
-        cmpq      %r13, %r15                                    #12.9
-        jae       ..B1.19       # Prob 10%                      #12.9
-                                # LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
-..B1.16:                        # Preds ..B1.15
-                                # Execution count [4.50e+00]
-        movq      -56(%rbp), %rax                               #13.49[spill]
-        lea       (%r11,%r8), %r12                              #13.13
-        movq      -64(%rbp), %rsi                               #13.25[spill]
-        lea       (%r9,%r8), %rdx                               #13.61
-        lea       (%r12,%r14,8), %r12                           #13.13
-        addq      %r8, %rax                                     #13.49
-        addq      %r8, %rsi                                     #13.25
-        lea       (%rdx,%r14,8), %rdx                           #13.61
-        lea       (%rax,%r14,8), %rax                           #13.49
-        lea       (%rsi,%r14,8), %r14                           #13.25
-                                # LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
-        movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-..B1.17:                        # Preds ..B1.17 ..B1.16
-                                # Execution count [2.50e+01]
-        vmovsd    (%r14,%r15,8), %xmm2                          #13.25
-        vaddsd    16(%r14,%r15,8), %xmm2, %xmm3                 #13.37
-        vaddsd    8(%rax,%r15,8), %xmm3, %xmm4                  #13.49
-        vaddsd    8(%rdx,%r15,8), %xmm4, %xmm5                  #13.61
-        vmulsd    %xmm5, %xmm1, %xmm6                           #13.74
-        vmovsd    %xmm6, 8(%r12,%r15,8)                         #13.13
-        incq      %r15                                          #12.9
-        cmpq      %r13, %r15                                    #12.9
-        jb        ..B1.17       # Prob 82%                      #12.9
-        movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-                                # LOE rax rdx rcx r8 r9 r10 r11 r12 r13 r14 r15 edi xmm1 ymm0
-..B1.19:                        # Preds ..B1.17 ..B1.5 ..B1.10 ..B1.15
-                                # Execution count [5.00e+00]
-        incq      %r10                                          #9.5
-        lea       (%r8,%rcx,8), %r8                             #9.5
-        cmpq      -48(%rbp), %r10                               #9.5[spill]
-        jb        ..B1.5        # Prob 82%                      #9.5
-                                # LOE rcx r8 r9 r10 r11 edi xmm1 ymm0
-..B1.20:                        # Preds ..B1.19
-                                # Execution count [9.00e-01]
-        movq      -64(%rbp), %rdx                               #[spill]
-        movq      -96(%rbp), %r15                               #[spill]
-        movq      -56(%rbp), %r14                               #[spill]
-        movq      -104(%rbp), %r13                              #[spill]
-        movq      -112(%rbp), %r12                              #[spill]
-	.cfi_restore 12
-                                # LOE rdx r11 r12 r13 r14 r15
-..B1.21:                        # Preds ..B1.3 ..B1.20
-                                # Execution count [1.00e+00]
-        addq      $8, %rdx                                      #16.5
-        addq      $8, %r11                                      #16.5
-        movq      %rdx, %rdi                                    #16.5
-        movq      %r11, %rsi                                    #16.5
-        vzeroupper                                              #16.5
-..___tag_value_jacobi2D5pt.12:
-#       dummy(double *, double *)
-        call      dummy                                         #16.5
-..___tag_value_jacobi2D5pt.13:
-                                # LOE r12 r13 r14 r15
-..B1.22:                        # Preds ..B1.21
-                                # Execution count [1.00e+00]
-        movq      %r15, %rdx                                    #16.5
-        movq      %r13, %rax                                    #16.5
-        addq      $31, %rax                                     #16.5
-        andq      $-32, %rax                                    #16.5
-        addq      %rax, %rsp                                    #16.5
-                                # LOE r12 r13 r14
-..B1.23:                        # Preds ..B1.22
-                                # Execution count [1.00e+00]
-        movq      %r14, %rdx                                    #16.5
-        movq      %r13, %rax                                    #16.5
-        addq      $31, %rax                                     #16.5
-        andq      $-32, %rax                                    #16.5
-        addq      %rax, %rsp                                    #16.5
-                                # LOE r12
-..B1.24:                        # Preds ..B1.23
-                                # Execution count [1.00e+00]
-        lea       -24(%rbp), %rsp                               #17.1
-	.cfi_restore 15
-        popq      %r15                                          #17.1
-	.cfi_restore 14
-        popq      %r14                                          #17.1
-	.cfi_restore 13
-        popq      %r13                                          #17.1
-        popq      %rbp                                          #17.1
-	.cfi_restore 6
-        movq      %rbx, %rsp                                    #17.1
-        popq      %rbx                                          #17.1
-	.cfi_def_cfa 7, 8
-	.cfi_restore 3
-        ret                                                     #17.1
-	.cfi_def_cfa 3, 16
-	.cfi_offset 3, -16
-	.cfi_escape 0x10, 0x06, 0x02, 0x76, 0x00
-	.cfi_escape 0x10, 0x0c, 0x03, 0x76, 0x90, 0x7f
-	.cfi_escape 0x10, 0x0d, 0x02, 0x76, 0x78
-	.cfi_escape 0x10, 0x0e, 0x02, 0x76, 0x70
-	.cfi_escape 0x10, 0x0f, 0x02, 0x76, 0x68
-                                # LOE
-..B1.25:                        # Preds ..B1.11
-                                # Execution count [4.50e-01]: Infreq
-        xorl      %r15d, %r15d                                  #12.9
-        jmp       ..B1.15       # Prob 100%                     #12.9
-                                # LOE rcx r8 r9 r10 r11 r13 r14 r15 edi xmm1 ymm0
-..B1.26:                        # Preds ..B1.6
-                                # Execution count [4.50e-01]: Infreq
-        xorl      %r14d, %r14d                                  #12.9
-        jmp       ..B1.10       # Prob 100%                     #12.9
-        .align    16,0x90
-                                # LOE rcx r8 r9 r10 r11 edi r14d xmm1 ymm0
-	.cfi_endproc
-# mark_end;
-	.type	jacobi2D5pt,@function
-	.size	jacobi2D5pt,.-jacobi2D5pt
-	.data
-# -- End  jacobi2D5pt
-	.data
-	.section .note.GNU-stack, ""
-// -- Begin DWARF2 SEGMENT .eh_frame
-	.section .eh_frame,"a",@progbits
-.eh_frame_seg:
-	.align 8
-# End
--- a/examples/2d-5pt.c
+++ b/examples/2d-5pt.c
@@ -1,16 +0,0 @@
-
-void jacobi2D5pt(int N, int M){
-    void dummy(double*, double*);
-    double a[M][N];
-    double b[M][N];
-    double s;
-
-    for(int j=1; j<M-1; ++j){
-        #pragma vector aligned
-        //STARTLOOP
-        for(int i=1; i<N-1; ++i){
-            b[j][i] = ( a[j][i-1] + a[j][i+1] + a[j-1][i] + a[j+1][i]) * s;
-        }
-    }   
-    dummy(&a[1][1], &b[1][1]);
-}
--- a/examples/daxpy.c
+++ b/examples/daxpy.c
@@ -1,13 +0,0 @@
-
-void daxpy(int N){
-    void dummy(double*, double*);
-    double a[N], b[N];
-    double s;
-
-    //STARTLOOP  
-    for(int i=0; i<N; ++i)
-            a[i] = a[i] + s * b[i];
-
-    dummy(&a[1], &b[1]);
-}
-
--- a/examples/scale.c
+++ b/examples/scale.c
@@ -1,13 +0,0 @@
-
-void scale(int N){
-    void dummy(double*, double*);
-    double a[N], b[N];
-    double s;
-    
-    //STARTLOOP
-    for(int i=0; i<N; ++i){
-            a[i] = s * b[i];
-    }
-
-    dummy(&a[1],&b[1]);
-}
--- a/examples/taxCalc-ivb
+++ b/examples/taxCalc-ivb
--- a/examples/taxCalc-ivb-iaca
+++ b/examples/taxCalc-ivb-iaca
--- a/examples/taxCalc-ivb-iaca.S
+++ b/examples/taxCalc-ivb-iaca.S
@@ -1,199 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
-# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
-	.file "taxCalc.c"
-	.text
-..TXTST0:
-# -- Begin  main
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl main
-# --- main(void)
-main:
-..B1.1:                         # Preds ..B1.0
-	.cfi_startproc
-..___tag_value_main.1:
-..L2:
-                                                          #4.15
-        pushq     %rbp                                          #4.15
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbp                                    #4.15
-	.cfi_def_cfa 6, 16
-	.cfi_offset 6, -16
-        andq      $-128, %rsp                                   #4.15
-        subq      $4096, %rsp                                   #4.15
-        movl      $104446, %esi                                 #4.15
-        movl      $3, %edi                                      #4.15
-        call      __intel_new_feature_proc_init                 #4.15
-                                # LOE rbx r12 r13 r14 r15
-..B1.10:                        # Preds ..B1.1
-        vstmxcsr  (%rsp)                                        #4.15
-        movl      $.2.3_2_kmpc_loc_struct_pack.3, %edi          #4.15
-        xorl      %esi, %esi                                    #4.15
-        orl       $32832, (%rsp)                                #4.15
-        xorl      %eax, %eax                                    #4.15
-        vldmxcsr  (%rsp)                                        #4.15
-..___tag_value_main.6:
-        call      __kmpc_begin                                  #4.15
-..___tag_value_main.7:
-                                # LOE rbx r12 r13 r14 r15
-..B1.2:                         # Preds ..B1.10
-        movl      $il0_peep_printf_format_0, %edi               #5.5
-        call      puts                                          #5.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.3:                         # Preds ..B1.2
-        vmovss    .L_2il0floatpacket.0(%rip), %xmm0             #8.15
-        xorl      %eax, %eax                                    #11.5
-        vxorps    %xmm1, %xmm1, %xmm1                           #9.5
-        vmovss    %xmm1, (%rsp)                                 #9.5
-
-        movl $111,%ebx          #IACA START
-        .byte 100,103,144       #IACA START
-                                # LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
-
-..B1.4:                         # Preds ..B1.4 ..B1.3
-        lea       1(%rax,%rax), %edx                            #12.9
-        vcvtsi2ss %edx, %xmm2, %xmm2                            #12.27
-        vmulss    %xmm2, %xmm0, %xmm3                           #12.29
-        lea       2(%rax,%rax), %ecx                            #12.9
-        vaddss    %xmm3, %xmm1, %xmm4                           #12.29
-        vxorps    %xmm1, %xmm1, %xmm1                           #12.27
-        vcvtsi2ss %ecx, %xmm1, %xmm1                            #12.27
-        vmulss    %xmm1, %xmm0, %xmm5                           #12.29
-        vmovss    %xmm4, 4(%rsp,%rax,8)                         #12.9
-        vaddss    %xmm5, %xmm4, %xmm1                           #12.29
-        vmovss    %xmm1, 8(%rsp,%rax,8)                         #12.9
-        incq      %rax                                          #11.5
-        cmpq      $499, %rax                                    #11.5
-        jb        ..B1.4        # Prob 99%                      #11.5
-
-        movl $222,%ebx          #IACA END
-        .byte 100,103,144       #IACA END
-
-                                # LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
-..B1.5:                         # Preds ..B1.4
-        vmovss    3992(%rsp), %xmm0                             #12.18
-        movl      $il0_peep_printf_format_1, %edi               #15.5
-        vaddss    .L_2il0floatpacket.1(%rip), %xmm0, %xmm1      #12.29
-        vmovss    %xmm1, 3996(%rsp)                             #12.9
-        call      puts                                          #15.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.6:                         # Preds ..B1.5
-        movl      $.2.3_2_kmpc_loc_struct_pack.14, %edi         #16.12
-        xorl      %eax, %eax                                    #16.12
-..___tag_value_main.8:
-        call      __kmpc_end                                    #16.12
-..___tag_value_main.9:
-                                # LOE rbx r12 r13 r14 r15
-..B1.7:                         # Preds ..B1.6
-        xorl      %eax, %eax                                    #16.12
-        movq      %rbp, %rsp                                    #16.12
-        popq      %rbp                                          #16.12
-	.cfi_def_cfa 7, 8
-	.cfi_restore 6
-        ret                                                     #16.12
-        .align    16,0x90
-	.cfi_endproc
-                                # LOE
-# mark_end;
-	.type	main,@function
-	.size	main,.-main
-	.data
-	.align 4
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.3:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.2
-	.align 4
-.2.3_2__kmpc_loc_pack.2:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	59
-	.space 1, 0x00 	# pad
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.14:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.13
-	.align 4
-.2.3_2__kmpc_loc_pack.13:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	59
-	.section .rodata.str1.4, "aMS",@progbits,1
-	.align 4
-	.align 4
-il0_peep_printf_format_0:
-	.long	1128354639
-	.long	1702109249
-	.long	1931506803
-	.long	1953653108
-	.byte	0
-	.space 3, 0x00 	# pad
-	.align 4
-il0_peep_printf_format_1:
-	.long	1128354639
-	.long	1702109249
-	.long	1696625779
-	.word	25710
-	.byte	0
-	.data
-# -- End  main
-	.section .rodata, "a"
-	.align 4
-	.align 4
-.L_2il0floatpacket.0:
-	.long	0x3e428f5c
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,4
-	.align 4
-.L_2il0floatpacket.1:
-	.long	0x433dcf5c
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,4
-	.data
-	.section .note.GNU-stack, ""
-// -- Begin DWARF2 SEGMENT .eh_frame
-	.section .eh_frame,"a",@progbits
-.eh_frame_seg:
-	.align 8
-# End
--- a/examples/taxCalc.c
+++ b/examples/taxCalc.c
@@ -1,18 +0,0 @@
-#include <stdio.h>
-//#include "iacaMarks.h"
-
-int main(void){
-    printf("OSACA test start\n");
-    int i = 1;
-    float arr[1000];
-    float tax = 0.19;
-    arr[0] = 0;
-    //STARTLOOP
-    while(i < 1000){
-        arr[i] = arr[i-1]+i*tax;
-        i += 1;
-    }
-
-    printf("OSACA test end\n");
-    return 0;
-}
--- a/examples/triad.c
+++ b/examples/triad.c
@@ -1,12 +0,0 @@
-
-void triad(int N){
-    void dummy(double*);
-    double a[N], b[N], c[N], d[N];
-    double s;
-
-    //STARTLOOP
-    for(int i=0; i<N; ++i)
-        a[i] = b[i] + c[i] * d[i];
-    
-    dummy(&a[1]);
-}
--- a/osaca/init.py
+++ b/osaca/init.py
@@ -1,2 +1,10 @@
+"""Open Source Architecture Code Analyzer"""
 name = 'osaca'
-__version__ = '0.2.2'
+__version__ = '0.3.1.dev1'
+
+# To trigger travis deployment to pypi, do the following:
+# 1. Increment __version__
+# 2. commit to RRZE-HPC/osaca's master branch
+# 3. wait for travis to complete successfully (unless already tested)
+# 4. tag commit with 'v{}'.format(__version__) (`git tag vX.Y.Z`)
+# 5. push tag to github (`git push origin vX.Y.Z` or push all tags with `git push --tags`)
--- a/osaca/api/init.py
+++ b/osaca/api/init.py
@@ -0,0 +1,8 @@
+"""
+APIs for handling interfaces to kerncraft, etc.
+
+Only the classes below will be exported, so please add new API classes to __all__.
+"""
+from .kerncraft_interface import KerncraftAPI
+
+__all__ = ['KerncraftAPI']
--- a/osaca/api/kerncraft_interface.py
+++ b/osaca/api/kerncraft_interface.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import collections
+import sys
+from io import StringIO
+
+from osaca.frontend import Frontend
+from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
+                             SemanticsAppender, reduce_to_section)
+
+
+# Stolen from https://stackoverflow.com/a/16571630
+class Capturing(list):
+    """List that collects, line by line, whatever is written to stdout inside a ``with`` block."""
+
+    def __enter__(self):
+        # Remember the real stream and route sys.stdout into an in-memory buffer.
+        buffer = StringIO()
+        self._stdout = sys.stdout
+        self._stringio = buffer
+        sys.stdout = buffer
+        return self
+
+    def __exit__(self, *args):
+        captured_lines = self._stringio.getvalue().splitlines()
+        del self._stringio    # free up some memory
+        # Put the original stream back before exposing the captured lines.
+        sys.stdout = self._stdout
+        self.extend(captured_lines)
+
+
+class KerncraftAPI(object):
+    """
+    Facade that runs an OSACA analysis of one kernel and exposes the results to kerncraft.
+
+    :param arch: architecture code passed on to ``MachineModel(arch=...)``
+    :param code: input passed on to the selected parser's ``parse_file``
+    :raises ValueError: if the machine model reports an ISA other than 'aarch64' or 'x86'
+    """
+
+    def __init__(self, arch, code):
+        self.machine_model = MachineModel(arch=arch)
+        self.semantics = SemanticsAppender(self.machine_model)
+        isa = self.machine_model.get_ISA().lower()
+        if isa == 'aarch64':
+            self.parser = ParserAArch64v81()
+        elif isa == 'x86':
+            self.parser = ParserX86ATT()
+        else:
+            # Without this branch self.parser stayed unset and the parse_file call
+            # below failed with an unrelated AttributeError.
+            raise ValueError('Unsupported ISA: {!r}'.format(isa))
+
+        parsed_code = self.parser.parse_file(code)
+        self.kernel = reduce_to_section(parsed_code, self.machine_model.get_ISA())
+        self.semantics.add_semantics(self.kernel)
+
+    def create_output(self, verbose=False):
+        """Return, as one string, what ``Frontend.print_full_analysis`` prints for the kernel."""
+        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
+        frontend = Frontend(arch=self.machine_model.get_arch())
+        # print_full_analysis writes to stdout, so capture it instead of letting it through.
+        with Capturing() as output:
+            frontend.print_full_analysis(self.kernel, kernel_graph, verbose=verbose)
+        return '\n'.join(output)
+
+    def get_unmatched_instruction_ratio(self):
+        """
+        Return the fraction of kernel instructions flagged with both TP_UNKWN and LT_UNKWN.
+
+        An empty kernel yields 0.0 instead of raising ZeroDivisionError.
+        """
+        if not self.kernel:
+            return 0.0
+        unmatched_counter = sum(
+            1
+            for instruction in self.kernel
+            if INSTR_FLAGS.TP_UNKWN in instruction['flags']
+            and INSTR_FLAGS.LT_UNKWN in instruction['flags']
+        )
+        return unmatched_counter / len(self.kernel)
+
+    def get_port_occupation_cycles(self):
+        """Return an OrderedDict pairing each machine-model port name with its throughput sum."""
+        throughput_values = self.semantics.get_throughput_sum(self.kernel)
+        port_names = self.machine_model['ports']
+        return collections.OrderedDict(zip(port_names, throughput_values))
+
+    def get_total_throughput(self):
+        """Return the largest value of the kernel's throughput sum."""
+        return max(self.semantics.get_throughput_sum(self.kernel))
+
+    def get_latency(self):
+        """Return the tuple ``(get_lcd(), get_cp())``."""
+        return (self.get_lcd(), self.get_cp())
+
+    def get_cp(self):
+        """Return the sum of ``latency_cp`` over the instructions on the critical path."""
+        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
+        kernel_cp = kernel_graph.get_critical_path()
+        return sum(x['latency_cp'] for x in kernel_cp)
+
+    def get_lcd(self):
+        """
+        Return the largest summed ``latency_lcd`` among all loop-carried dependencies.
+
+        Returns 0.0 when the dependency graph reports no loop-carried dependency.
+        """
+        kernel_graph = KernelDG(self.kernel, self.parser, self.machine_model)
+        lcd_dict = kernel_graph.get_loopcarried_dependencies()
+        lcd = 0.0
+        for dep in lcd_dict:
+            lcd_tmp = sum(x['latency_lcd'] for x in lcd_dict[dep]['dependencies'])
+            # max() keeps the current value on ties, exactly like the former conditional.
+            lcd = max(lcd, lcd_tmp)
+        return lcd
--- a/osaca/create_testcase.py
+++ b/osaca/create_testcase.py
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-from param import Register, MemAddr, Parameter
-from testcase import Testcase
-
-# Choose out of various operands
-reg8 = Register('al')
-reg16 = Register('ax')
-reg32 = Register('eax')
-reg64 = Register('rax')
-xmm = Register('xmm0')
-ymm = Register('ymm0')
-zmm = Register('zmm0')
-mem0 = MemAddr('(%rax, %esi, 4)')
-imd1 = Parameter('IMD')
-
-
-# -----------------------------------------------
-# -USER INPUT------------------------------------
-# -----------------------------------------------
-#  Enter your mnemonic
-mnemonic = 'add'
-
-# Define your operands. If you don't need it, just type in None
-dst = mem0
-op1 = imd1
-op2 = None
-
-# Define the number of instructions per loop (default: 12)
-per_loop = '32'
-
-# -----------------------------------------------
-# -----------------------------------------------
-
-# Start
-operands = [x for x in [dst, op1, op2] if x is not None]
-opListStr = ', '.join([str(x) for x in operands])
-print('Create Testcase for {} {}'.format(mnemonic, opListStr), end='')
-tc = Testcase(mnemonic, operands, per_loop)
-tc.write_testcase()
-print('  --------> SUCCEEDED')
--- a/osaca/data/CFL_data.csv
+++ b/osaca/data/CFL_data.csv
--- a/osaca/data/KBL_data.csv
+++ b/osaca/data/KBL_data.csv
--- a/osaca/data/bdw_data.csv
+++ b/osaca/data/bdw_data.csv
--- a/osaca/data/csx.yml
+++ b/osaca/data/csx.yml
@@ -0,0 +1,540 @@
+osaca_version: 0.3.2
+micro_architecture: Cascade Lake SP
+arch_code: CSX
+isa: x86
+ROB_size: 224
+retired_uOps_per_cycle: 4
+scheduler_size: 97
+hidden_loads: false
+load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0, zmm: 4.0}
+load_throughput:
+- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure:     [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure:     [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure:   [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure:   [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure:   [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure:   [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '23'], [1, ['2D', '3D']]]}
+ports: ['0', 0DV, '1', '2', 2D, '3', 3D, '4', '5', '6', '7']
+port_model_scheme: |
+  ┌------------------------------------------------------------------------┐
+  |                         97 entry unified scheduler                     |
+  └------------------------------------------------------------------------┘
+     0 |       1 |      2 |     3 |     4 |      5 |        6 |       7 |
+       ▼         ▼        ▼       ▼       ▼        ▼          ▼         ▼
+   ┌-------┐ ┌-------┐ ┌-----┐ ┌-----┐ ┌-----┐ ┌-------┐ ┌--------┐  ┌-----┐
+   |  ALU  | |  ALU  | |  LD | |  LD | |  ST | |  ALU  | |  ALU & |  | AGU |
+   └-------┘ └-------┘ └-----┘ └-----┘ └-----┘ └-------┘ |  Shift |  └-----┘
+   ┌-------┐ ┌-------┐ ┌-----┐ ┌-----┐         ┌-------┐ └--------┘
+   |  2ND  | |  Fast | | AGU | | AGU |         |  Fast |
+   | BRANCH| |  LEA  | └-----┘ └-----┘         |  LEA  |
+   └-------┘ └-------┘                         └-------┘
+   ┌-------┐ ┌-------┐                         ┌-------┐
+   |AVX DIV| |AVX FMA|                         |  AVX  |
+   └-------┘ └-------┘                         |  SHUF |
+   ┌-------┐ ┌-------┐                         └-------┘
+   |AVX FMA| |AVX MUL|                         ┌-------┐
+   └-------┘ └-------┘                         |AVX-512|
+   ┌-------┐ ┌-------┐                         |  FMA  |
+   |AVX MUL| |AVX ADD|                         └-------┘
+   └-------┘ └-------┘                         ┌-------┐
+   ┌-------┐ ┌-------┐                         |AVX-512|
+   |AVX ADD| |AVX ALU|                         |  ADD  |
+   └-------┘ └-------┘                         └-------┘
+   ┌-------┐ ┌-------┐                         ┌-------┐
+   |AVX ALU| |  AVX  |                         |AVX-512|
+   └-------┘ | Shift |                         |  MUL  |
+   ┌-------┐ └-------┘                         └-------┘
+   |  AVX  | ┌-------┐                         ┌-------┐
+   | Shift | |  Slow |                         |AVX-512|
+   └-------┘ |  LEA  |                         |  ALU  |
+   ┌-------┐ └-------┘                         └-------┘
+   |  VNNI | ┌-------┐
+   └-------┘ |  VNNI |
+             └-------┘
+instruction_forms:
+- name: addsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: addss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: addl
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: addq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: cmpl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: cmpq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: incq
+  operands:
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: ja
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: jb
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: jne
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: mulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: mulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: movl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: movq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: movq
+  operands:
+  - class: memory
+    base: gpr
+    offset: ~
+    index: ~
+    scale: 1
+  - class: register
+    name: gpr
+  throughput: 0.5
+  latency: 3.0  # 	1*p23+1*p2D3D
+  port_pressure: [[1, '23'], [1, [2D, 3D]]]
+- name: movq
+  operands:
+  - class: register
+    name: gpr
+  - class: memory
+    base: gpr
+    offset: imd
+    index: ~
+    scale: 1
+  throughput: 0.5
+  latency: 2.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: rcpss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 1.0
+  latency: 4.0
+  port_pressure: ~
+- name: sqrtsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 6.0
+  latency: 22.0  # 	1*p0+6*p0DV
+  port_pressure: [[1, '0'], [6.0, [0DV]]]
+- name: sqrtss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 3.0
+  latency: 16.0  # 	1*p0+3*p0DV
+  port_pressure: [[1, '0'], [3.0, [0DV]]]
+- name: subq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p0156
+  port_pressure: [[1, '0156']]
+- name: vaddpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vaddpd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vaddsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vaddss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vdivsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 4.0
+  latency: 14.0  # 	1*p0+4*p0DV
+  port_pressure: [[1, '0'], [4.0, [0DV]]]
+- name: vdivss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 3.0
+  latency: 11.0  # 	1*p0+3*p0DV
+  port_pressure: [[1, '0'], [3.0, [0DV]]]
+- name: vfmadd213pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vfmadd132pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vfmadd231pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vfmadd132pd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vfmadd213pd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vfmadd231pd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  throughput: 1.0
+  latency: 4.0  # 	1*p23+1*p2D3D
+  port_pressure: [[1, '23'], [1, [2D, 3D]]]
+- name: vmovapd
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	1*p23+1*p2D3D
+  port_pressure: [[1, '23'], [1, [2D, 3D]]]
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 5.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 5.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 5.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 5.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p23+1*p2D3D
+  port_pressure: [[1, '23'], [1, [2D, 3D]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p23+1*p4
+  port_pressure: [[1, '23'], [1, '4']]
--- a/osaca/data/hsw_data.csv
+++ b/osaca/data/hsw_data.csv
--- a/osaca/data/isa/aarch64.yml
+++ b/osaca/data/isa/aarch64.yml
@@ -0,0 +1,374 @@
+osaca_version: 0.3.0
+isa: "AArch64"
+# Contains all operand-irregular instruction forms OSACA supports for AArch64.
+# Operand-regular for an AArch64 instruction form with N operands in the shape of
+#   mnemonic op1 ... opN
+# means that op1 is the only destination operand and op2 to op(N) are source operands.
+instruction_forms:
+    - name: "fmla"
+      operands:
+        - class: "register"
+          prefix: "v"
+          shape: "s"
+          source: true
+          destination: true
+        - class: "register"
+          prefix: "v"
+          shape: "s"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "v"
+          shape: "s"
+          source: true
+          destination: false
+    - name: "fmla"
+      operands:
+        - class: "register"
+          prefix: "v"
+          shape: "d"
+          source: true
+          destination: true
+        - class: "register"
+          prefix: "v"
+          shape: "d"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "v"
+          shape: "d"
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true
+          source: true
+          destination: false   
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "d"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: ~ 
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true 
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: ~ 
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: true
+          destination: false
+    - name: "ldp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "register"
+          prefix: "q"
+          source: false
+          destination: true
+        - class: "memory"
+          base: "x"
+          offset: "imd" 
+          index: ~
+          scale: 1
+          pre-indexed: true
+          post-indexed: false 
+          source: true
+          destination: true
+    - name: "stp"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true   
+    - name: "stp"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "stp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "stp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: True
+          source: false
+          destination: true
+    - name: "stp"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "x"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: "imd"
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "d"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: "x"
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "q"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "x"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: ~
+          scale: 1
+          pre-indexed: false
+          post-indexed: true
+          source: false
+          destination: true
+    - name: "str"
+      operands:
+        - class: "register"
+          prefix: "x"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "x"
+          offset: ~
+          index: "x"
+          scale: 1
+          pre-indexed: false
+          post-indexed: false
+          source: false
+          destination: true
--- a/osaca/data/isa/x86.yml
+++ b/osaca/data/isa/x86.yml
@@ -0,0 +1,218 @@
+osaca_version: 0.3.0
+isa: "x86"
+# Contains all operand-irregular instruction forms OSACA supports for x86.
+# Operand-regular for an x86 AT&T instruction form with N operands in the shape of
+#   mnemonic op1 ... opN
+# means that opN is the only destination operand and op1 to op(N-1) are source operands.
+instruction_forms:
+    - name: addl
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: addq
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: addsd
+      operands:
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: true
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: false
+    - name: addss
+      operands:
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: true
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: false
+    - name: cmpl
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+    - name: cmpq
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+    - name: cmpq
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "gpr"
+          offset: "imd"
+          index: ~
+          scale: 1
+          source: true
+          destination: false
+    - name: ja
+      operands:
+        - class: "identifier"
+          source: true
+          destination: false
+    - name: mulsd
+      operands:
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: true
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: false
+    - name: mulss
+      operands:
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: true
+        - class: "register"
+          name: "xmm"
+          source: true
+          destination: false
+    - name: subq
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: vfmadd132pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: ~
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+    - name: vfmadd132pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: "imd"
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+    - name: vfmadd213pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: ~
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+    - name: vfmadd213pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: "imd"
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+    - name: vfmadd231pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: "imd"
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+    - name: vfmadd231pd
+      operands:
+        - class: "memory"
+          base: "gpr"
+          offset: ~
+          index: "gpr"
+          scale: 1
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: false
+        - class: "register"
+          name: "ymm"
+          source: true
+          destination: true
+
--- a/osaca/data/ivb_data.csv
+++ b/osaca/data/ivb_data.csv
--- a/osaca/data/model_importer.py
+++ b/osaca/data/model_importer.py
@@ -1,222 +1,202 @@
 #!/usr/bin/env python3
-from collections import defaultdict, OrderedDict
-import xml.etree.ElementTree as ET
-import re
-import sys
 import argparse
+import sys
+import xml.etree.ElementTree as ET
 from distutils.version import StrictVersion

-from osaca.param import Parameter, Register
-from osaca.eu_sched import Scheduler
+from osaca.parser import get_parser
+from osaca.semantics import MachineModel


-def normalize_reg_name(reg_name):
-    # strip spaces
-    reg_name = reg_name.strip()
-    # masks are denoted with curly brackets in uops.info
-    reg_name = re.sub(r'{K([0-7])}', r'K\1', reg_name)
-    reg_name = re.sub(r'ST\(([0-7])\)', r'ST\1', reg_name)
-    return reg_name
+def port_pressure_from_tag_attributes(attrib):
+    # '1*p015+1*p1+1*p23+1*p4+3*p5' ->
+    # [[1, '015'], [1, '1'], [1, '23'], [1, '4'], [3, '5']]
+    port_occupation = []
+    for p in attrib['ports'].split('+'):
+        cycles, ports = p.split('*p')
+        port_occupation.append([int(cycles), ports])

-
-def port_occupancy_from_tag_attributes(attrib, arch):
-    occupancy = defaultdict(int)
-    for k, v in attrib.items():
-        m = re.match('^port([0-9]+)', k)
-        if not m:
-            continue
-        ports = m.group(1)
-        # Ignore Port7 on HSW, BDW, SKL and SKX if present in combination with ports 2 and 3.
-        # Port7 is only used for simple address generation, while 2 and 3 handle all addressing,
-        # but uops.info does not differentiate.
-        if arch in ['HSW', 'BDW', 'SKL', 'SKX'] and ports == '237':
-            ports = ports.replace('7', '')
-        potential_ports = list(ports)
-        per_port_occupancy = int(v) / len(potential_ports)
-        for pp in potential_ports:
-            occupancy[pp] += per_port_occupancy
-
-    # Also consider DIV pipeline
+    # Also consider DIV pipeline
    if 'div_cycles' in attrib:
-        occupancy['0DV'] = int(attrib['div_cycles'])
+        port_occupation.append([int(attrib['div_cycles']), ['DIV']])

-    return dict(occupancy)
+    return port_occupation


-def extract_paramters(instruction_tag):
+def extract_paramters(instruction_tag, parser, isa):
    # Extract parameter components
    parameters = []  # used to store string representations
-    parameter_tags = sorted(instruction_tag.findall("operand"),
-                            key=lambda p: int(p.attrib['idx']))
+    parameter_tags = sorted(instruction_tag.findall("operand"), key=lambda p: int(p.attrib['idx']))
    for parameter_tag in parameter_tags:
+        parameter = {}
        # Ignore parameters with suppressed=1
        if int(parameter_tag.attrib.get('suppressed', '0')):
            continue

        p_type = parameter_tag.attrib['type']
        if p_type == 'imm':
-            parameters.append('imd')  # Parameter('IMD')
+            parameter['class'] = 'immediate'
+            parameter['imd'] = 'int'
+            parameters.append(parameter)
        elif p_type == 'mem':
-            parameters.append('mem')  # Parameter('MEM')
+            parameter['class'] = 'memory'
+            parameter['base'] = 'gpr'
+            parameter['offset'] = None
+            parameter['index'] = None
+            parameter['scale'] = 1
+            parameters.append(parameter)
        elif p_type == 'reg':
-            possible_regs = [normalize_reg_name(r)
-                             for r in parameter_tag.text.split(',')]
-            reg_groups = [Register.sizes.get(r, None) for r in possible_regs]
-            if reg_groups[1:] == reg_groups[:-1]:
-                if reg_groups[0] is None:
-                    raise ValueError("Unknown register type for {} with {}.".format(
-                        parameter_tag.attrib, parameter_tag.text))
-                elif reg_groups[0][1] == 'GPR':
-                    parameters.append('r{}'.format(reg_groups[0][0]))
-                    # Register(possible_regs[0]))
-                elif '{' in parameter_tag.text:
-                    # We have a mask
-                    parameters[-1] += '{opmask}'
+            parameter['class'] = 'register'
+            possible_regs = [parser.parse_register('%' + r) for r in parameter_tag.text.split(',')]
+            if possible_regs[0] is None:
+                raise ValueError(
+                    'Unknown register type for {} with {}.'.format(
+                        parameter_tag.attrib, parameter_tag.text
+                    )
+                )
+            if isa == 'x86':
+                if parser.is_vector_register(possible_regs[0]['register']):
+                    possible_regs[0]['register']['name'] = possible_regs[0]['register'][
+                        'name'
+                    ].lower()[:3]
+                    if 'mask' in possible_regs[0]['register']:
+                        possible_regs[0]['register']['mask'] = True
                else:
-                    parameters.append(reg_groups[0][1].lower())
+                    possible_regs[0]['register']['name'] = 'gpr'
+            elif isa == 'aarch64':
+                del possible_regs[0]['register']['name']  # possible_regs is a list, index it
+            for key in possible_regs[0]['register']:
+                parameter[key] = possible_regs[0]['register'][key]
+            parameters.append(parameter)
        elif p_type == 'relbr':
-            parameters.append('LBL')
+            parameter['class'] = 'identifier'
+            parameters.append(parameter)
        elif p_type == 'agen':
-            parameters.append('mem')
+            # FIXME actually only address generation
+            parameter['class'] = 'memory'
+            parameter['base'] = 'gpr'
+            parameter['offset'] = None
+            parameter['index'] = None
+            parameter['scale'] = 1
+            parameters.append(parameter)
+            parameters.append(parameter)  # NOTE(review): same dict appended twice - confirm intent
        else:
            raise ValueError("Unknown paramter type {}".format(parameter_tag.attrib))
    return parameters


 def extract_model(tree, arch):
-    model_data = []
-    for instruction_tag in tree.findall('//instruction'):
+    isa = MachineModel.get_isa_for_arch(arch)
+    mm = MachineModel(isa=isa)
+    parser = get_parser(isa)
+
+    for instruction_tag in tree.findall('.//instruction'):
        ignore = False

        mnemonic = instruction_tag.attrib['asm']

        # Extract parameter components
        try:
-            parameters = extract_paramters(instruction_tag)
+            parameters = extract_paramters(instruction_tag, parser, isa)
+            if isa == 'x86':
+                parameters.reverse()
        except ValueError as e:
            print(e, file=sys.stderr)

        # Extract port occupation, throughput and latency
-        port_occupancy, throughput, latency = [], 0.0, None
-        arch_tag = instruction_tag.find('architecture[@name="'+arch+'"]')
+        port_pressure, throughput, latency, uops = [], None, None, None
+        arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
        if arch_tag is None:
            continue
        # We collect all measurement and IACA information and compare them later
        for measurement_tag in arch_tag.iter('measurement'):
-            port_occupancy.append(port_occupancy_from_tag_attributes(measurement_tag.attrib, arch))
-            # FIXME handle min/max Latencies ('maxCycles' and 'minCycles')
-            latencies = [int(l_tag.attrib['cycles'])
-                         for l_tag in measurement_tag.iter('latency') if 'latency' in l_tag.attrib]
-
+            if 'TP_ports' in measurement_tag.attrib:
+                throughput = measurement_tag.attrib['TP_ports']
+            else:
+                throughput = (
+                    measurement_tag.attrib['TP'] if 'TP' in measurement_tag.attrib else None
+                )
+            uops = (
+                int(measurement_tag.attrib['uops']) if 'uops' in measurement_tag.attrib else None
+            )
+            if 'ports' in measurement_tag.attrib:
+                port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
+            latencies = [
+                int(l_tag.attrib['cycles'])
+                for l_tag in measurement_tag.iter('latency')
+                if 'cycles' in l_tag.attrib
+            ]
+            if len(latencies) == 0:
+                latencies = [
+                    int(l_tag.attrib['max_cycles'])
+                    for l_tag in measurement_tag.iter('latency')
+                    if 'max_cycles' in l_tag.attrib
+                ]
            if latencies[1:] != latencies[:-1]:
                print("Contradicting latencies found:", mnemonic, file=sys.stderr)
                ignore = True
            elif latencies:
                latency = latencies[0]
+
        # Ordered by IACA version (newest last)
-        for iaca_tag in sorted(arch_tag.iter('IACA'),
-                               key=lambda i: StrictVersion(i.attrib['version'])):
-            port_occupancy.append(port_occupancy_from_tag_attributes(iaca_tag.attrib, arch))
-        if ignore: continue
+        for iaca_tag in sorted(
+            arch_tag.iter('IACA'), key=lambda i: StrictVersion(i.attrib['version'])
+        ):
+            if 'ports' in iaca_tag.attrib:
+                port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))
+        if ignore:
+            continue

        # Check if all are equal
-        if port_occupancy:
-            if port_occupancy[1:] != port_occupancy[:-1]:
-                print("Contradicting port occupancies, using latest IACA:", mnemonic,
-                      file=sys.stderr)
-            port_occupancy = port_occupancy[-1]
-            throughput = max(list(port_occupancy.values())+[0.0])
+        if port_pressure:
+            if port_pressure[1:] != port_pressure[:-1]:
+                print(
+                    "Contradicting port occupancies, using latest IACA:", mnemonic, file=sys.stderr
+                )
+            port_pressure = port_pressure[-1]
+
+            # Add missing ports:
+            for ports in [pp[1] for pp in port_pressure]:
+                for p in ports:
+                    mm.add_port(p)
+
+            throughput = max(mm.average_port_pressure(port_pressure))
        else:
            # print("No data available for this architecture:", mnemonic, file=sys.stderr)
            continue
+        # ---------------------------------------------
+        mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)

-        for m, p in build_variants(mnemonic, parameters):
-            model_data.append((m.lower() + '-' + '_'.join(p),
-                              throughput, latency, port_occupancy))
-
-    return model_data
-
-
-def all_or_false(iterator):
-    if not iterator:
-        return False
-    else:
-        return all(iterator)
-
-
-def build_variants(mnemonic, parameters):
-    """Yield all resonable variants of this instruction form."""
-    # The one that was given
-    mnemonic = mnemonic.upper()
-    yield mnemonic, parameters
-
-    # Without opmask
-    if any(['{opmask}' in p for p in parameters]):
-        yield mnemonic, list([p.replace('{opmask}', '') for p in parameters])
-
-    # With suffix (assuming suffix was not already present)
-    suffixes = {'Q': 'r64',
-                'L': 'r32',
-                'W': 'r16',
-                'B': 'r8'}
-    for s, reg in suffixes.items():
-        if not mnemonic.endswith(s) and all_or_false(
-                [p == reg for p in parameters if p not in ['mem', 'imd']]):
-            yield mnemonic+s, parameters
+    return mm


 def architectures(tree):
    return set([a.attrib['name'] for a in tree.findall('.//architecture')])


-def int_or_zero(s):
-    try:
-        return int(s)
-    except ValueError:
-        return 0
-
-
-def dump_csv(model_data, arch):
-    csv = 'instr,TP,LT,ports\n'
-    ports = set()
-    for mnemonic, throughput, latency, port_occupancy in model_data:
-        for p in port_occupancy:
-            ports.add(p)
-    ports = sorted(ports)
-    # If not all ports have been used (happens with port7 due to blacklist
-    # port_occupancy_from_tag_attributes), extend list accordingly:
-    while len(ports) < Scheduler.arch_dict[arch] + len(Scheduler.arch_pipeline_ports.get(arch, [])):
-        max_index = ports.index(str(max(map(int_or_zero, ports))))
-        ports.insert(max_index + 1, str(max(map(int_or_zero, ports)) + 1))
-
-    for mnemonic, throughput, latency, port_occupancy in model_data:
-        for p in ports:
-            if p not in port_occupancy:
-                port_occupancy[p] = 0.0
-        po_items = sorted(port_occupancy.items())
-        csv_line = '{},{},{},"({})"\n'.format(mnemonic, throughput, latency,
-                                              ','.join([str(c) for p, c in po_items]))
-        csv += csv_line
-    return csv
-
-
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('xml', help='path of instructions.xml from http://uops.info')
-    parser.add_argument('arch', nargs='?',
-                        help='architecture to extract, use IACA abbreviations (e.g., SNB). '
-                             'if not given, all will be extracted and saved to file in CWD.')
+    parser.add_argument(
+        'arch',
+        nargs='?',
+        help='architecture to extract, use IACA abbreviations (e.g., SNB). '
+        'if not given, all will be extracted and saved to file in CWD.',
+    )
    args = parser.parse_args()

    tree = ET.parse(args.xml)
+    print('Available architectures:', ', '.join(architectures(tree)))
    if args.arch:
-        model_data = extract_model(tree, args.arch)
-        print(dump_csv(model_data, args.arch))
+        model = extract_model(tree, args.arch)
+        print(model.dump())
    else:
        for arch in architectures(tree):
-            model_data = extract_model(tree, arch)
-            with open('{}_data.csv'.format(arch), 'w') as f:
-                f.write(dump_csv(model_data, arch))
+            print(arch, end='')
+            model = extract_model(tree, arch.lower())
+            with open('{}.yml'.format(arch.lower()), 'w') as f:
+                model.dump(f)
+            print('.')


 if __name__ == '__main__':
--- a/osaca/data/nhm_data.csv
+++ b/osaca/data/nhm_data.csv
--- a/osaca/data/skl_data.csv
+++ b/osaca/data/skl_data.csv
--- a/osaca/data/skx_data.csv
+++ b/osaca/data/skx_data.csv
--- a/osaca/data/snb_data.csv
+++ b/osaca/data/snb_data.csv
--- a/osaca/data/tx2.yml
+++ b/osaca/data/tx2.yml
@@ -0,0 +1,698 @@
+osaca_version: 0.3.2
+micro_architecture: Thunder X2
+arch_code: tx2
+isa: AArch64
+ROB_size: 180
+retired_uOps_per_cycle: 4
+scheduler_size: 60
+hidden_loads: false
+load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 4.0, q: 4.0, v: 4.0}
+load_throughput:
+- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:    [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure:   [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:     [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:    [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure:    [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure:   [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure:     [1, '34']}
+- {base: x, index: ~, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure:    [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:  [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:   [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:  [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure:  [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure:   [1, '34']}
+- {base: x, index: ~, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure:  [1, '34']}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:    [1, '34']}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure:   [1, '34']}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:     [1, '34']}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:    [1, '34']}
+- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: true, port_pressure:    [1, '34']}
+- {base: x, index: x, offset: ~, scale: 8, pre-indexed: false, post-indexed: false, port_pressure:   [1, '34']}
+- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: true, port_pressure:     [1, '34']}
+- {base: x, index: x, offset: ~, scale: 8, pre-indexed: true, post-indexed: false, port_pressure:    [1, '34']}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:  [1, '34']}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:   [1, '34']}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:  [1, '34']}
+- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: true, port_pressure:  [1, '34']}
+- {base: x, index: x, offset: imd, scale: 8, pre-indexed: false, post-indexed: false, port_pressure: [1, '34']}
+- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: true, port_pressure:   [1, '34']}
+- {base: x, index: x, offset: imd, scale: 8, pre-indexed: true, post-indexed: false, port_pressure:  [1, '34']}
+ports: ['0', 0DV, '1', 1DV, '2', '3', '4', '5']
+port_model_scheme: |
+  ┌-----------------------------------------------------------┐
+  |                  60 entry unified scheduler               |
+  └-----------------------------------------------------------┘
+    0 |       1 |       2 |        3 |      4 |       5 |
+      ▼         ▼         ▼          ▼        ▼         ▼
+   ┌------┐  ┌------┐  ┌------┐  ┌------┐  ┌------┐  ┌------┐
+   | ALU  |  | ALU  |  | ALU/ |  |  LD  |  |  LD  |  |  ST  |
+   └------┘  └------┘  |  BR  |  └------┘  └------┘  └------┘
+   ┌------┐  ┌------┐  └------┘  ┌------┐  ┌------┐
+   |  FP/ |  |  FP/ |            | AGU  |  | AGU  |
+   | NEON |  | NEON |            └------┘  └------┘
+   └------┘  └------┘
+             ┌------┐
+             | INT  |
+             | MUL/ |
+             |  DIV |
+             └------┘
+             ┌------┐
+             |CRYPTO|
+             └------┘
+instruction_forms:
+- name: add
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: add
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: adds
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: b.ne
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: b.gt
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: bne
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: cmp
+  operands:
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: cmp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fdiv
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 8.5
+  latency: 16.0  # 	1*p01+17*p0DV1DV
+  port_pressure: [[1, '01'], [17.0, [0DV, 1DV]]]
+- name: fdiv
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 12.0
+  latency: 23.0  # 	1*p01+24*p0DV1DV
+  port_pressure: [[1, '01'], [24.0, [0DV, 1DV]]]
+- name: fmla
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fmla
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fmov
+  operands:
+  - {class: register, prefix: s}
+  - {class: immediate, imd: double}
+  latency: ~  # 	1*p01
+  port_pressure: [[1, '01']]
+  throughput: 0.5
+- name: fmul
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fmul
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fmul
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 6.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: true
+    post-indexed: false
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: ~  # 	2*p34
+  port_pressure: [[2.0, '34']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 4.0  # 	1*p34
+  port_pressure: [[1.0, '34']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 4.0  # 	1*p34
+  port_pressure: [[1.0, '34']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: ~
+    index: x
+    scale: 8
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 4.0  # 	1*p34
+  port_pressure: [[1.0, '34']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: mov
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.5
+  latency: 1.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: mov
+  operands:
+  - class: register
+    prefix: v
+    shape: b
+  - class: register
+    prefix: v
+    shape: b
+  throughput: 0.5
+  latency: 5.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: prfm
+  operands:
+  - class: prfop
+    type: pld
+    target: l1
+    policy: keep
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: ~
+  latency: ~
+  port_pressure: []
+- name: stp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 2.0
+  latency: ~  # 	4*p34
+  port_pressure: [[4.0, '34']]
+- name: stp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 2.0
+  latency: ~  # 	4*p34
+  port_pressure: [[4.0, '34']]
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 2.0
+  latency: ~  # 	2*p34+2*p5
+  port_pressure: [[2.0, '34'], [2.0, '5']]
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 2.0
+  latency: ~  # 	2*p34+2*p5
+  port_pressure: [[2.0, '34'], [2.0, '5']]
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 2.0
+  latency: ~  # 	2*p34+2*p5
+  port_pressure: [[2.0, '34'], [2.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: x
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: ~
+    index: x
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 4.0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
--- a/osaca/data/wsm_data.csv
+++ b/osaca/data/wsm_data.csv
--- a/osaca/data/zen1.yml
+++ b/osaca/data/zen1.yml
@@ -0,0 +1,539 @@
+osaca_version: 0.3.2
+micro_architecture: AMD Zen (family 17h)
+arch_code: ZEN1
+isa: x86
+load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0}
+load_throughput_multiplier: {gpr: 1.0, xmm: 1.0, ymm: 2.0}
+load_throughput:
+- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure:     [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure:     [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
+hidden_loads: false
+ports: ['0', '1', '2', '3', 3DV, '4', '5', '6', '7', '8', '9', 8D, 9D, ST]
+port_model_scheme: |
+  ┌--------------------------------------┐  ┌-----------------------------------------------┐
+  |       96 entries OoO scheduler       |  |           84 entries OoO scheduler            |
+  └--------------------------------------┘  └-----------------------------------------------┘
+     0 |        1 |       2 |       3 |        4 |     5 |     6 |     7 |      8 |     9 |
+       ▼         ▼         ▼         ▼          ▼       ▼       ▼       ▼        ▼       ▼
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐  ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ ┌-----┐ ┌-----┐
+   |SSE ALU| |SSE ALU| |SSE ALU| |SSE ALU|  | ALU  | | ALU | | ALU | | ALU  | | AGU | | AGU |
+   └-------┘ └-------┘ └-------┘ └-------┘  └------┘ └-----┘ └-----┘ └------┘ └-----┘ └-----┘
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐  ┌------┐ ┌-----┐ ┌-----┐ ┌------┐    |       |
+   |SSE MUL| |SSE MUL| |SSE ADD| |SSE ADD|  |BRANCH| | MUL | | MUL | |BRANCH|    ▼       ▼
+   └-------┘ └-------┘ └-------┘ └-------┘  └------┘ └-----┘ └-----┘ └------┘ ┌-------------┐ 
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐                                    |    LOAD     |
+   |SSE FMA| |SSE FMA| |  SSE  | |SSE DIV|                                    └-------------┘
+   └-------┘ └-------┘ |  SHUF | └-------┘                                    ┌-------------┐ 
+             ┌-------┐ └-------┘                                              |    LOAD     |
+             |  SSE  |                                                        └-------------┘
+             |  SHUF |                                                        ┌-------------┐
+             └-------┘                                                        |    STORE    |
+                                                                              └-------------┘
+instruction_forms:
+- name: add
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: add
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: addl
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: addq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: cmpl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: cmpq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: incq
+  operands:
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567     
+  port_pressure: [[1, '4567']]
+- name: ja
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: jb
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: jne
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: leaq
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: ~
+    scale: 1
+  - class: register
+    name: gpr
+  throughput: 0.5
+  latency: ~  # 	1*p89
+  port_pressure: [[1, '89']]
+- name: movl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: mulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: mulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: rcpss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #1.0
+  latency: 5.0
+  port_pressure: []
+- name: sqrtsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #8.0
+  latency: 23.0
+  port_pressure: []
+- name: sqrtss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #5.0
+  latency: 17.0
+  port_pressure: []
+- name: subq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: subq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: vaddpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 3.0  # 	2*p23
+  port_pressure: [[2, '23']]
+- name: vaddsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p23
+  port_pressure: [[1, '23']]
+- name: vaddss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p23
+  port_pressure: [[1, '23']]
+- name: vdivsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 4.0
+  latency: 13.0  # 	1*p3+4*p3DV
+  port_pressure: [[1, '3'], [4.0, [3DV]]]
+- name: vdivss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 3.0
+  latency: 10.0
+  port_pressure: [[1, '3'], [3.0, [3DV]]]
+- name: vfmadd213pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vfmadd231pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vfmadd132pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vmulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulpd
+  operands:
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01+1*p89+1*p8D9D
+  port_pressure: [[1, '01'], [1, '89'], [1, [8D, 9D]]]
+- name: vmulpd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovaps
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovaps
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p89+1*p8D9D
+  port_pressure: [[1, '89'], [1, [8D, 9D]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: ~
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: ~
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
--- a/osaca/data/zen_data.csv
+++ b/osaca/data/zen_data.csv
@@ -1,138 +0,0 @@
-instr,TP,LT,ports
-jae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-ja-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jcxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jecxz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-je-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jg-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jmp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jmpq-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnae-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jna-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnbe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnb-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnc-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jne-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnge-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jng-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnle-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnl-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jno-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jns-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jnz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jpe-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jp-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jpo-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-js-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-jz-lbl,0.0,0.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-add-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-add-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-addl-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-addq-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-addl-mem_imd,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-addq-mem_imd,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-add-mem_r32,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-add-mem_r64,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-addl-mem_r32,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-addq-mem_r64,1.0,7.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0)"
-cmp-mem_r32,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
-cmpl-mem_r32,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
-cmp-r32_mem,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
-cmpl-r32_mem,0.5,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
-cmp-r32_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-cmpl-r32_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-cmp-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-cmp-r64_r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-cmpq-r64_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-cmpq-r64_r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-inc-r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-incq-r64,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-incl-r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-mov-mem_r64,1.0,4.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-mov-r64_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-mov-r32_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-movq-mem_r64,1.0,4.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-movq-r64_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-movl-r32_mem,0.5,3.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-movslq-r64_r32,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-sub-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-vaddpd-ymm_ymm_mem,1.0,3.0,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vaddsd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vaddsd-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
-vaddss-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
-vcvtsi2ss-xmm_xmm_r32,1.0,4.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-vcvtss2si-r32_xmm,1.0,7.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-cvtsi2ss-xmm_r32,1.0,8.0,"(1.0, 1.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213pd-ymm_ymm_ymm,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213pd-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213ps-ymm_ymm_ymm,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213ps-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213sd-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd213ss-xmm_xmm_xmm,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd132sd-xmm_xmm_mem,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vfmadd132pd-xmm_xmm_mem,0.5,5.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vfmadd132pd-ymm_ymm_mem,1.0,5.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vinsertf128-ymm_ymm_imd,0.6666666666666667,1.0,"(-1,)"
-vmovsd-mem_xmm,1.0,8.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vmovsd-xmm_mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vmulpd-ymm_ymm_ymm,1.0,4.0,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vmulsd-xmm_xmm_mem,0.5,4.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vmulsd-xmm_xmm_xmm,0.5,4.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vmulss-xmm_xmm_xmm,0.5,3.0,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vsubpd-ymm_ymm_mem,1.0,3.0,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vsubsd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vsubsd-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
-vsubss-xmm_xmm_xmm,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)"
-vmovaps-xmm_mem,0.5,3.0,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vmovaps-mem_xmm,1.0,5.0,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vmovapd-ymm_mem,1.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vmovapd-mem_ymm,2.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
-movq-r64_xmm,1.0,-1.0,"(0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0)"
-#prefetcht0-mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-#prefetchw-mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-cmpl-r32_imd,0.25,1.0,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0, 0)"
-vaddpd-xmm_xmm_xmm,0.5,3,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0)"
-vaddpd-ymm_ymm_ymm,1,3,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-vcvtdq2pd-xmm_xmm,1,7,"(0.5, 0.5, 0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-vcvtdq2pd-ymm_xmm,2,7,"(1.0, 1.0, 0, 2.0, 0, 0, 0, 0, 0, 0, 0)"
-vcvtsi2sd-xmm_xmm_r32,1,4,"(0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0)"
-vextracti128-xmm_ymm_imd,0.3333333333333333,1,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd132pd-xmm_xmm_xmm,0.5,5,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd132pd-ymm_ymm_ymm,1,5,"(1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vfmadd132sd-xmm_xmm_xmm,0.5,5,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vmulpd-xmm_xmm_xmm,0.5,4,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vpaddd-xmm_xmm_xmm,0.3333333333333333,1,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
-vpaddd-ymm_ymm_ymm,0.6666666666666667,1,"(0.66, 0.66, 0, 0.66, 0, 0, 0, 0, 0, 0, 0)"
-vpshufd-xmm_xmm_imd,0.5,1,"(0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0)"
-vxorpd-xmm_xmm_xmm,0.25,1,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)"
-vxorps-xmm_xmm_xmm,0.25,1,"(0.25, 0.25, 0.25, 0.25, 0, 0, 0, 0, 0, 0, 0)"
-vdivpd-xmm_xmm_xmm,4,8,"(0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)"
-vdivsd-xmm_xmm_xmm,4,8,"(0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0)"
-vmovups-mem_xmm,0.5,8,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vmovups-xmm_mem,1,8,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vaddpd-xmm_xmm_mem,0.5,3.0,"(0, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 0.5, 0.5)"
-vmulpd-xmm_xmm_mem,0.5,4,"(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vmulpd-ymm_ymm_mem,1,4,"(1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
-vinsertf128-ymm_ymm_mem_imd,1,5,"(0.33, 0.33, 0, 0.33, 0, 0, 0, 0, 0, 0, 0)"
-vmovupd-xmm_mem,0.5,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vmovupd-mem_xmm,1,1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
-vmovupd-ymm_mem,3.0,-1.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-vmovupd-mem_ymm,2,2,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 2.0)"
-movupd-xmm_mem,0.5,-1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0)"
-pushq-r64,0.5,-1,"(-1,)"
-cmpq-r64_mem,0.5,-1,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
-movq-r64_r64,0.2,-1,"(-1,)"
-subq-r64_r64,0.25,1,"(-1,)"
-cmpq-mem_r64,0.5,-1,"(0, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5)"
--- a/osaca/db_interface.py
+++ b/osaca/db_interface.py
@@ -0,0 +1,406 @@
+#!/usr/bin/env python3
+
+import math
+import os
+import warnings
+
+import ruamel.yaml
+
+from osaca.semantics import MachineModel
+
+
+def sanity_check(arch: str, verbose=False):
+    """
+    Check the uarch DB of ``arch`` against the DB of its ISA and print a sanity report.
+
+    :param arch: architecture name the arch machine model is loaded for
+    :param verbose: forwarded to the report printer
+    """
+    # load arch machine model
+    arch_mm = MachineModel(arch=arch)
+    instruction_forms = arch_mm['instruction_forms']
+    # load machine model of the ISA reported by the arch model
+    isa_mm = MachineModel(arch='isa/{}'.format(arch_mm.get_ISA()))
+
+    # check arch DB entries
+    arch_findings = _check_sanity_arch_db(arch_mm, isa_mm)
+    (
+        missing_throughput,
+        missing_latency,
+        missing_port_pressure,
+        wrong_port,
+        suspicious_instructions,
+        duplicate_instr_arch,
+    ) = arch_findings
+    # check ISA DB entries
+    isa_findings = _check_sanity_isa_db(arch_mm, isa_mm)
+    duplicate_instr_isa, only_in_isa = isa_findings
+
+    _print_sanity_report(
+        len(instruction_forms),
+        missing_throughput, missing_latency, missing_port_pressure, wrong_port,
+        suspicious_instructions, duplicate_instr_arch, duplicate_instr_isa, only_in_isa,
+        verbose=verbose,
+    )
+
+
+def import_benchmark_output(arch, bench_type, filepath, output=None):
+    """
+    Import benchmark measurements from ``filepath`` into the machine model of ``arch`` and
+    dump it to ``output`` (default: ``sys.stdout``) instead of overwriting ``filepath``.
+    """
+    import sys
+    assert os.path.exists(filepath)
+    if bench_type not in ('ibench', 'asmbench'):
+        raise ValueError('Benchmark type is not supported.')
+    with open(filepath, 'r') as f:
+        input_data = f.readlines()
+    if bench_type == 'asmbench':
+        raise NotImplementedError
+    mm = MachineModel(arch)
+    db_entries = _get_ibench_output(input_data, mm.get_ISA())
+    # write entries to DB: pass the entry dicts, not their 'mnemonic-operands' keys
+    for entry in db_entries.values():
+        mm.set_instruction_entry(entry)
+    mm.dump(sys.stdout if output is None else output)
+
+##################
+# HELPERS IBENCH #
+##################
+
+
+def _get_ibench_output(input_data, isa):
+    """
+    Parse ibench output lines into DB entries keyed by '<mnemonic>-<operand codes>'.
+
+    Throughput ('TP') and latency ('LT') lines with the same key end up in one entry.
+    """
+    db_entries = {}
+    for line in input_data:
+        if 'Using frequency' in line or not line.strip():  # lines keep their newline
+            continue
+        instruction = line.split(':')[0]
+        key = '-'.join(instruction.split('-')[:2])
+        if key not in db_entries:
+            operand_codes = instruction.split('-')[1].split('_')
+            db_entries[key] = {
+                'name': instruction.split('-')[0],
+                'operands': [_create_db_operand(op, isa) for op in operand_codes],
+                'throughput': None,
+                'latency': None,
+                'port_pressure': None,
+            }
+        if 'TP' in instruction:
+            label, field = 'THROUGHPUT', 'throughput'
+        elif 'LT' in instruction:
+            label, field = 'LATENCY', 'latency'
+        else:
+            continue
+        value = _validate_measurement(float(line.split()[1]), field == 'throughput')
+        db_entries[key][field] = value
+        if value is None:  # a valid 0.0 is falsy, so compare against None explicitly
+            warnings.warn(
+                'Your {} measurement for {} looks suspicious'.format(label, key)
+                + ' and was not added. Please inspect your benchmark.'
+            )
+    return db_entries
+
+
+def _validate_measurement(measurement, is_tp):
+    """
+    Round a plausible measurement to its estimated value, return None for a suspicious one.
+    Latencies must lie within 5% of their floor or ceiling, throughputs within 5% of 1/n
+    for an integer n from 1 to 10.
+    """
+    if not is_tp:
+        if (
+            math.floor(measurement) * 1.05 >= measurement
+            or math.ceil(measurement) * 0.95 <= measurement
+        ):
+            return float(round(measurement))
+    else:
+        for reci in (1 / x for x in range(1, 11)):
+            if reci * 0.95 <= measurement <= reci * 1.05:
+                return round(reci, 5)
+    return None
+
+
+def _create_db_operand(operand, isa):
+    """Translate an ibench operand code into a DB operand dict for the given ISA."""
+    if isa.lower() == 'aarch64':
+        return _create_db_operand_aarch64(operand)
+    elif isa.lower() == 'x86':
+        return _create_db_operand_x86(operand)
+    raise ValueError('ISA {} is not supported'.format(isa))
+
+
+def _create_db_operand_aarch64(operand):
+    """Create the DB operand dict for an AArch64 operand code of an ibench instruction."""
+    if operand == 'i':
+        return {'class': 'immediate', 'imd': 'int'}
+    elif operand in tuple('wxbhsdq'):  # exactly one register prefix letter
+        return {'class': 'register', 'prefix': operand}
+    elif operand.startswith('v'):
+        return {'class': 'register', 'prefix': 'v', 'shape': operand[1:2]}
+    elif operand.startswith('m'):
+        return {
+            'class': 'memory',
+            'base': 'gpr' if 'b' in operand else None,
+            'offset': 'imd' if 'o' in operand else None,
+            'index': 'gpr' if 'i' in operand else None,
+            'scale': 8 if 's' in operand else 1,
+            'pre-indexed': True if 'r' in operand else False,
+            'post-indexed': True if 'p' in operand else False,
+        }
+    raise ValueError('Parameter {} is not a valid operand code'.format(operand))
+
+
+def _create_db_operand_x86(operand):
+    """Create the DB operand dict for an x86 operand code of an ibench instruction."""
+    if operand == 'r':
+        return {'class': 'register', 'name': 'gpr'}
+    elif operand in tuple('xyz'):  # exactly one of x/y/z --> xmm/ymm/zmm
+        return {'class': 'register', 'name': operand + 'mm'}
+    elif operand == 'i':
+        return {'class': 'immediate', 'imd': 'int'}
+    elif operand.startswith('m'):
+        return {
+            'class': 'memory',
+            'base': 'gpr' if 'b' in operand else None,
+            'offset': 'imd' if 'o' in operand else None,
+            'index': 'gpr' if 'i' in operand else None,
+            'scale': 8 if 's' in operand else 1,
+        }
+    raise ValueError('Parameter {} is not a valid operand code'.format(operand))
+
+
+########################
+# HELPERS SANITY CHECK #
+########################
+
+
+def _check_sanity_arch_db(arch_mm, isa_mm):
+    """
+    Collect questionable instruction forms of the uarch DB.
+
+    :param arch_mm: machine model of the micro-architecture under test
+    :param isa_mm: machine model of its ISA, used to look up suspicious forms
+    :returns: 6-tuple of instruction form lists: missing throughput, missing latency,
+        missing port pressure, flagged by ``_check_for_wrong_port``, matching a suspicious
+        prefix without ISA DB entry, duplicates (each listed once)
+    """
+    # An unknown ISA gets no suspicious prefixes instead of leaving the name unbound.
+    suspicious_prefixes = {
+        'aarch64': ('fml', 'ldp', 'stp', 'str'),
+        'x86': ('vfm', 'fm'),
+    }.get(arch_mm.get_ISA().lower(), ())
+
+    # returned lists
+    missing_throughput = []
+    missing_latency = []
+    missing_port_pressure = []
+    wrong_port = []
+    suspicious_instructions = []
+    duplicate_instr_arch = []
+
+    for instr_form in arch_mm['instruction_forms']:
+        # check value in DB entry
+        if instr_form['throughput'] is None:
+            missing_throughput.append(instr_form)
+        if instr_form['latency'] is None:
+            missing_latency.append(instr_form)
+        if instr_form['port_pressure'] is None:
+            missing_port_pressure.append(instr_form)
+        elif _check_for_wrong_port(arch_mm['ports'], instr_form):
+            wrong_port.append(instr_form)
+        name, operands = instr_form['name'], instr_form['operands']
+        # check entry against ISA DB: forms with a suspicious prefix must exist there
+        if name.startswith(suspicious_prefixes):
+            if isa_mm.get_instruction(name, operands) is None:
+                suspicious_instructions.append(instr_form)
+        # check for duplicates in DB
+        if arch_mm._check_for_duplicate(name, operands):
+            duplicate_instr_arch.append(instr_form)
+
+    # every entry exists twice --> uniquify (equality based, walking from the end)
+    unique_duplicates = []
+    while duplicate_instr_arch:
+        tmp = duplicate_instr_arch.pop()
+        if tmp not in duplicate_instr_arch:
+            unique_duplicates.append(tmp)
+
+    return (missing_throughput, missing_latency, missing_port_pressure, wrong_port,
+            suspicious_instructions, unique_duplicates)
+
+
+def _check_for_wrong_port(port_list, instr_form):
+    """Return True if the form's port pressure names a port that is not in ``port_list``."""
+    for _cycles, ports in instr_form['port_pressure']:
+        if any(p not in port_list for p in ports):
+            return True
+    return False
+
+
+def _check_sanity_isa_db(arch_mm, isa_mm):
+    """
+    Return ``(duplicate_instr_isa, only_in_isa)``: the ISA DB forms flagged as duplicate
+    (each listed once) and the ISA DB forms the uarch DB has no instruction for.
+    """
+    flagged_duplicates, only_in_isa = [], []
+    for instr_form in isa_mm['instruction_forms']:
+        name, operands = instr_form['name'], instr_form['operands']
+        # check if instr is missing in arch DB
+        if arch_mm.get_instruction(name, operands) is None:
+            only_in_isa.append(instr_form)
+        # check for duplicates
+        if isa_mm._check_for_duplicate(name, operands):
+            flagged_duplicates.append(instr_form)
+    # every entry exists twice --> uniquify, walking the flagged list from its end
+    duplicate_instr_isa = []
+    while flagged_duplicates:
+        candidate = flagged_duplicates.pop()
+        if candidate not in flagged_duplicates:
+            duplicate_instr_isa.append(candidate)
+    return duplicate_instr_isa, only_in_isa
+
+
+def _print_sanity_report(
+    total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa, verbose=False
+):
+    """
+    Print the summary of a sanity check and, if ``verbose``, the detailed listing.
+
+    :param total: number of instruction forms in the uarch DB; may be 0
+    :param m_tp, m_l, m_pp: forms without throughput / latency / port pressure value
+    :param wrong_pp: forms with an invalid port identifier
+    :param suspic_instr: forms which might miss an ISA DB entry
+    :param dup_arch, dup_isa: duplicate forms in the uarch DB / ISA DB
+    :param only_isa: ISA DB forms not referenced by the uarch DB
+    """
+    shares = (
+        (m_tp, 'have no throughput value.'),
+        (m_l, 'have no latency value.'),
+        (m_pp, 'have no port pressure assignment.'),
+        (wrong_pp, 'have an invalid port identifier.'),
+        (suspic_instr, 'might miss an ISA DB entry.'),
+    )
+
+    # non-verbose summary
+    print('SUMMARY\n----------------------')
+    for forms, statement in shares:
+        # an empty DB (total == 0) must not raise ZeroDivisionError; report 0% instead
+        percent = round(100 * len(forms) / total) if total else 0
+        print(
+            '{}% ({}/{}) of instruction forms {}'.format(percent, len(forms), total, statement)
+        )
+    print('{} duplicate instruction forms in uarch DB.'.format(len(dup_arch)))
+    print('{} duplicate instruction forms in ISA DB.'.format(len(dup_isa)))
+    print(
+        '{} instruction forms in ISA DB are not referenced by instruction '.format(len(only_isa))
+        + 'forms in uarch DB.'
+    )
+    print('----------------------\n')
+
+    # verbose version
+    if verbose:
+        _print_sanity_report_verbose(
+            total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa
+        )
+
+
+def _print_sanity_report_verbose(
+    total, m_tp, m_l, m_pp, wrong_pp, suspic_instr, dup_arch, dup_isa, only_isa
+):
+    """
+    Print every instruction form a sanity check complained about, grouped by finding.
+
+    A non-empty group is preceded by its headline; each of its forms is printed between
+    the group's escape sequence and the reset sequence. Empty groups produce no output
+    at all.
+
+    :param total: number of checked instruction forms (not used here)
+    :param m_tp: forms without throughput value
+    :param m_l: forms without latency value
+    :param m_pp: forms without port pressure assignment
+    :param wrong_pp: forms with invalid port identifiers in port pressure
+    :param suspic_instr: forms which might miss an ISA DB entry
+    :param dup_arch: duplicate forms in uarch DB
+    :param dup_isa: duplicate forms in ISA DB
+    :param only_isa: forms existing in ISA DB but not in uarch DB
+    """
+    # escape sequences used to color the groups; WHITE is printed after every form
+    BRIGHT_CYAN = '\033[1;36;1m'
+    BRIGHT_BLUE = '\033[1;34;1m'
+    BRIGHT_RED = '\033[1;31;1m'
+    BRIGHT_MAGENTA = '\033[1;35;1m'
+    BRIGHT_YELLOW = '\033[1;33;1m'
+    CYAN = '\033[36m'
+    YELLOW = '\033[33m'
+    WHITE = '\033[0m'
+
+    # (headline, instruction forms, color) in the order of the printed report
+    groups = (
+        ('Instruction forms without throughput value:\n', m_tp, BRIGHT_BLUE),
+        ('Instruction forms without latency value:\n', m_l, BRIGHT_RED),
+        ('Instruction forms without port pressure assignment:\n', m_pp, BRIGHT_MAGENTA),
+        (
+            'Instruction forms with invalid port identifiers in port pressure:\n',
+            wrong_pp,
+            BRIGHT_MAGENTA,
+        ),
+        ('Instruction forms which might miss an ISA DB entry:\n', suspic_instr, BRIGHT_CYAN),
+        ('Duplicate instruction forms in uarch DB:\n', dup_arch, YELLOW),
+        ('Duplicate instruction forms in ISA DB:\n', dup_isa, BRIGHT_YELLOW),
+        ('Instruction forms existing in ISA DB but not in uarch DB:\n', only_isa, CYAN),
+    )
+
+    for headline, forms, color in groups:
+        # the headline already ends with a newline, hence end=''
+        print(headline if len(forms) != 0 else '', end='')
+        for instr_form in forms:
+            print('{}{}{}'.format(color, _get_full_instruction_name(instr_form), WHITE))
+
+
+###################
+# GENERIC HELPERS #
+###################
+
+
+def _get_full_instruction_name(instruction_form):
+    """Return '<name>  <class>(<attr>:<value>,...),...' describing an instruction form."""
+    operand_strings = []
+    for operand in instruction_form['operands']:
+        # all attributes except the operand class go between the parentheses
+        attributes = [key + ':' + str(operand[key]) for key in operand if key != 'class']
+        operand_strings.append('{}({})'.format(operand['class'], ','.join(attributes)))
+    name = instruction_form['name']
+    return '{}  {}'.format(name, ','.join(operand_strings))
+
+
+def __represent_none(self, data):
+    return self.represent_scalar(u'tag:yaml.org,2002:null', u'~')  # null scalar '~'; data unused
+
+
+def _create_yaml_object():
+    yaml_obj = ruamel.yaml.YAML()
+    yaml_obj.representer.add_representer(type(None), __represent_none)  # custom None output
+    return yaml_obj
+
+
+def __dump_data_to_yaml(filepath, data):
+    """Dump ``data`` to ``filepath``: meta data first, then the port model scheme with
+    default style '|', then the instruction forms."""
+    # 'normal' meta data is everything except the two keys that are dumped separately
+    meta_data = dict(data)
+    instruction_forms = meta_data.pop('instruction_forms')
+    port_model_scheme = meta_data.pop('port_model_scheme')
+    with open(filepath, 'w') as f:
+        ruamel.yaml.dump(meta_data, f, allow_unicode=True)
+        # now add port model scheme in |-scheme for better readability
+        ruamel.yaml.dump(
+            {'port_model_scheme': port_model_scheme},
+            f,
+            allow_unicode=True,
+            default_style='|',
+        )
+        ruamel.yaml.dump({'instruction_forms': instruction_forms}, f, allow_unicode=True)
--- a/osaca/eu_sched.py
+++ b/osaca/eu_sched.py
@@ -1,447 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import os
-import math
-import ast
-from operator import add
-import pandas as pd
-
-from osaca.param import Register, MemAddr
-#from param import Register, MemAddr
-
-
-class Scheduler(object):
-    arch_dict = {
-        # Intel
-        'NHM': 5, 'WSM': 5,  # Nehalem, Westmere
-        'SNB': 6, 'IVB': 6,  # Sandy Bridge, Ivy Bridge
-        'HSW': 8, 'BDW': 8,  # Haswell, Broadwell
-        'SKL': 8, 'SKX': 8,  # Skylake(-X)
-        'KBL': 8, 'CFL': 8,  # Kaby Lake, Coffee Lake
-        # AMD
-        'ZEN': 10,  # Zen/Ryzen/EPYC
-    }
-    arch_pipeline_ports = {
-        'NHM': ['0DV'], 'WSM': ['0DV'],
-        'SNB': ['0DV'], 'IVB': ['0DV'],
-        'HSW': ['0DV'], 'BDW': ['0DV'],
-        'SKL': ['0DV'], 'SKX': ['0DV'],
-        'KBL': ['0DV'], 'CFL': ['0DV'],
-        'ZEN': ['3DV'],}
-    # content of most inner list in instrList: instr, operand(s), instr form
-    df = None  # type: DataFrame
-    # for parallel ld/st in archs with 1 st/cy and >1 ld/cy, able to do 1 st and 1 ld in 1cy
-    ld_ports = None  # type: list<int>
-    # enable flag for parallel ld/st
-    en_par_ldst = False  # type: boolean
-
-    def __init__(self, arch, instruction_list):
-        arch = arch.upper()
-        try:
-            self.ports = self.arch_dict[arch]
-        except KeyError:
-            print('Architecture not supported for EU scheduling.', file=sys.stderr)
-            sys.exit(1)
-        # check for parallel ld/st in a cycle
-        if arch == 'ZEN':
-            self.en_par_ldst = True
-            self.ld_ports = [9, 10]
-        # check for DV port
-        self.pipeline_ports = self.arch_pipeline_ports.get(arch, [])
-        self.instrList = instruction_list
-        # curr_dir = os.path.realpath(__file__)[:-11]
-        osaca_dir = os.path.expanduser('~/.osaca/')
-        self.df = pd.read_csv(osaca_dir + 'data/' + arch.lower() + '_data.csv', quotechar='"',
-                              converters={'ports': ast.literal_eval})
-
-    def new_schedule(self, machine_readable=False):
-        """
-        Schedule Instruction Form list and calculate port bindings.
-
-        Parameters
-        ----------
-        machine_readable : bool
-            Boolean for indicating if the return value should be human readable (if False) or 
-            machine readable (if True)
-
-        Returns
-        -------
-        (str, [float, ...]) or ([[float, ...], ...], [float, ...])
-            A tuple containing the output of the schedule as string (if machine_readable is not
-            given or False) or as list of lists (if machine_readable is True) and the port bindings
-            as list of float.
-        """
-        sched = self.get_head()
-        # Initialize ports
-        # Add DV port, if it is existing
-        occ_ports = [[0] * (self.ports + len(self.pipeline_ports)) for x in range(len(self.instrList))]
-        port_bndgs = [0] * (self.ports + len(self.pipeline_ports))
-        # Store instruction counter for parallel ld/st
-        par_ldst = 0
-        # Count the number of store instr if we schedule for an architecture with par ld/st
-        if self.en_par_ldst:
-            for i, instrForm in enumerate(self.instrList):
-                if (isinstance(instrForm[1], MemAddr) and len(instrForm) > 3
-                        and not instrForm[0].startswith('cmp')):
-                    # print('({}, {}) is st --> par_ldst = {}'.format(i, instrForm[0], par_ldst + 1))
-                    par_ldst += 1
-        # Check if there's a port occupation stored in the CSV, otherwise leave the
-        # occ_port list item empty
-        for i, instrForm in enumerate(self.instrList):
-            search_string = instrForm[0] + self.get_operand_suffix(instrForm)
-            try:
-                entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
-                tup = entry.ports.values[0]
-                if len(tup) == 1 and tup[0] == -1:
-                    raise IndexError()
-            except IndexError:
-                # Instruction form not in CSV
-                if instrForm[0][:3] == 'nop':
-                    sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
-                elif instrForm[0] == 'DIRECTIVE':
-                    sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
-                else:
-                    sched += self.format_port_occupation_line(occ_ports[i], 'X ' + instrForm[-1])
-                continue
-            occ_ports[i] = list(tup)
-            # Check if it's a ld including instr 
-            p_flg = ''
-            if self.en_par_ldst:
-                # Check for ld
-                # FIXME remove special load handling from here and place in machine model
-                if (isinstance(instrForm[-2], MemAddr) or
-                        (len(instrForm) > 4 and isinstance(instrForm[2], MemAddr))):
-                    if par_ldst > 0:
-                        par_ldst -= 1
-                        p_flg = 'P '
-                        for port in self.ld_ports:
-                            occ_ports[i][port] = 0.0  # '(' + str(occ_ports[i][port]) + ')'
-            # Write schedule line
-            if len(p_flg) > 0:
-                sched += self.format_port_occupation_line(occ_ports[i], p_flg + instrForm[-1])
-                for port in self.ld_ports:
-                    occ_ports[i][port] = 0
-            else:
-                sched += self.format_port_occupation_line(occ_ports[i], instrForm[-1])
-            # Add throughput to total port binding
-            port_bndgs = list(map(add, port_bndgs, occ_ports[i]))
-        if machine_readable:
-            list(map(self.append, occ_ports, self.instrList))
-            return occ_ports, port_bndgs
-        return sched, port_bndgs
-
-    def schedule(self):
-        """
-        Schedule Instruction Form list and calculate port bindings.
-
-        Returns
-        -------
-        (str, [int, ...])
-            A tuple containing the graphic output of the schedule as string and
-            the port bindings as list of ints.
-        """
-        wTP = False
-        sched = self.get_head()
-        # Initialize ports
-        port_bndgs = [0] * self.ports
-        # Check if there's a port occupation stored in the CSV, otherwise leave the
-        # occ_port list item empty
-        for i, instrForm in enumerate(self.instrList):
-            try:
-                search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
-                entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
-                tup = entry.ports.values[0]
-                if len(tup) == 1 and tup[0][0] == -1:
-                    raise IndexError()
-            except IndexError:
-                # Instruction form not in CSV
-                if instrForm[0][:3] == 'nop':
-                    sched += self.format_port_occupation_line(occ_ports[i], '* ' + instrForm[-1])
-                else:
-                    sched += self.format_port_occupation_line(occ_ports[i], 'X ' + instrForm[-1])
-                continue
-            if wTP:
-                # Get the occurance of each port from the occupation list
-                port_occurances = self.get_port_occurances(tup)
-                # Get 'occurance groups'
-                occurance_groups = self.get_occurance_groups(port_occurances)
-                # Calculate port dependent throughput
-                tp_ges = entry.TP.values[0] * len(occurance_groups[0])
-                for occGroup in occurance_groups:
-                    for port in occGroup:
-                        occ_ports[i][port] = tp_ges / len(occGroup)
-            else:
-                variations = len(tup)
-                t_all = self.flatten(tup)
-                if entry.TP.values[0] == 0:
-                    t_all = ()
-                if variations == 1:
-                    for j in tup[0]:
-                        occ_ports[i][j] = entry.TP.values[0]
-                else:
-                    for j in range(0, self.ports):
-                        occ_ports[i][j] = t_all.count(j) / variations
-            # Write schedule line
-            sched += self.format_port_occupation_line(occ_ports[i], instrForm[-1])
-            # Add throughput to total port binding
-            port_bndgs = list(map(add, port_bndgs, occ_ports[i]))
-        return sched, port_bndgs
-
-    def flatten(self, l):
-        if len(l) == 0:
-            return l
-        if isinstance(l[0], type(l)):
-            return self.flatten(l[0]) + self.flatten(l[1:])
-        return l[:1] + self.flatten(l[1:])
-
-    def append(self, l, e):
-        if(isinstance(l, list)):
-            l.append(e)
-    
-    def schedule_fcfs(self):
-        """
-        Schedule Instruction Form list for a single run with latencies.
-
-        Returns
-        -------
-        (str, int)
-            A tuple containing the graphic output as string and the total throughput time as int.
-        """
-        sched = self.get_head()
-        total = 0
-        # Initialize ports
-        occ_ports = [0] * self.ports
-        for instrForm in self.instrList:
-            try:
-                search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
-                entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
-                tup = entry.ports.values[0]
-                if len(tup) == 1 and tup[0][0] == -1:
-                    raise IndexError()
-            except IndexError:
-                # Instruction form not in CSV
-                sched += self.format_port_occupation_line([0] * self.ports, '* ' + instrForm[-1])
-                continue
-            found = False
-            while not found:
-                for portOcc in tup:
-                    # Test if chosen instruction form port occupation suits the current CPU port
-                    # occupation
-                    if self.test_ports_fcfs(occ_ports, portOcc):
-                        # Current port occupation fits for chosen port occupation of instruction!
-                        found = True
-                        good = [entry.LT.values[0] if (j in portOcc) else 0 for j in
-                                range(0, self.ports)]
-                        sched += self.format_port_occupation_line(good, instrForm[-1])
-                        # Add new occupation
-                        occ_ports = [occ_ports[j] + good[j] for j in range(0, self.ports)]
-                        break
-                # Step
-                occ_ports = [j - 1 if (j > 0) else 0 for j in occ_ports]
-                if entry.LT.values[0] != 0:
-                    total += 1
-        total += max(occ_ports)
-        return sched, total
-
-    def get_occurance_groups(self, port_occurances):
-        """
-        Group ports in groups by the number of their occurrence and sorts groups by cardinality.
-
-        Parameters
-        ----------
-        port_occurances : [int, ...]
-            List with the length of ports containing the number of occurances
-            of each port
-
-        Returns
-        -------
-        [[int, ...], ...]
-            List of lists with all occurance groups sorted by cardinality
-            (smallest group first)
-        """
-        groups = [[] for x in range(len(set(port_occurances))-1)]
-        for i, groupInd in enumerate(range(min(list(filter(lambda x: x > 0, port_occurances))),
-                                           max(port_occurances) + 1)):
-            for p, occurs in enumerate(port_occurances):
-                if groupInd == occurs:
-                    groups[i].append(p)
-        # Sort groups by cardinality
-        groups.sort(key=len)
-        return groups
-
-    def get_port_occurances(self, tups):
-        """
-        Return the number of each port occurrence for the possible port occupations.
-
-        Parameters
-        ----------
-        tups : ((int, ...), ...)
-            Tuple of tuples of possible port occupations
-
-        Returns
-        -------
-        [int, ...]
-            List in the length of the number of ports for the current architecture,
-            containing the amount of occurances for each port
-        """
-        ports = [0] * self.ports
-        for tup in tups:
-            for elem in tup:
-                ports[elem] += 1
-        return ports
-
-    def test_ports_fcfs(self, occ_ports, needed_ports):
-        """
-        Test if current configuration of ports is possible and returns boolean.
-
-        Parameters
-        ----------
-        occ_ports : [int]
-            Tuple to inspect for current port occupation
-        needed_ports : (int)
-            Tuple with needed port(s) for particular instruction form
-
-        Returns
-        -------
-        bool
-            True    if needed ports can get scheduled on current port occupation
-            False   if not
-        """
-        for port in needed_ports:
-            if occ_ports[port] != 0:
-                return False
-        return True
-
-    def get_report_info(self):
-        """
-        Create Report information including all needed annotations.
-
-        Returns
-        -------
-        str
-            String containing the report information
-        """
-        analysis = 'Throughput Analysis Report\n' + ('-' * 26) + '\n'
-        annotations = (
-            'P - Load operation can be hidden behind a past or future store instruction\n'
-            'X - No information for this instruction in data file\n'
-            '* - Not bound to a port, therefore ignored\n\n')
-        return analysis + annotations
-
-    def get_head(self):
-        """
-        Create right heading for CPU architecture.
-
-        Returns
-        -------
-        str
-            String containing the header
-        """
-        port_names = self.get_port_naming()
-
-        port_line = ''.join('|{:^6}'.format(pn) for pn in port_names) + '|\n'
-        horiz_line = '-' * (len(port_line) - 1) + '\n'
-        port_anno = ' ' * ((len(port_line) - 25) // 2) + 'Ports Pressure in cycles\n'
-
-        return port_anno + port_line + horiz_line
-
-    def format_port_occupation_line(self, occ_ports, instr_name):
-        """
-        Create line with port occupation for output.
-
-        Parameters
-        ----------
-        occ_ports : (int, ...)
-            Integer tuple containing needed ports
-        instr_name : str
-            Name of instruction form for output
-
-        Returns
-        -------
-        str
-            String for output containing port scheduling for instr_name
-        """
-        line = ''
-        for cycles in occ_ports:
-            if cycles == 0:
-                line += '|' + ' ' * 6
-            elif cycles >= 10:
-                line += '|{:^6.1f}'.format(cycles)
-            else:
-                line += '|{:^6.2f}'.format(cycles)
-        line += '| ' + instr_name + '\n'
-        return line
-
-    def get_port_naming(self):
-        """
-        Return list of port names
-
-        :return: list of strings
-        """
-        return sorted([str(i) for i in range(self.ports)] + self.pipeline_ports)
-
-    def get_port_binding(self, port_bndg):
-        """
-        Create port binding out of scheduling result.
-
-        Parameters
-        ----------
-        port_bndg : [int, ...]
-            Integer list containing port bindings
-
-        Returns
-        -------
-        str
-            String containing the port binding graphical output
-        """
-        col_widths = self.get_column_widths(port_bndg)
-        header = 'Port Binding in Cycles Per Iteration:\n'
-        horiz_line = '-' * 10 + '-' * (sum(col_widths) + len(col_widths)) + '\n'
-        port_line = '|  Port  |'
-        for i, port_name in enumerate(self.get_port_naming()):
-            port_line += port_name.center(col_widths[i]) + '|'
-        port_line += '\n'
-        cyc_line = '| Cycles |'
-        for i in range(len(port_bndg)):
-            cyc_line += '{}|'.format(str(round(port_bndg[i], 2)).center(col_widths[i]))
-        cyc_line += '\n'
-        binding = header + horiz_line + port_line + horiz_line + cyc_line + horiz_line
-        return binding
-
-    def get_column_widths(self, port_bndg):
-        return [max(len(str(round(x, 2))), len(name)) + 2
-                for x, name in zip(port_bndg, self.get_port_naming())]
-
-    def get_operand_suffix(self, instr_form):
-        """
-        Create operand suffix out of list of Parameters.
-
-        Parameters
-        ----------
-        instr_form : [str, Parameter, ..., Parameter, str]
-            Instruction Form data structure
-
-        Returns
-        -------
-        str
-            Operand suffix for searching in data file
-        """
-        op_ext = []
-        operands = ''
-        if len(instr_form) > 2:
-            operands = '-'
-        for i in range(1, len(instr_form) - 1):
-            if isinstance(instr_form[i], Register) and instr_form[i].reg_type == 'GPR':
-                optmp = 'r' + str(instr_form[i].size)
-            elif isinstance(instr_form[i], MemAddr):
-                optmp = 'mem'
-            else:
-                optmp = str(instr_form[i]).lower()
-            op_ext.append(optmp)
-        operands += '_'.join(op_ext)
-        return operands
-
-
-if __name__ == '__main__':
-    print('Nothing to do.')
--- a/osaca/frontend.py
+++ b/osaca/frontend.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+import os
+import re
+from datetime import datetime as dt
+
+from ruamel import yaml
+
+from osaca import utils
+from osaca.semantics import INSTR_FLAGS, KernelDG, SemanticsAppender
+
+
+class Frontend(object):
+    def __init__(self, filename='', arch=None, path_to_yaml=None):
+        self._filename = filename
+        if not arch and not path_to_yaml:
+            raise ValueError('Either arch or path_to_yaml required.')
+        if arch and path_to_yaml:
+            raise ValueError('Only one of arch and path_to_yaml is allowed.')
+        self._arch = arch
+        if arch:
+            self._arch = arch.lower()
+            with open(utils.find_file(self._arch + '.yml'), 'r') as f:
+                self._data = yaml.load(f, Loader=yaml.Loader)
+        elif path_to_yaml:
+            with open(path_to_yaml, 'r') as f:
+                self._data = yaml.load(f, Loader=yaml.Loader)
+
+    def _is_comment(self, instruction_form):
+        return instruction_form['comment'] is not None and instruction_form['instruction'] is None
+
+    def print_throughput_analysis(self, kernel, show_lineno=False, show_cmnts=True):
+        lineno_filler = '     ' if show_lineno else ''
+        port_len = self._get_max_port_len(kernel)
+        separator = '-' * sum([x + 3 for x in port_len]) + '-'
+        separator += '--' + len(str(kernel[-1]['line_number'])) * '-' if show_lineno else ''
+        col_sep = '|'
+        sep_list = self._get_separator_list(col_sep)
+        headline = 'Port pressure in cycles'
+        headline_str = '{{:^{}}}'.format(len(separator))
+
+        print('\n\nThroughput Analysis Report\n' + '--------------------------')
+        print(headline_str.format(headline))
+        print(lineno_filler + self._get_port_number_line(port_len))
+        print(separator)
+        for instruction_form in kernel:
+            line = '{:4d} {} {} {}'.format(
+                instruction_form['line_number'],
+                self._get_port_pressure(instruction_form['port_pressure'], port_len, sep_list),
+                self._get_flag_symbols(instruction_form['flags'])
+                if instruction_form['instruction'] is not None
+                else ' ',
+                instruction_form['line'].strip(),
+            )
+            line = line if show_lineno else col_sep + col_sep.join(line.split(col_sep)[1:])
+            if show_cmnts is False and self._is_comment(instruction_form):
+                continue
+            print(line)
+        print()
+        tp_sum = SemanticsAppender.get_throughput_sum(kernel)
+        print(lineno_filler + self._get_port_pressure(tp_sum, port_len, ' '))
+
+    def _get_separator_list(self, separator, separator_2=' '):
+        separator_list = []
+        for i in range(len(self._data['ports']) - 1):
+            match_1 = re.search(r'\d+', self._data['ports'][i])
+            match_2 = re.search(r'\d+', self._data['ports'][i + 1])
+            if match_1 is not None and match_2 is not None and match_1.group() == match_2.group():
+                separator_list.append(separator_2)
+            else:
+                separator_list.append(separator)
+        separator_list.append(separator)
+        return separator_list
+
+    def _get_flag_symbols(self, flag_obj):
+        string_result = ''
+        string_result += '*' if INSTR_FLAGS.NOT_BOUND in flag_obj else ''
+        string_result += 'X' if INSTR_FLAGS.TP_UNKWN in flag_obj else ''
+        string_result += 'P' if INSTR_FLAGS.HIDDEN_LD in flag_obj else ''
+        # TODO add other flags
+        string_result += ' ' if len(string_result) == 0 else ''
+        return string_result
+
+    def _get_port_pressure(self, ports, port_len, separator='|'):
+        if not isinstance(separator, list):
+            separator = [separator for x in ports]
+        string_result = '{} '.format(separator[-1])
+        for i in range(len(ports)):
+            if float(ports[i]) == 0.0:
+                string_result += port_len[i] * ' ' + ' {} '.format(separator[i])
+                continue
+            left_len = len(str(float(ports[i])).split('.')[0])
+            substr = '{:' + str(left_len) + '.' + str(max(port_len[i] - left_len - 1, 0)) + 'f}'
+            string_result += substr.format(ports[i]) + ' {} '.format(separator[i])
+        return string_result[:-1]
+
+    def _get_max_port_len(self, kernel):
+        port_len = [4 for x in self._data['ports']]
+        for instruction_form in kernel:
+            for i, port in enumerate(instruction_form['port_pressure']):
+                if len('{:.2f}'.format(port)) > port_len[i]:
+                    port_len[i] = len('{:.2f}'.format(port))
+        return port_len
+
+    def _get_port_number_line(self, port_len, separator='|'):
+        string_result = separator
+        separator_list = self._get_separator_list(separator, '-')
+        for i, length in enumerate(port_len):
+            substr = '{:^' + str(length + 2) + 's}'
+            string_result += substr.format(self._data['ports'][i]) + separator_list[i]
+        return string_result
+
+    def print_latency_analysis(self, cp_kernel, separator='|'):
+        print('\n\nLatency Analysis Report\n' + '-----------------------')
+        for instruction_form in cp_kernel:
+            print(
+                '{:4d} {} {:4.1f} {}{}{} {}'.format(
+                    instruction_form['line_number'],
+                    separator,
+                    instruction_form['latency_cp'],
+                    separator,
+                    'X' if INSTR_FLAGS.LT_UNKWN in instruction_form['flags'] else ' ',
+                    separator,
+                    instruction_form['line'],
+                )
+            )
+        print(
+            '\n{:4} {} {:4.1f}'.format(
+                ' ' * max([len(str(instr_form['line_number'])) for instr_form in cp_kernel]),
+                ' ' * len(separator),
+                sum([instr_form['latency_cp'] for instr_form in cp_kernel]),
+            )
+        )
+
+    def print_loopcarried_dependencies(self, dep_dict, separator='|'):
+        print(
+            '\n\nLoop-Carried Dependencies Analysis Report\n'
+            + '-----------------------------------------'
+        )
+        # TODO find a way to overcome padding for different tab-lengths
+        for dep in dep_dict:
+            print(
+                '{:4d} {} {:4.1f} {} {:36}{} {}'.format(
+                    dep,
+                    separator,
+                    sum(
+                        [
+                            instr_form['latency_lcd']
+                            for instr_form in dep_dict[dep]['dependencies']
+                        ]
+                    ),
+                    separator,
+                    dep_dict[dep]['root']['line'],
+                    separator,
+                    [node['line_number'] for node in dep_dict[dep]['dependencies']],
+                )
+            )
+
+    def _print_header_report(self):
+        version = 'v0.3'
+        adjust = 20
+        header = ''
+        header += 'Open Source Architecture Code Analyzer (OSACA) - {}\n'.format(version)
+        header += 'Analyzed file:'.ljust(adjust) + '{}\n'.format(self._filename)
+        header += 'Architecture:'.ljust(adjust) + '{}\n'.format(self._arch)
+        header += 'Timestamp:'.ljust(adjust) + '{}\n'.format(
+            dt.utcnow().strftime('%Y-%m-%d %H:%M:%S')
+        )
+        print(header)
+
+    def _print_symbol_map(self):
+        symbol_dict = {
+            INSTR_FLAGS.NOT_BOUND: 'Instruction micro-ops not bound to a port',
+            INSTR_FLAGS.TP_UNKWN: 'No throughput/latency information for this instruction in '
+            + 'data file',
+            INSTR_FLAGS.HIDDEN_LD: 'Throughput of LOAD operation can be hidden behind a past '
+            + 'or future STORE instruction',
+        }
+        symbol_map = ''
+        for flag in sorted(symbol_dict.keys()):
+            symbol_map += ' {} - {}\n'.format(self._get_flag_symbols([flag]), symbol_dict[flag])
+
+        print(symbol_map, end='')
+
+    def _print_port_binding_summary(self):
+        raise NotImplementedError
+
+    def print_full_analysis(self, kernel, kernel_dg: KernelDG, verbose=False):
+        self._print_header_report()
+        self._print_symbol_map()
+        self.print_throughput_analysis(kernel, show_lineno=True)
+        self.print_latency_analysis(kernel_dg.get_critical_path())
+        self.print_loopcarried_dependencies(kernel_dg.get_loopcarried_dependencies())
--- a/osaca/get_instr.py
+++ b/osaca/get_instr.py
@@ -1,240 +0,0 @@
-#!/usr/bin/env python3
-import os
-import re
-import argparse
-
-from osaca.testcase import Testcase
-from osaca.param import Register, MemAddr, Parameter
-#from testcase import Testcase
-#from param import Register, MemAddr, Parameter
-
-
-class InstrExtractor(object):
-    filepaths = []
-    # Variables for checking lines
-    numSeps = 0
-    sem = 0
-    db = {}
-    sorted_db = []
-    lncnt = 1
-    cntChar = ''
-    first = True
-    # Constant variables
-    MARKER = r'//STARTLOOP'
-    ASM_LINE = re.compile(r'\s[0-9a-f]+[:]')
-
-    def __init__(self, filepath):
-        self.filepaths = filepath
-
-    def check_all(self):
-        for i in range(0, len(self.filepaths)):
-            self.extract_instr(self.filepaths[i])
-
-    def is_elffile(self, filepath):
-        if os.path.isfile(filepath):
-            with open(filepath) as f:
-                src = f.read()
-            if 'format elf64' in src:
-                return True
-            return False
-
-    def extract_instr(self, asm_file):
-        # Check if parameter is in the correct file format
-        if not self.is_elffile(asm_file):
-            print('Invalid argument')
-            return
-        # Open file
-        f = open(asm_file, 'r')
-        # Analyse code line by line and check the instructions
-        self.lncnt = 1
-        for line in f:
-            self.check_line(line)
-            self.lncnt += 1
-        f.close()
-
-    def check_line(self, line):
-        # Check if MARKER is in line and count the number of whitespaces if so
-        if self.MARKER in line:
-            # But first, check if high level code ist indented with whitespaces or tabs
-            if self.first:
-                self.set_counter_char(line)
-                self.first = False
-            self.numSeps = (re.split(self.MARKER, line)[0]).count(self.cntChar)
-            self.sem = 2
-        elif self.sem > 0:
-            # We're in the marked code snipped
-            # Check if the line is ASM code and - if not - check if we're still in the loop
-            match = re.search(self.ASM_LINE, line)
-            if match:
-                # Further analysis of instructions
-                # Check if there are commetns in line
-                if r'//' in line:
-                    return
-                self.check_instr(''.join(re.split(r'\t', line)[-1:]))
-            elif (re.split(r'\S', line)[0]).count(self.cntChar) <= self.numSeps:
-                # Not in the loop anymore - or yet - so we decrement the semaphore
-                self.sem = self.sem - 1
-
-    # Check if seperator is either tabulator or whitespace
-    def set_counter_char(self, line):
-        num_spaces = (re.split(self.MARKER, line)[0]).count(' ')
-        num_tabs = (re.split(self.MARKER, line)[0]).count('\t')
-        if num_spaces != 0 and num_tabs == 0:
-            self.cntChar = ' '
-        elif num_spaces == 0 and num_tabs != 0:
-            self.cntChar = '\t'
-        else:
-            err_msg = 'Indentation of code is only supported for whitespaces and tabs.'
-            raise NotImplementedError(err_msg)
-
-    def check_instr(self, instr):
-        # Check for strange clang padding bytes
-        while instr.startswith('data32'):
-            instr = instr[7:]
-        # Seperate mnemonic and operands
-        mnemonic = instr.split()[0]
-        params = ''.join(instr.split()[1:])
-        # Check if line is not only a byte
-        empty_byte = re.compile(r'[0-9a-f]{2}')
-        if re.match(empty_byte, mnemonic) and len(mnemonic) == 2:
-            return
-        # Check if there's one or more operand and store all in a list
-        param_list = self.flatten(self.separate_params(params))
-        op_list = list(param_list)
-        # Check operands and seperate them by IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or
-        # LABEL (LBL)
-        for i in range(len(param_list)):
-            op = param_list[i]
-            if len(op) <= 0:
-                op = Parameter('NONE')
-            elif op[0] == '$':
-                op = Parameter('IMD')
-            elif op[0] == '%' and '(' not in op:
-                j = len(op)
-                opmask = False
-                if '{' in op:
-                    j = op.index('{')
-                    opmask = True
-                op = Register(op[1:j], opmask)
-            elif '<' in op:
-                op = Parameter('LBL')
-            else:
-                op = MemAddr(op)
-            param_list[i] = str(op) if (type(op) is not Register) else str(op) + str(op.size)
-            op_list[i] = op
-        # Join mnemonic and operand(s) to an instruction form
-        if len(mnemonic) > 7:
-            tabs = '\t'
-        else:
-            tabs = '\t\t'
-        instr_form = mnemonic + tabs + ('  '.join(param_list))
-        # Check in data file for instruction form and increment the counter
-        if instr_form in self.db:
-            self.db[instr_form] = self.db[instr_form] + 1
-        else:
-            self.db[instr_form] = 1
-            # Create testcase for instruction form, since it is the first appearance of it
-            # Only create benchmark if no label (LBL) is part of the operands
-            do_bench = True
-            for par in op_list:
-                if str(par) == 'LBL' or str(par) == '':
-                    do_bench = False
-            if do_bench:
-                # Create testcase with reversed param list, due to the fact its intel syntax!
-                tc = Testcase(mnemonic, list(reversed(op_list)), '64')
-                tc.write_testcase()
-
-    def separate_params(self, params):
-        param_list = [params]
-        if ',' in params:
-            if ')' in params:
-                if params.index(')') < len(params) - 1 and params[params.index(')') + 1] == ',':
-                    i = params.index(')') + 1
-                elif params.index('(') < params.index(','):
-                    return param_list
-                else:
-                    i = params.index(',')
-            else:
-                i = params.index(',')
-            param_list = [params[:i], self.separate_params(params[i + 1:])]
-        elif '#' in params:
-            i = params.index('#')
-            param_list = [params[:i]]
-        return param_list
-
-    def sort_db(self):
-        self.sorted_db = sorted(self.db.items(), key=lambda x: x[1], reverse=True)
-
-    def print_sorted_db(self):
-        self.sort_db()
-        total = 0
-        print('Number of\tmnemonic')
-        print('calls\n')
-        for i in range(len(self.sorted_db)):
-            print(str(self.sorted_db[i][1]) + '\t\t' + self.sorted_db[i][0])
-            total += self.sorted_db[i][1]
-        print('\nCumulated number of instructions: ' + str(total))
-
-    def save_db(self):
-        file = open('.cnt_asm_ops.db', 'w')
-        for i in self.db.items():
-            file.write(i[0] + '\t' + str(i[1]) + '\n')
-        file.close()
-
-    def load_db(self):
-        try:
-            file = open('.cnt_asm_ops.db', 'r')
-        except FileNotFoundError:
-            print('no data file found in current directory')
-            return
-        for line in file:
-            mnemonic = line.split('\t')[0]
-            # Join mnemonic and operand(s) to an instruction form
-            if len(mnemonic) > 7:
-                tabs = '\t'
-                params = line.split('\t')[1]
-                num_calls = line.split('\t')[2][:-1]
-            else:
-                tabs = '\t\t'
-                params = line.split('\t')[2]
-                num_calls = line.split('\t')[3][:-1]
-            instr_form = mnemonic + tabs + params
-            self.db[instr_form] = int(num_calls)
-        file.close()
-
-    def flatten(self, l):
-        if not l:
-            return l
-        if isinstance(l[0], list):
-            return self.flatten(l[0]) + self.flatten(l[1:])
-        return l[:1] + self.flatten(l[1:])
-
-
-def main():
-    # Parse args
-    parser = argparse.ArgumentParser(description='Returns a list of all instruction forms in the '
-                                                 'given files sorted by their number of '
-                                                 'occurrences.')
-    parser.add_argument('-V', '--version', action='version', version='%(prog)s 0.2')
-    parser.add_argument('filepath', nargs='+', help='path to objdump(s)')
-    parser.add_argument('-l', '--load', dest='load', action='store_true',
-                        help='load data file before checking new files')
-    parser.add_argument('-s', '--store', dest='store', action='store_true',
-                        help='store data file before checking new files')
-
-    # Create object and store arguments as attribute
-    inp = parser.parse_args()
-    ie = InstrExtractor(inp.filepath)
-
-    # Do work
-    if inp.load:
-        ie.load_db()
-    ie.check_all()
-    ie.print_sorted_db()
-    if inp.store:
-        ie.save_db()
-
-
-# ---------main method----------
-if __name__ == '__main__':
-    main()
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
--- a/osaca/param.py
+++ b/osaca/param.py
@@ -1,142 +0,0 @@
-#!/usr/bin/env python3
-import re
-
-
-class Parameter(object):
-    type_list = ['REG', 'MEM', 'IMD', 'LBL', 'NONE']
-
-    def __init__(self, ptype):
-        self.ptype = ptype.upper()
-        if self.ptype not in self.type_list:
-            raise NameError('Type not supported: '+ptype)
-
-    def __str__(self):
-        """Return string representation."""
-        if self.ptype == 'NONE':
-            return ''
-        else:
-            return self.ptype
-
-
-class MemAddr(Parameter):
-    segment_regs = ['CS', 'DS', 'SS', 'ES', 'FS', 'GS']
-    scales = [1, 2, 4, 8]
-
-    def __init__(self, name):
-        super().__init__("MEM")
-        name = name.strip(', \t')
-        self.offset = None
-        self.base = None
-        self.index = None
-        self.scale = None
-        
-        m = re.match(r'((?P<offset_hex>[x0-9a-fA-F]*)|(?P<offset_dec>\-?[0-9]*))'
-                     r'\((?P<base>[^,\)]+)(?:,\s*(?P<index>[^,\)]+)(?:,\s*'
-                     r'(?P<scale>[^,\)]+))?)?\)', name)
-
-        if not m:
-            raise ValueError('Type not supported: {!r}'.format(name))
-
-        self.offset = m.group('offset_dec') or m.group('offset_hex') or None
-        self.base = m.group('base') or None
-        self.index = m.group('index') or None
-        self.scale = m.group('scale') or None
-
-
-    def __str__(self):
-        """returns string representation"""
-        mem_format = 'MEM('
-        if self.offset:
-            mem_format += 'offset'
-        if self.base and not self.index:
-            mem_format += '(base)'
-        elif self.base and self.index and self.scale:
-            mem_format += '(base, index, scale)'
-        mem_format += ')'
-        return mem_format
-
-
-class Register(Parameter):
-    sizes = {
-        # General Purpose Registers
-        'AH': (8, 'GPR'), 'AL': (8, 'GPR'), 'BH': (8, 'GPR'), 'BL': (8, 'GPR'), 'CH': (8, 'GPR'),
-        'CL': (8, 'GPR'), 'DH': (8, 'GPR'), 'DL': (8, 'GPR'), 'BPL': (8, 'GPR'), 'SIL': (8, 'GPR'),
-        'DIL': (8, 'GPR'), 'SPL': (8, 'GPR'), 'R8L': (8, 'GPR'), 'R9L': (8, 'GPR'),
-        'R10L': (8, 'GPR'), 'R11L': (8, 'GPR'), 'R12L': (8, 'GPR'), 'R13L': (8, 'GPR'),
-        'R14L': (8, 'GPR'), 'R15L': (8, 'GPR'), 'R8B': (8, 'GPR'), 'R9B': (8, 'GPR'),
-        'R10B': (8, 'GPR'), 'R11B': (8, 'GPR'), 'R12B': (8, 'GPR'), 'R13B': (8, 'GPR'),
-        'R14B': (8, 'GPR'), 'R15B': (8, 'GPR'), 'AX': (16, 'GPR'), 'BC': (16, 'GPR'),
-        'CX': (16, 'GPR'), 'DX': (16, 'GPR'), 'BP': (16, 'GPR'), 'SI': (16, 'GPR'),
-        'DI': (16, 'GPR'), 'SP': (16, 'GPR'), 'R8W': (16, 'GPR'), 'R9W': (16, 'GPR'),
-        'R10W': (16, 'GPR'), 'R11W': (16, 'GPR'), 'R12W': (16, 'GPR'), 'R13W': (16, 'GPR'),
-        'R14W': (16, 'GPR'), 'R15W': (16, 'GPR'), 'EAX': (32, 'GPR'), 'EBX': (32, 'GPR'),
-        'ECX': (32, 'GPR'), 'EDX': (32, 'GPR'), 'EBP': (32, 'GPR'), 'ESI': (32, 'GPR'),
-        'EDI': (32, 'GPR'), 'ESP': (32, 'GPR'), 'R8D': (32, 'GPR'), 'R9D': (32, 'GPR'),
-        'R10D': (32, 'GPR'), 'R11D': (32, 'GPR'), 'R12D': (32, 'GPR'), 'R13D': (32, 'GPR'),
-        'R14D': (32, 'GPR'), 'R15D': (32, 'GPR'), 'RAX': (64, 'GPR'), 'RBX': (64, 'GPR'),
-        'RCX': (64, 'GPR'), 'RDX': (64, 'GPR'), 'RBP': (64, 'GPR'), 'RSI': (64, 'GPR'),
-        'RDI': (64, 'GPR'), 'RSP': (64, 'GPR'), 'R8': (64, 'GPR'), 'R9': (64, 'GPR'),
-        'R10': (64, 'GPR'), 'R11': (64, 'GPR'), 'R12': (64, 'GPR'), 'R13': (64, 'GPR'),
-        'R14': (64, 'GPR'), 'R15': (64, 'GPR'), 'CS': (16, 'GPR'), 'DS': (16, 'GPR'),
-        'SS': (16, 'GPR'), 'ES': (16, 'GPR'), 'FS': (16, 'GPR'), 'GS': (16, 'GPR'),
-        'EFLAGS': (32, 'GPR'), 'RFLAGS': (64, 'GPR'), 'EIP': (32, 'GPR'), 'RIP': (64, 'GPR'),
-        # FPU Registers
-        'ST0': (80, 'FPU'), 'ST1': (80, 'FPU'), 'ST2': (80, 'FPU'), 'ST3': (80, 'FPU'),
-        'ST4': (80, 'FPU'), 'ST5': (80, 'FPU'), 'ST6': (80, 'FPU'), 'ST7': (80, 'FPU'),
-        # MMX Registers
-        'MM0': (64, 'MMX'), 'MM1': (64, 'MMX'), 'MM2': (64, 'MMX'), 'MM3': (64, 'MMX'),
-        'MM4': (64, 'MMX'), 'MM5': (64, 'MMX'), 'MM6': (64, 'MMX'), 'MM7': (64, 'MMX'),
-        # XMM Registers
-        'XMM0': (128, 'XMM'), 'XMM1': (128, 'XMM'), 'XMM2': (128, 'XMM'), 'XMM3': (128, 'XMM'),
-        'XMM4': (128, 'XMM'), 'XMM5': (128, 'XMM'), 'XMM6': (128, 'XMM'), 'XMM7': (128, 'XMM'),
-        'XMM8': (128, 'XMM'), 'XMM9': (128, 'XMM'), 'XMM10': (128, 'XMM'), 'XMM11': (128, 'XMM'),
-        'XMM12': (128, 'XMM'), 'XMM13': (128, 'XMM'), 'XMM14': (128, 'XMM'), 'XMM15': (128, 'XMM'),
-        'XMM16': (128, 'XMM'), 'XMM17': (128, 'XMM'), 'XMM18': (128, 'XMM'), 'XMM19': (128, 'XMM'),
-        'XMM20': (128, 'XMM'), 'XMM21': (128, 'XMM'), 'XMM22': (128, 'XMM'), 'XMM23': (128, 'XMM'),
-        'XMM24': (128, 'XMM'), 'XMM25': (128, 'XMM'), 'XMM26': (128, 'XMM'), 'XMM27': (128, 'XMM'),
-        'XMM28': (128, 'XMM'), 'XMM29': (128, 'XMM'), 'XMM30': (128, 'XMM'), 'XMM31': (128, 'XMM'),
-        # YMM Registers
-        'YMM0': (256, 'YMM'), 'YMM1': (256, 'YMM'), 'YMM2': (256, 'YMM'), 'YMM3': (256, 'YMM'),
-        'YMM4': (256, 'YMM'), 'YMM5': (256, 'YMM'), 'YMM6': (256, 'YMM'), 'YMM7': (256, 'YMM'),
-        'YMM8': (256, 'YMM'), 'YMM9': (256, 'YMM'), 'YMM10': (256, 'YMM'), 'YMM11': (256, 'YMM'),
-        'YMM12': (256, 'YMM'), 'YMM13': (256, 'YMM'), 'YMM14': (256, 'YMM'), 'YMM15': (256, 'YMM'),
-        'YMM16': (256, 'YMM'), 'YMM17': (256, 'YMM'), 'YMM18': (256, 'YMM'), 'YMM19': (256, 'YMM'),
-        'YMM20': (256, 'YMM'), 'YMM21': (256, 'YMM'), 'YMM22': (256, 'YMM'), 'YMM23': (256, 'YMM'),
-        'YMM24': (256, 'YMM'), 'YMM25': (256, 'YMM'), 'YMM26': (256, 'YMM'), 'YMM27': (256, 'YMM'),
-        'YMM28': (256, 'YMM'), 'YMM29': (256, 'YMM'), 'YMM30': (256, 'YMM'), 'YMM31': (256, 'YMM'),
-        # ZMM Registers
-        'ZMM0': (512, 'ZMM'), 'ZMM1': (512, 'ZMM'), 'ZMM2': (512, 'ZMM'), 'ZMM3': (512, 'ZMM'),
-        'ZMM4': (512, 'ZMM'), 'ZMM5': (512, 'ZMM'), 'ZMM6': (512, 'ZMM'), 'ZMM7': (512, 'ZMM'),
-        'ZMM8': (512, 'ZMM'), 'ZMM9': (512, 'ZMM'), 'ZMM10': (512, 'ZMM'), 'ZMM11': (512, 'ZMM'),
-        'ZMM12': (512, 'ZMM'), 'ZMM13': (512, 'ZMM'), 'ZMM14': (512, 'ZMM'), 'ZMM15': (512, 'ZMM'),
-        'ZMM16': (512, 'ZMM'), 'ZMM17': (512, 'ZMM'), 'ZMM18': (512, 'ZMM'), 'ZMM19': (512, 'ZMM'),
-        'ZMM20': (512, 'ZMM'), 'ZMM21': (512, 'ZMM'), 'ZMM22': (512, 'ZMM'), 'ZMM23': (512, 'ZMM'),
-        'ZMM24': (512, 'ZMM'), 'ZMM25': (512, 'ZMM'), 'ZMM26': (512, 'ZMM'), 'ZMM27': (512, 'ZMM'),
-        'ZMM28': (512, 'ZMM'), 'ZMM29': (512, 'ZMM'), 'ZMM30': (512, 'ZMM'), 'ZMM31': (512, 'ZMM'),
-        # Opmask Register
-        'K0': (64, 'K'), 'K1': (64, 'K'), 'K2': (64, 'K'), 'K3': (64, 'K'), 'K4': (64, 'K'),
-        'K5': (64, 'K'), 'K6': (64, 'K'), 'K7': (64, 'K'),
-        # Bounds Registers
-        'BND0': (128, 'BND'), 'BND1': (128, 'BND'), 'BND2': (128, 'BND'), 'BND3': (128, 'BND'),
-        # Registers in gerneral
-        'R16': (16, 'GPR'), 'R32': (32, 'GPR'), 'R64': (64, 'GPR'), 'FPU': (80, 'FPU'),
-        'MMX': (64, 'MMX'), 'XMM': (128, 'XMM'), 'YMM': (256, 'YMM'), 'ZMM': (512, 'ZMM'),
-        'K': (64, 'K'), 'BND': (128, 'BND')
-    }
-
-    def __init__(self, name, mask=False):
-        super().__init__("REG")
-        self.name = name.upper()
-        self.mask = mask
-        if self.name in self.sizes:
-            self.size = self.sizes[self.name][0]
-            self.reg_type = self.sizes[self.name][1]
-        else:
-            raise NameError('Register name not in dictionary: {}'.format(self.name))
-
-    def __str__(self):
-        """Return string representation."""
-        opmask = ''
-        if self.mask:
-            opmask = '{opmask}'
-        return self.reg_type + opmask
--- a/osaca/parser/init.py
+++ b/osaca/parser/init.py
@@ -0,0 +1,19 @@
+"""
+Collection of parsers supported by OSACA.
+
+Only the parsers listed below are exported, so please add new parsers to __all__.
+"""
+from .attr_dict import AttrDict
+from .base_parser import BaseParser
+from .parser_x86att import ParserX86ATT
+from .parser_AArch64v81 import ParserAArch64v81
+
+__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64v81', 'get_parser']
+
+def get_parser(isa):
+    """
+    Create the parser that belongs to an instruction set architecture.
+
+    :param str isa: ISA name, compared case-insensitively ('x86' or 'aarch64')
+    :return: new :class:`ParserX86ATT` or :class:`ParserAArch64v81` instance
+    :raises ValueError: if no parser exists for `isa`
+    """
+    isa_name = isa.lower()
+    if isa_name == 'x86':
+        return ParserX86ATT()
+    if isa_name == 'aarch64':
+        return ParserAArch64v81()
+    raise ValueError("Unknown ISA {!r}.".format(isa))
--- a/osaca/parser/attr_dict.py
+++ b/osaca/parser/attr_dict.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+
+class AttrDict(dict):
+    """Dictionary whose entries can also be read and written as attributes."""
+
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        # share the storage: attribute access and item access hit the same mapping
+        self.__dict__ = self
+
+    @staticmethod
+    def convert_dict(dictionary):
+        """
+        Recursively turn dictionaries (also inside lists) into AttrDicts.
+
+        Nested values of a passed dictionary are replaced in place; anything that is
+        neither a list nor a dictionary is returned untouched.
+
+        :param dictionary: dictionary, list or arbitrary other value
+        :return: AttrDict for a dictionary, new list for a list, the value itself otherwise
+        """
+        if isinstance(dictionary, list):
+            return [AttrDict.convert_dict(item) for item in dictionary]
+        if not isinstance(dictionary, dict):
+            return dictionary
+        for key in list(dictionary.keys()):
+            value = dictionary[key]
+            # AttrDict is a dict subclass, so one check covers both
+            if isinstance(value, (dict, list)):
+                dictionary[key] = AttrDict.convert_dict(value)
+        return AttrDict(dictionary)
--- a/osaca/parser/base_parser.py
+++ b/osaca/parser/base_parser.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+
+class BaseParser(object):
+    """
+    Common interface of the assembly parsers.
+
+    Derived classes supply the grammar and the per-line parsing; this class only
+    provides the operand type identifiers and :meth:`parse_file`.
+    """
+
+    # Identifiers for operand types
+    COMMENT_ID = 'comment'
+    DIRECTIVE_ID = 'directive'
+    IMMEDIATE_ID = 'immediate'
+    LABEL_ID = 'label'
+    MEMORY_ID = 'memory'
+    REGISTER_ID = 'register'
+    INSTRUCTION_ID = 'instruction'
+    OPERANDS_ID = 'operands'
+
+    def __init__(self):
+        # let the concrete parser build its grammar
+        self.construct_parser()
+
+    def parse_file(self, file_content, start_line=0):
+        '''
+        Parse an assembly file line by line, skipping blank lines. The marked kernel is
+        *not* extracted here; every non-blank line is parsed into an instruction form.
+
+        :param str file_content: assembly code
+        :param int start_line: offset, if first line in file_content is meant to be not 1
+        :return: list of instruction forms
+        '''
+        return [
+            self.parse_line(text, position + 1 + start_line)
+            for position, text in enumerate(file_content.split('\n'))
+            if text.strip() != ''
+        ]
+
+    def parse_line(self, line, line_number=None):
+        """Parse one line of assembly; done in derived classes."""
+        raise NotImplementedError()
+
+    def parse_instruction(self, instruction):
+        """Parse one instruction; done in derived classes."""
+        raise NotImplementedError()
+
+    def parse_register(self, register_string):
+        """Parse a register given as string; done in derived classes."""
+        raise NotImplementedError()
+
+    def is_gpr(self, register):
+        """Tell whether `register` is a general purpose register; done in derived classes."""
+        raise NotImplementedError()
+
+    def is_vector_register(self, register):
+        """Tell whether `register` is a vector register; done in derived classes."""
+        raise NotImplementedError()
+
+    def get_reg_type(self, register):
+        """Return the type of `register`; done in derived classes."""
+        raise NotImplementedError()
+
+    def construct_parser(self):
+        """Hook called by ``__init__`` to build the grammar; a no-op in the base class."""
+        # raise NotImplementedError
+        return None
+
+    ##################
+    # Helper functions
+    ##################
+
+    def process_operand(self, operand):
+        """Post-process a parsed operand; done in derived classes."""
+        raise NotImplementedError()
+
+    def get_full_reg_name(self, register):
+        """Return the full name of `register`; done in derived classes."""
+        raise NotImplementedError()
+
+    def normalize_imd(self, imd):
+        """Normalize an immediate; done in derived classes."""
+        raise NotImplementedError()
+
+    def is_reg_dependend_of(self, reg_a, reg_b):
+        """Tell whether two registers depend on each other; done in derived classes."""
+        raise NotImplementedError()
--- a/osaca/parser/parser_AArch64v81.py
+++ b/osaca/parser/parser_AArch64v81.py
@@ -0,0 +1,421 @@
+#!/usr/bin/env python3
+
+
+import pyparsing as pp
+
+from osaca.parser import AttrDict, BaseParser
+
+
+class ParserAArch64v81(BaseParser):
+    def __init__(self):
+        """Set up the parser; the base class constructor triggers ``construct_parser``."""
+        super(ParserAArch64v81, self).__init__()
+
+    def construct_parser(self):
+        """
+        Build the pyparsing grammars for AArch64 assembly.
+
+        Sets ``self.comment``, ``self.label``, ``self.directive``, ``self.list_element``
+        and ``self.instruction_parser``; all other grammar elements stay local.
+        """
+        # Comment
+        # '//' followed by any number of printable words, grouped under COMMENT_ID
+        symbol_comment = '//'
+        self.comment = pp.Literal(symbol_comment) + pp.Group(
+            pp.ZeroOrMore(pp.Word(pp.printables))
+        ).setResultsName(self.COMMENT_ID)
+        # Define ARM assembly identifier
+        # optional ':<relocation>:' prefix, then a name starting with a letter, '_' or '.'
+        relocation = pp.Combine(pp.Literal(':') + pp.Word(pp.alphanums + '_') + pp.Literal(':'))
+        first = pp.Word(pp.alphas + '_.', exact=1)
+        rest = pp.Word(pp.alphanums + '_.')
+        identifier = pp.Group(
+            pp.Optional(relocation).setResultsName('relocation')
+            + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
+        ).setResultsName('identifier')
+        # Label
+        # identifier followed by ':' and an optional trailing comment
+        self.label = pp.Group(
+            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
+        ).setResultsName(self.LABEL_ID)
+        # Directive
+        decimal_number = pp.Combine(
+            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
+        ).setResultsName('value')
+        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
+        directive_option = pp.Combine(
+            pp.Word(pp.alphas + '#@.%', exact=1)
+            + pp.Optional(pp.Word(pp.printables + ' ', excludeChars=','))
+        )
+        directive_parameter = (
+            pp.quotedString | directive_option | identifier | hex_number | decimal_number
+        )
+        commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
+        # '.' + directive name + comma separated parameters + optional comment
+        self.directive = pp.Group(
+            pp.Literal('.')
+            + pp.Word(pp.alphanums + '_').setResultsName('name')
+            + commaSeparatedList.setResultsName('parameters')
+            + pp.Optional(self.comment)
+        ).setResultsName(self.DIRECTIVE_ID)
+
+        ##############################
+        # Instructions
+        # Mnemonic
+        # (?P<instr>[a-zA-Z][a-zA-Z0-9]*)(?P<setflg>S?)(?P<CC>.[a-zA-Z]{2})
+        mnemonic = pp.Word(pp.alphanums + '.').setResultsName('mnemonic')
+        # Immediate:
+        # int: ^-?[0-9]+ | hex: ^0x[0-9a-fA-F]+ | fp: ^[0-9]{1}.[0-9]+[eE]{1}[\+-]{1}[0-9]+[fF]?
+        symbol_immediate = '#'
+        mantissa = pp.Combine(
+            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums) + pp.Literal('.') + pp.Word(pp.nums)
+        ).setResultsName('mantissa')
+        exponent = (
+            pp.CaselessLiteral('e')
+            + pp.Word('+-').setResultsName('e_sign')
+            + pp.Word(pp.nums).setResultsName('exponent')
+        )
+        # a float carries a trailing 'f', a double does not
+        float_ = pp.Group(
+            mantissa + pp.Optional(exponent) + pp.CaselessLiteral('f')
+        ).setResultsName('float')
+        double_ = pp.Group(mantissa + pp.Optional(exponent)).setResultsName('double')
+        # the '#' marker is optional; an identifier is accepted in place of a number
+        immediate = pp.Group(
+            pp.Optional(pp.Literal(symbol_immediate))
+            + (hex_number ^ decimal_number ^ float_ ^ double_)
+            | (pp.Optional(pp.Literal(symbol_immediate)) + identifier)
+        ).setResultsName(self.IMMEDIATE_ID)
+        shift_op = (
+            pp.CaselessLiteral('lsl')
+            ^ pp.CaselessLiteral('lsr')
+            ^ pp.CaselessLiteral('asr')
+            ^ pp.CaselessLiteral('ror')
+            ^ pp.CaselessLiteral('sxtw')
+            ^ pp.CaselessLiteral('uxtw')
+        )
+        # immediate followed by ', <shift_op> <immediate>'
+        arith_immediate = pp.Group(
+            immediate.setResultsName('base_immediate')
+            + pp.Suppress(pp.Literal(','))
+            + shift_op.setResultsName('shift_op')
+            + immediate.setResultsName('shift')
+        ).setResultsName(self.IMMEDIATE_ID)
+        # Register:
+        # scalar: [XWBHSDQ][0-9]{1,2}  |   vector: V[0-9]{1,2}\.[12468]{1,2}[BHSD]()?
+        # define SP and ZR register aliases as regex, because pyparsing does not support
+        # proper lookahead
+        alias_r31_sp = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))')
+        alias_r31_zr = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))')
+        scalar = pp.Word(pp.alphas, exact=1).setResultsName('prefix') + pp.Word(
+            pp.nums
+        ).setResultsName('name')
+        index = pp.Literal('[') + pp.Word(pp.nums).setResultsName('index') + pp.Literal(']')
+        vector = (
+            pp.CaselessLiteral('v').setResultsName('prefix')
+            + pp.Word(pp.nums).setResultsName('name')
+            + pp.Literal('.')
+            + pp.Optional(pp.Word('12468')).setResultsName('lanes')
+            + pp.Word(pp.alphas, exact=1).setResultsName('shape')
+            + pp.Optional(index)
+        )
+        # kept on the instance: substitute_register_list re-parses list entries with it
+        self.list_element = vector ^ scalar
+        # '{...}' holding either a comma separated list or a '-' separated range
+        register_list = (
+            pp.Literal('{')
+            + (
+                pp.delimitedList(pp.Combine(self.list_element), delim=',').setResultsName('list')
+                ^ pp.delimitedList(pp.Combine(self.list_element), delim='-').setResultsName(
+                    'range'
+                )
+            )
+            + pp.Literal('}')
+            + pp.Optional(index)
+        )
+        # any register form, optionally followed by ', <shift_op> <immediate>'
+        register = pp.Group(
+            (alias_r31_sp | alias_r31_zr | vector | scalar | register_list)
+            + pp.Optional(
+                pp.Suppress(pp.Literal(','))
+                + shift_op.setResultsName('shift_op')
+                + immediate.setResultsName('shift')
+            )
+        ).setResultsName(self.REGISTER_ID)
+        # Memory
+        register_index = register.setResultsName('index') + pp.Optional(
+            pp.Literal(',') + pp.Word(pp.alphas) + immediate.setResultsName('scale')
+        )
+        # '[base, index-or-offset]' followed by '!' (pre-indexed) or ', <immediate>'
+        # (post-indexed)
+        memory = pp.Group(
+            pp.Literal('[')
+            + pp.Optional(register.setResultsName('base'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(register_index ^ immediate.setResultsName('offset'))
+            + pp.Literal(']')
+            + pp.Optional(
+                pp.Literal('!').setResultsName('pre_indexed')
+                | (pp.Suppress(pp.Literal(',')) + immediate.setResultsName('post_indexed'))
+            )
+        ).setResultsName(self.MEMORY_ID)
+        # prefetch operation: <type><target><policy>, e.g. PLD + L1 + KEEP
+        prefetch_op = pp.Group(
+            pp.Group(pp.CaselessLiteral('PLD') ^ pp.CaselessLiteral('PST')).setResultsName('type')
+            + pp.Group(
+                pp.CaselessLiteral('L1') ^ pp.CaselessLiteral('L2') ^ pp.CaselessLiteral('L3')
+            ).setResultsName('target')
+            + pp.Group(pp.CaselessLiteral('KEEP') ^ pp.CaselessLiteral('STRM')).setResultsName(
+                'policy'
+            )
+        ).setResultsName('prfop')
+        # Combine to instruction form
+        # only the first operand may be a prefetch operation
+        operand_first = pp.Group(
+            register ^ (prefetch_op | immediate) ^ memory ^ arith_immediate ^ identifier
+        )
+        operand_rest = pp.Group((register ^ immediate ^ memory ^ arith_immediate) | identifier)
+        # mnemonic, up to four operands with optional separating commas, optional comment
+        self.instruction_parser = (
+            mnemonic
+            + pp.Optional(operand_first.setResultsName('operand1'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand2'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand3'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand4'))
+            + pp.Optional(self.comment)
+        )
+
+    def parse_line(self, line, line_number=None):
+        """
+        Parse line and return instruction form.
+
+        The line is tried as a comment, a label, a directive and finally as an
+        instruction; the first grammar that matches the whole line fills the result.
+
+        :param str line: line of assembly code
+        :param int line_number: default None, identifier of instruction form
+        :return: parsed instruction form
+        :raises ValueError: if the line matches none of the grammars
+        """
+        instruction_form = AttrDict(
+            {
+                self.INSTRUCTION_ID: None,
+                self.OPERANDS_ID: None,
+                self.DIRECTIVE_ID: None,
+                self.COMMENT_ID: None,
+                self.LABEL_ID: None,
+                'line': line.strip(),
+                'line_number': line_number,
+            }
+        )
+        result = None
+
+        # 1. Parse comment
+        try:
+            result = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
+            result = AttrDict.convert_dict(result)
+            instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
+        except pp.ParseException:
+            pass
+
+        # 2. Parse label
+        if result is None:
+            try:
+                result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
+                result = AttrDict.convert_dict(result)
+                instruction_form[self.LABEL_ID] = result[self.LABEL_ID].name
+                if self.COMMENT_ID in result[self.LABEL_ID]:
+                    instruction_form[self.COMMENT_ID] = ' '.join(
+                        result[self.LABEL_ID][self.COMMENT_ID]
+                    )
+            except pp.ParseException:
+                pass
+
+        # 3. Parse directive
+        if result is None:
+            try:
+                result = self.process_operand(
+                    self.directive.parseString(line, parseAll=True).asDict()
+                )
+                result = AttrDict.convert_dict(result)
+                instruction_form[self.DIRECTIVE_ID] = AttrDict(
+                    {
+                        'name': result[self.DIRECTIVE_ID].name,
+                        'parameters': result[self.DIRECTIVE_ID].parameters,
+                    }
+                )
+                if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
+                    instruction_form[self.COMMENT_ID] = ' '.join(
+                        result[self.DIRECTIVE_ID][self.COMMENT_ID]
+                    )
+            except pp.ParseException:
+                pass
+
+        # 4. Parse instruction
+        if result is None:
+            try:
+                result = self.parse_instruction(line)
+            except (pp.ParseException, KeyError) as e:
+                # Without a parse result there is nothing to copy below (subscripting
+                # None would raise a TypeError), so report the offending line instead.
+                raise ValueError(
+                    'Could not parse instruction on line {}: {!r}'.format(line_number, line)
+                ) from e
+            instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
+            instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
+            instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]
+
+        return instruction_form
+
+    def parse_instruction(self, instruction):
+        """
+        Parse one instruction line into mnemonic, operands and comment.
+
+        :param str instruction: assembly line holding an instruction
+        :return: AttrDict with the entries INSTRUCTION_ID, OPERANDS_ID and COMMENT_ID
+        """
+        parsed = AttrDict.convert_dict(
+            self.instruction_parser.parseString(instruction, parseAll=True).asDict()
+        )
+        # up to four operands, kept in source order
+        operands = [
+            self.process_operand(parsed[key])
+            for key in ('operand1', 'operand2', 'operand3', 'operand4')
+            if key in parsed
+        ]
+        mnemonic = parsed.mnemonic
+        comment = None
+        if self.COMMENT_ID in parsed:
+            comment = ' '.join(parsed[self.COMMENT_ID])
+        return AttrDict(
+            {
+                self.INSTRUCTION_ID: mnemonic,
+                self.OPERANDS_ID: operands,
+                self.COMMENT_ID: comment,
+            }
+        )
+
+    def process_operand(self, operand):
+        """
+        Restructure a parsed operand depending on its kind.
+
+        Memory addresses, register lists/ranges, the 'sp' register, immediates and labels
+        are handed to their ``substitute_*`` helper; everything else is returned as is.
+        """
+        # structure memory addresses
+        if self.MEMORY_ID in operand:
+            return self.substitute_memory_address(operand[self.MEMORY_ID])
+        if self.REGISTER_ID in operand:
+            register = operand[self.REGISTER_ID]
+            # structure register lists
+            if 'list' in register or 'range' in register:
+                # TODO: discuss if ranges should be converted to lists
+                return self.substitute_register_list(register)
+            if register['name'] == 'sp':
+                return self.substitute_sp_register(register)
+        # add value attribute to floating point immediates without exponent
+        if self.IMMEDIATE_ID in operand:
+            return self.substitute_immediate(operand[self.IMMEDIATE_ID])
+        if self.LABEL_ID in operand:
+            return self.substitute_label(operand[self.LABEL_ID])
+        return operand
+
+    def substitute_memory_address(self, memory_address):
+        """
+        Reduce a parsed memory operand to offset, base, index and scale.
+
+        The scale is 1 unless the index register carries an lsl/uxtw/sxtw shift, in which
+        case it is 2 to the power of the shift value. 'pre_indexed' and 'post_indexed'
+        are carried over when present.
+
+        :return: AttrDict with the restructured address stored under MEMORY_ID
+        """
+        base = memory_address.get('base')
+        index = memory_address.get('index')
+        # a register named 'sp' always gets the 'x' prefix
+        for reg in (base, index):
+            if reg is not None and 'name' in reg and reg['name'] == 'sp':
+                reg['prefix'] = 'x'
+        scale = 1
+        if 'index' in memory_address and 'shift' in memory_address['index']:
+            shifted = memory_address['index']
+            if shifted['shift_op'].lower() in ['lsl', 'uxtw', 'sxtw']:
+                scale = 2 ** int(shifted['shift']['value'])
+        address = AttrDict(
+            {
+                'offset': memory_address.get('offset'),
+                'base': base,
+                'index': index,
+                'scale': scale,
+            }
+        )
+        if 'pre_indexed' in memory_address:
+            address['pre_indexed'] = True
+        if 'post_indexed' in memory_address:
+            address['post_indexed'] = memory_address['post_indexed']
+        return AttrDict({self.MEMORY_ID: address})
+
+    def substitute_sp_register(self, register):
+        """Give the passed register the prefix 'x' (in place) and wrap it under REGISTER_ID."""
+        register['prefix'] = 'x'
+        return AttrDict({self.REGISTER_ID: register})
+
+    def substitute_register_list(self, register_list):
+        """
+        Re-parse the entries of a register list or range into register dictionaries.
+
+        :param register_list: parsed register operand holding a 'list' or 'range' entry
+        :return: AttrDict with the parsed entries and the optional 'index' under REGISTER_ID
+        """
+        # 'range' wins if both entries exist
+        key = ''
+        for candidate in ('list', 'range'):
+            if candidate in register_list:
+                key = candidate
+        elements = [
+            AttrDict.convert_dict(self.list_element.parseString(item, parseAll=True).asDict())
+            for item in register_list[key]
+        ]
+        return AttrDict(
+            {self.REGISTER_ID: AttrDict({key: elements, 'index': register_list.get('index')})}
+        )
+
+    def substitute_immediate(self, immediate):
+        """
+        Normalize a parsed immediate.
+
+        Identifiers are returned as they are. Integer values, arithmetic immediates and
+        floating point immediates with an exponent are wrapped under IMMEDIATE_ID
+        unchanged. A floating point immediate without exponent is reduced to
+        ``{'value': <mantissa>}``.
+        """
+        if 'identifier' in immediate:
+            # actually an identifier, change declaration
+            return immediate
+        if 'value' in immediate or 'base_immediate' in immediate:
+            # normal integer value or arithmetic immediate, nothing to do
+            return AttrDict({self.IMMEDIATE_ID: immediate})
+        # floating point immediate; 'double' wins if both entries exist
+        if 'double' in immediate:
+            fp_key = 'double'
+        elif 'float' in immediate:
+            fp_key = 'float'
+        else:
+            fp_key = ''
+        if 'exponent' not in immediate[fp_key]:
+            # change 'mantissa' key to 'value'
+            mantissa = immediate[fp_key]['mantissa']
+            return AttrDict({self.IMMEDIATE_ID: AttrDict({'value': mantissa})})
+        # nothing to do
+        return AttrDict({self.IMMEDIATE_ID: immediate})
+
+    def substitute_label(self, label):
+        """Flatten the nested identifier name of a label (in place) and wrap it under LABEL_ID."""
+        # the identifier grammar nests the actual name one level deeper
+        identifier = label['name']
+        label['name'] = identifier['name']
+        return AttrDict({self.LABEL_ID: label})
+
+    def get_full_reg_name(self, register):
+        """
+        Compose the textual register name from its parsed parts.
+
+        :param register: dictionary with 'prefix' and 'name', optionally 'lanes' and 'shape'
+        :return: '<prefix><name>' or, if 'lanes' is present, '<prefix><name>.<lanes><shape>'
+        """
+        full_name = register['prefix'] + str(register['name'])
+        if 'lanes' not in register:
+            return full_name
+        return full_name + '.' + str(register['lanes']) + register['shape']
+
+    def normalize_imd(self, imd):
+        """
+        Convert a parsed immediate into a number.
+
+        :param imd: immediate dictionary with a 'value', 'float' or 'double' entry
+        :return: int for integer values (decimal or '0x' hexadecimal), float for floating
+                 point values; anything else (an identifier) is returned unchanged
+        """
+        if 'value' in imd:
+            value = imd['value']
+            if value.lower().startswith('0x'):
+                # hex, return decimal
+                return int(value, 16)
+            try:
+                return int(value, 10)
+            except ValueError:
+                # substitute_immediate stores the mantissa of a floating point immediate
+                # without exponent (e.g. '1.5') under 'value' as well
+                return float(value)
+        elif 'float' in imd:
+            return self.ieee_to_int(imd['float'])
+        elif 'double' in imd:
+            return self.ieee_to_int(imd['double'])
+        # identifier
+        return imd
+
+    def ieee_to_int(self, ieee_val):
+        """
+        Convert a parsed floating point immediate into its numeric value.
+
+        :param ieee_val: dictionary with 'mantissa' and, optionally, 'e_sign' and 'exponent'
+        :return: ``mantissa * 10 ** exponent`` as float (despite the method name)
+        """
+        # the exponent part is optional in the grammar; without it the factor is 10 ** 0
+        exponent = int(ieee_val.get('exponent', '0'), 10)
+        if ieee_val.get('e_sign') == '-':
+            exponent *= -1
+        return float(ieee_val['mantissa']) * (10 ** exponent)
+
+    def parse_register(self, register_string):
+        """Stand-alone register parsing is not implemented for this parser."""
+        raise NotImplementedError()
+
+    def is_gpr(self, register):
+        """
+        Check whether a register is a general purpose register.
+
+        :param register: register dictionary with a single-letter 'prefix' entry
+        :return: True for the prefixes 'w' and 'x' (upper or lower case), False otherwise
+        """
+        # Compare against a tuple: `in 'wx'` is a substring test and would accept ''.
+        # Lower-casing matches is_reg_dependend_of, as the grammar allows upper case.
+        return register['prefix'].lower() in ('w', 'x')
+
+    def is_vector_register(self, register):
+        """
+        Check whether a register belongs to the vector/floating point register file.
+
+        :param register: register dictionary with a single-letter 'prefix' entry
+        :return: True for the prefixes b, h, s, d, q and v (upper or lower case)
+        """
+        # Compare against a tuple: `in 'bhsdqv'` is a substring test and would accept ''.
+        # Lower-casing matches is_reg_dependend_of, as the grammar allows upper case.
+        return register['prefix'].lower() in ('b', 'h', 's', 'd', 'q', 'v')
+
+    def is_reg_dependend_of(self, reg_a, reg_b):
+        """
+        Check whether two registers refer to the same underlying register.
+
+        :return: True if both share the same 'name' and both prefixes belong to the same
+                 family (w/x or b/h/s/d/q/v, compared case-insensitively), else False
+        """
+        if reg_a['name'] != reg_b['name']:
+            return False
+        for family in ('wx', 'bhsdqv'):
+            if reg_a['prefix'].lower() in family and reg_b['prefix'].lower() in family:
+                return True
+        return False
+
+    def get_reg_type(self, register):
+        """Return the register's 'prefix' entry, which this parser uses as register type."""
+        return register['prefix']
--- a/osaca/parser/parser_x86att.py
+++ b/osaca/parser/parser_x86att.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+
+import pyparsing as pp
+
+from osaca.parser import AttrDict, BaseParser
+
+
+class ParserX86ATT(BaseParser):
+    def __init__(self):
+        """Set up the parser; the base class constructor triggers ``construct_parser``."""
+        super(ParserX86ATT, self).__init__()
+
+    def construct_parser(self):
+        """
+        Build the pyparsing grammars for x86 assembly in AT&T syntax.
+
+        Sets ``self.comment``, ``self.label``, ``self.register``, ``self.directive`` and
+        ``self.instruction_parser``; all other grammar elements stay local.
+        """
+        decimal_number = pp.Combine(
+            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
+        ).setResultsName('value')
+        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
+        # Comment
+        # '#' followed by any number of printable words, grouped under COMMENT_ID
+        symbol_comment = '#'
+        self.comment = pp.Literal(symbol_comment) + pp.Group(
+            pp.ZeroOrMore(pp.Word(pp.printables))
+        ).setResultsName(self.COMMENT_ID)
+        # Define x86 assembly identifier
+        # optional '<number>+' offset, then a name starting with a letter, '_' or '.'
+        id_offset = pp.Word(pp.nums) + pp.Suppress(pp.Literal('+'))
+        first = pp.Word(pp.alphas + '_.', exact=1)
+        rest = pp.Word(pp.alphanums + '$_.')
+        identifier = pp.Group(
+            pp.Optional(id_offset).setResultsName('offset')
+            + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
+        ).setResultsName('identifier')
+        # Label
+        # identifier followed by ':' and an optional trailing comment
+        self.label = pp.Group(
+            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
+        ).setResultsName(self.LABEL_ID)
+        # Register: pp.Regex('^%[0-9a-zA-Z]+,?')
+        # '%<name>' with an optional '{%<mask>}' suffix
+        self.register = pp.Group(
+            pp.Literal('%')
+            + pp.Word(pp.alphanums).setResultsName('name')
+            + pp.Optional(
+                pp.Literal('{')
+                + pp.Literal('%')
+                + pp.Word(pp.alphanums).setResultsName('mask')
+                + pp.Literal('}')
+            )
+        ).setResultsName(self.REGISTER_ID)
+        # Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
+        symbol_immediate = '$'
+        immediate = pp.Group(
+            pp.Literal(symbol_immediate) + (hex_number | decimal_number | identifier)
+        ).setResultsName(self.IMMEDIATE_ID)
+        # Memory: offset(base, index, scale)
+        offset = pp.Group(identifier | hex_number | decimal_number).setResultsName(
+            self.IMMEDIATE_ID
+        )
+        scale = pp.Word('1248', exact=1)
+        # every part inside the parentheses, and the offset in front of them, is optional
+        memory = pp.Group(
+            pp.Optional(offset.setResultsName('offset'))
+            + pp.Literal('(')
+            + pp.Optional(self.register.setResultsName('base'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(self.register.setResultsName('index'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(scale.setResultsName('scale'))
+            + pp.Literal(')')
+        ).setResultsName(self.MEMORY_ID)
+
+        # Directive
+        directive_option = pp.Combine(
+            pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
+        )
+        directive_parameter = (pp.quotedString | directive_option | identifier | hex_number |
+                               decimal_number | self.register
+        )
+        commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
+        # '.' + directive name + comma separated parameters + optional comment
+        self.directive = pp.Group(
+            pp.Literal('.')
+            + pp.Word(pp.alphanums + '_').setResultsName('name')
+            + commaSeparatedList.setResultsName('parameters')
+            + pp.Optional(self.comment)
+        ).setResultsName(self.DIRECTIVE_ID)
+
+        # Instructions
+        # Mnemonic
+        # any number of leading 'data16'/'data32' tokens is accepted before the mnemonic
+        mnemonic = pp.ZeroOrMore(pp.Literal('data16') | pp.Literal('data32')) + pp.Word(
+            pp.alphanums
+        ).setResultsName('mnemonic')
+        # Combine to instruction form
+        # only the first operand may be a bare identifier
+        operand_first = pp.Group(self.register ^ immediate ^ memory ^ identifier)
+        operand_rest = pp.Group(self.register ^ immediate ^ memory)
+        # mnemonic, up to four operands with optional separating commas, optional comment
+        self.instruction_parser = (
+            mnemonic
+            + pp.Optional(operand_first.setResultsName('operand1'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand2'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand3'))
+            + pp.Optional(pp.Suppress(pp.Literal(',')))
+            + pp.Optional(operand_rest.setResultsName('operand4'))
+            + pp.Optional(self.comment)
+        )
+
+    def parse_register(self, register_string):
+        """
+        Parse a register given as string, e.g. taken from an operand.
+
+        :param str register_string: text that should consist of exactly one register
+        :return: processed register dictionary, or None if the text is not a register
+        """
+        try:
+            parsed = self.register.parseString(register_string, parseAll=True).asDict()
+            return self.process_operand(parsed)
+        except pp.ParseException:
+            return None
+
+    def parse_line(self, line, line_number=None):
+        """
+        Turn one line of assembly into an instruction form.
+
+        The grammars are tried in the order comment, label, directive, instruction; the
+        first one that matches the whole line fills the instruction form.
+
+        :param str line: line of assembly code
+        :param int line_number: default None, identifier of instruction form
+        :return: parsed instruction form
+        :raises ValueError: if the line does not match the instruction grammar either
+        """
+        form = AttrDict(
+            {
+                self.INSTRUCTION_ID: None,
+                self.OPERANDS_ID: None,
+                self.DIRECTIVE_ID: None,
+                self.COMMENT_ID: None,
+                self.LABEL_ID: None,
+                'line': line.strip(),
+                'line_number': line_number,
+            }
+        )
+
+        # 1. Parse comment
+        try:
+            parsed = self.comment.parseString(line, parseAll=True).asDict()
+            parsed = AttrDict.convert_dict(self.process_operand(parsed))
+            form[self.COMMENT_ID] = ' '.join(parsed[self.COMMENT_ID])
+            return form
+        except pp.ParseException:
+            pass
+
+        # 2. Parse label
+        try:
+            parsed = self.label.parseString(line, parseAll=True).asDict()
+            parsed = AttrDict.convert_dict(self.process_operand(parsed))
+            label = parsed[self.LABEL_ID]
+            form[self.LABEL_ID] = label['name']
+            if self.COMMENT_ID in label:
+                form[self.COMMENT_ID] = ' '.join(label[self.COMMENT_ID])
+            return form
+        except pp.ParseException:
+            pass
+
+        # 3. Parse directive
+        try:
+            parsed = self.directive.parseString(line, parseAll=True).asDict()
+            parsed = AttrDict.convert_dict(self.process_operand(parsed))
+            directive = parsed[self.DIRECTIVE_ID]
+            form[self.DIRECTIVE_ID] = AttrDict(
+                {'name': directive['name'], 'parameters': directive['parameters']}
+            )
+            if self.COMMENT_ID in directive:
+                form[self.COMMENT_ID] = ' '.join(directive[self.COMMENT_ID])
+            return form
+        except pp.ParseException:
+            pass
+
+        # 4. Parse instruction
+        try:
+            parsed = self.parse_instruction(line)
+        except pp.ParseException:
+            raise ValueError('Could not parse instruction on line {}: {!r}'.format(
+                line_number, line))
+        for key in (self.INSTRUCTION_ID, self.OPERANDS_ID, self.COMMENT_ID):
+            form[key] = parsed[key]
+        return form
+
+    def parse_instruction(self, instruction):
+        """
+        Parse one instruction line into mnemonic, operands and comment.
+
+        :param str instruction: assembly line holding an instruction
+        :return: AttrDict with the entries INSTRUCTION_ID, OPERANDS_ID and COMMENT_ID
+        """
+        parsed = AttrDict.convert_dict(
+            self.instruction_parser.parseString(instruction, parseAll=True).asDict()
+        )
+        # up to four operands, kept in source order
+        operands = [
+            self.process_operand(parsed[key])
+            for key in ('operand1', 'operand2', 'operand3', 'operand4')
+            if key in parsed
+        ]
+        mnemonic = parsed['mnemonic']
+        comment = None
+        if self.COMMENT_ID in parsed:
+            comment = ' '.join(parsed[self.COMMENT_ID])
+        return AttrDict(
+            {
+                self.INSTRUCTION_ID: mnemonic,
+                self.OPERANDS_ID: operands,
+                self.COMMENT_ID: comment,
+            }
+        )
+
+    def process_operand(self, operand):
+        """
+        Restructure a parsed operand depending on its kind.
+
+        Memory addresses, immediates and labels are handed to their substitute helper
+        (checked in this order); any other operand is returned unchanged.
+        """
+        handlers = (
+            (self.MEMORY_ID, self.substitute_memory_address),
+            (self.IMMEDIATE_ID, self.substitue_immediate),
+            (self.LABEL_ID, self.substitute_label),
+        )
+        for key, handler in handlers:
+            if key in operand:
+                return handler(operand[key])
+        return operand
+
+    def substitute_memory_address(self, memory_address):
+        """
+        Reduce a parsed memory operand to exactly offset, base, index and scale.
+
+        Missing parts become None, a missing scale becomes 1; a given scale is
+        converted to int.
+
+        :return: AttrDict with the restructured address stored under MEMORY_ID
+        """
+        address = AttrDict(
+            {
+                'offset': memory_address.get('offset'),
+                'base': memory_address.get('base'),
+                'index': memory_address.get('index'),
+                'scale': int(memory_address['scale']) if 'scale' in memory_address else 1,
+            }
+        )
+        return AttrDict({self.MEMORY_ID: address})
+
+    def substitute_label(self, label):
+        """Flatten the nested identifier name of a label (in place) and wrap it under LABEL_ID."""
+        # the identifier grammar nests the actual name one level deeper
+        identifier = label['name']
+        label['name'] = identifier['name']
+        return AttrDict({self.LABEL_ID: label})
+
+    def substitue_immediate(self, immediate):
+        """
+        Wrap a parsed immediate under IMMEDIATE_ID unless it is actually an identifier.
+
+        NOTE: the method name is misspelled ('substitue'); it is kept because
+        process_operand calls it under this name.
+        """
+        if 'identifier' not in immediate:
+            # plain immediate, only needs the wrapper
+            return AttrDict({self.IMMEDIATE_ID: immediate})
+        # actually an identifier, change declaration
+        return immediate
+
+    def get_full_reg_name(self, register):
+        """Return the register's 'name' entry; the parsed name is used without changes."""
+        # nothing to do
+        return register['name']
+
+    def normalize_imd(self, imd):
+        """
+        Convert a parsed immediate into an integer.
+
+        :param imd: immediate dictionary, usually holding a 'value' string
+        :return: int for decimal or '0x'-prefixed hexadecimal values; an immediate
+                 without 'value' (an identifier) is returned unchanged
+        """
+        if 'value' not in imd:
+            # identifier
+            return imd
+        text = imd['value']
+        # hex values are returned as plain integers, too
+        base = 16 if text.lower().startswith('0x') else 10
+        return int(text, base)
+
+    def is_reg_dependend_of(self, reg_a, reg_b):
+        """
+        Check whether two registers refer to (parts of) the same physical register.
+
+        :param reg_a: register dictionary with at least a 'name' entry
+        :param reg_b: register dictionary with at least a 'name' entry
+        :return: True if the names are equal or name overlapping views of one register
+                 (e.g. rax/eax/al, xmm1/ymm1, r8/r8d), False otherwise
+        """
+        # item access works for plain dictionaries and AttrDicts alike
+        name_a = reg_a['name']
+        name_b = reg_b['name']
+        # Check if they are the same registers
+        if name_a == name_b:
+            return True
+        # Check vector registers first
+        if self.is_vector_register(reg_a):
+            # Registers in the same vector space share everything after the first
+            # letter; compare case-insensitively like is_vector_register does
+            return (
+                self.is_vector_register(reg_b) and name_a[1:].lower() == name_b[1:].lower()
+            )
+        # Check basic GPRs
+        basic_gprs = [
+            ['RAX', 'EAX', 'AX', 'AH', 'AL'],
+            ['RBX', 'EBX', 'BX', 'BH', 'BL'],
+            ['RCX', 'ECX', 'CX', 'CH', 'CL'],
+            ['RDX', 'EDX', 'DX', 'DH', 'DL'],
+            ['RSP', 'ESP', 'SP', 'SPL'],
+            # the base pointer family aliases just like the stack pointer family
+            ['RBP', 'EBP', 'BP', 'BPL'],
+            ['RSI', 'ESI', 'SI', 'SIL'],
+            ['RDI', 'EDI', 'DI', 'DIL'],
+        ]
+        if self.is_basic_gpr(reg_a):
+            if not self.is_basic_gpr(reg_b):
+                return False
+            upper_a = name_a.upper()
+            upper_b = name_b.upper()
+            return any(upper_a in group and upper_b in group for group in basic_gprs)
+        # Check other GPRs
+        gpr_parser = (
+            pp.CaselessLiteral('R')
+            + pp.Word(pp.nums).setResultsName('id')
+            + pp.Optional(pp.Word('dwbDWB', exact=1))
+        )
+        try:
+            id_a = gpr_parser.parseString(name_a, parseAll=True).asDict()['id']
+            id_b = gpr_parser.parseString(name_b, parseAll=True).asDict()['id']
+        except pp.ParseException:
+            return False
+        # same register number means same register, regardless of the width suffix
+        return id_a == id_b
+
+    def is_basic_gpr(self, register):
+        """Return True if the register name contains no digit at all, False otherwise."""
+        return not any(char.isdigit() for char in register['name'])
+
+    def is_gpr(self, register):
+        """
+        Check whether a register is a general purpose register.
+
+        :return: True if the name has no digits (see is_basic_gpr) or has the form
+                 'r<number>' with an optional d/w/b suffix, False otherwise
+        """
+        if self.is_basic_gpr(register):
+            return True
+        numbered_gpr = (
+            pp.CaselessLiteral('R')
+            + pp.Word(pp.nums).setResultsName('id')
+            + pp.Optional(pp.Word('dwbDWB', exact=1))
+        )
+        try:
+            numbered_gpr.parseString(register['name'], parseAll=True)
+        except pp.ParseException:
+            return False
+        return True
+
+    def is_vector_register(self, register):
+        """Return True if the name has more than two characters and 'mm' at positions 1-2."""
+        name = register['name']
+        return len(name) > 2 and name[1:3].lower() == 'mm'
+
+    def get_reg_type(self, register):
+        """
+        Classify a register.
+
+        :return: 'gpr' for general purpose registers, the lower-cased first three
+                 characters of the name (e.g. 'xmm') for vector registers
+        :raises ValueError: if the register is neither of the two
+        """
+        if self.is_gpr(register):
+            return 'gpr'
+        if self.is_vector_register(register):
+            return register['name'][:3].lower()
+        raise ValueError
--- a/osaca/semantics/init.py
+++ b/osaca/semantics/init.py
@@ -0,0 +1,11 @@
+"""
+Tools for semantic analysis of parser result.
+
+Only the classes below will be exported, so please add new semantic tools to __all__.
+"""
+from .hw_model import MachineModel
+from .kernel_dg import KernelDG
+from .marker_utils import reduce_to_section
+from .semantics_appender import SemanticsAppender, INSTR_FLAGS
+
+__all__ = ['MachineModel', 'KernelDG', 'reduce_to_section', 'SemanticsAppender', 'INSTR_FLAGS']
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python3
+
+import re
+from copy import deepcopy
+from itertools import product
+
+import ruamel.yaml
+from ruamel.yaml.compat import StringIO
+
+from osaca import __version__, utils
+from osaca.parser import ParserX86ATT
+
+
+class MachineModel(object):
+    def __init__(self, arch=None, path_to_yaml=None, isa=None):
+        """
+        Create a machine model.
+
+        Either load the model from the YAML file found for ``arch`` or from ``path_to_yaml``
+        (mutually exclusive), or, if neither is given, create an empty model skeleton for ``isa``.
+
+        :raises ValueError: if both ``arch`` and ``path_to_yaml`` are given or if none of the
+                            three arguments is given
+        """
+        if arch or path_to_yaml:
+            if arch and path_to_yaml:
+                raise ValueError('Only one of arch and path_to_yaml is allowed.')
+            self._path = path_to_yaml
+            self._arch = arch
+            yaml = self._create_yaml_object()
+            if arch:
+                self._arch = arch.lower()
+                model_file = utils.find_file(self._arch + '.yml')
+            else:
+                model_file = self._path
+            with open(model_file, 'r') as f:
+                self._data = yaml.load(f)
+            return
+
+        if not isa:
+            raise ValueError('One of arch, path_to_yaml and isa must be specified')
+        # one empty load throughput entry per combination of addressing components
+        addressing_modes = product(['gpr'], ['gpr', None], ['imd', None], [1, 8])
+        empty_load_throughput = [
+            {'base': b, 'index': i, 'offset': o, 'scale': s, 'port_pressure': []}
+            for b, i, o, s in addressing_modes
+        ]
+        self._data = {
+            'osaca_version': str(__version__),
+            'micro_architecture': None,
+            'arch_code': None,
+            'isa': isa,
+            'ROB_size': None,
+            'retired_uOps_per_cycle': None,
+            'scheduler_size': None,
+            'hidden_loads': None,
+            'load_latency': {},
+            'load_throughput': empty_load_throughput,
+            'ports': [],
+            'port_model_scheme': None,
+            'instruction_forms': [],
+        }
+
+    def __getitem__(self, key):
+        """Return the entry stored under ``key`` in the model data (``model[key]``)."""
+        return self._data[key]
+
+    def __contains__(self, key):
+        """Return True if ``key`` is present in the model data (``key in model``)."""
+        return key in self._data
+
+    ######################################################
+
+    def get_instruction(self, name, operands):
+        """
+        Find and return instruction data from name and operands.
+
+        :param name: instruction mnemonic, compared case-insensitively
+        :param operands: operands to match against the stored instruction forms
+        :returns: the first matching instruction form or ``None`` if ``name`` is ``None`` or no
+                  instruction form matches
+        :raises TypeError: if the operands cannot be matched due to an unexpected type; the
+                           message contains the name and operands that were looked up
+        """
+        if name is None:
+            return None
+        name_upper = name.upper()
+        try:
+            for instruction_form in self._data['instruction_forms']:
+                if instruction_form['name'].upper() == name_upper and self._match_operands(
+                    instruction_form['operands'], operands
+                ):
+                    return instruction_form
+        except TypeError as e:
+            # carry the lookup context in the exception instead of printing it to stdout
+            raise TypeError(
+                'Could not match operands.\nname: {}\noperands: {}'.format(name, operands)
+            ) from e
+        return None
+
+    def average_port_pressure(self, port_pressure):
+        """
+        Construct average port pressure list from instruction data.
+
+        Every ``(cycles, ports)`` pair distributes ``cycles`` evenly over its ``ports``; the
+        result holds one value per port in the order of the model's port list.
+        """
+        known_ports = self._data['ports']
+        pressure = [0.0 for _ in known_ports]
+        for cycles, ports in port_pressure:
+            for port in ports:
+                slot = known_ports.index(port)
+                pressure[slot] = pressure[slot] + cycles / len(ports)
+        return pressure
+
+    def set_instruction(
+        self, name, operands=None, latency=None, port_pressure=None, throughput=None, uops=None
+    ):
+        """
+        Import instruction form information.
+
+        An instruction form already found by :meth:`get_instruction` is overwritten in place,
+        otherwise a new instruction form is appended to the model.
+        """
+        entry = self.get_instruction(name, operands)
+        if entry is None:
+            entry = {}
+            self._data['instruction_forms'].append(entry)
+
+        entry.update(
+            {
+                'name': name,
+                'operands': operands,
+                'latency': latency,
+                'port_pressure': port_pressure,
+                'throughput': throughput,
+                'uops': uops,
+            }
+        )
+
+    def set_instruction_entry(self, entry):
+        """Import an instruction form given as mapping; missing optional keys become None."""
+        optional_keys = ('operands', 'latency', 'port_pressure', 'throughput', 'uops')
+        self.set_instruction(entry['name'], *[entry.get(key) for key in optional_keys])
+
+    def add_port(self, port):
+        """Append ``port`` to the model's port list unless it is already listed."""
+        ports = self._data['ports']
+        if port in ports:
+            return
+        ports.append(port)
+
+    def get_ISA(self):
+        """Return the lower-cased ``isa`` entry of the model data."""
+        return self._data['isa'].lower()
+
+    def get_arch(self):
+        """Return the lower-cased ``arch_code`` entry of the model data."""
+        # NOTE(review): 'arch_code' is None for a model created from ``isa`` only, in which case
+        # this raises AttributeError
+        return self._data['arch_code'].lower()
+
+    def get_ports(self):
+        """Return the model's port list (the stored list itself, not a copy)."""
+        return self._data['ports']
+
+    def has_hidden_loads(self):
+        """Return the model's ``hidden_loads`` entry or False if the model has none."""
+        return self._data.get('hidden_loads', False)
+
+    def get_load_latency(self, reg_type):
+        """Return the ``load_latency`` entry for ``reg_type``; raises KeyError if unknown."""
+        return self._data['load_latency'][reg_type]
+
+    def get_load_throughput(self, memory):
+        """
+        Return the port pressure of the first ``load_throughput`` entry matching ``memory``.
+
+        :returns: the entry's ``port_pressure`` or ``None`` if no entry matches
+        """
+        matching_entries = list(
+            filter(
+                lambda entry: self._match_mem_entries(memory, entry),
+                self._data['load_throughput'],
+            )
+        )
+        return matching_entries[0]['port_pressure'] if matching_entries else None
+
+    def _match_mem_entries(self, mem, i_mem):
+        """
+        Match memory operand ``mem`` against model entry ``i_mem`` with the ISA-specific check.
+
+        Returns ``None`` implicitly if the model's ISA is neither ``aarch64`` nor ``x86``.
+        """
+        isa = self._data['isa'].lower()
+        if isa == 'aarch64':
+            return self._is_AArch64_mem_type(i_mem, mem)
+        if isa == 'x86':
+            return self._is_x86_mem_type(i_mem, mem)
+
+    def get_data_ports(self):
+        """Return all ports whose name consists of digits followed by a single ``D``."""
+        return [port for port in self._data['ports'] if re.match(r'^[0-9]+D$', port)]
+
+    @staticmethod
+    def get_full_instruction_name(instruction_form):
+        """
+        Return a string of the form ``name  class(attr:value,...),class(...)``.
+
+        Every operand is rendered as its ``class`` followed by all its other attributes.
+        """
+        rendered_operands = []
+        for operand in instruction_form['operands']:
+            attributes = ','.join(
+                key + ':' + str(operand[key]) for key in operand if key != 'class'
+            )
+            rendered_operands.append('{}({})'.format(operand['class'], attributes))
+        return '{}  {}'.format(instruction_form['name'], ','.join(rendered_operands))
+
+    @staticmethod
+    def get_isa_for_arch(arch):
+        """
+        Return the ISA (``'x86'`` or ``'aarch64'``) of the given micro-architecture code.
+
+        :param arch: micro-architecture code, case-insensitive
+        :raises ValueError: if the architecture is unknown
+        """
+        x86_archs = (
+            'zen1',
+            'snb',
+            'ivb',
+            'hsw',
+            'bdw',
+            'skl',
+            'skx',
+            'csx',
+            'wsm',
+            'nhm',
+            'kbl',
+            'cnl',
+            'cfl',
+            'zen+',
+        )
+        isa_by_arch = dict.fromkeys(x86_archs, 'x86')
+        isa_by_arch['tx2'] = 'aarch64'
+        arch = arch.lower()
+        isa = isa_by_arch.get(arch)
+        if isa is None:
+            raise ValueError("Unknown architecture {!r}.".format(arch))
+        return isa.lower()
+
+    def dump(self, stream=None):
+        """
+        Dump the machine model as YAML.
+
+        The model data without instruction forms is dumped first, followed by the instruction
+        forms whose ``port_pressure`` lists are emitted in flow style.
+
+        :param stream: stream to write to; if omitted, the YAML text is returned instead
+        :returns: the YAML text if no ``stream`` was given, ``None`` otherwise
+        """
+        # Replace instruction form's port_pressure with styled version for RoundtripDumper
+        formatted_instruction_forms = deepcopy(self._data['instruction_forms'])
+        for instruction_form in formatted_instruction_forms:
+            port_pressure = instruction_form.get('port_pressure')
+            if port_pressure is None:
+                # forms created via set_instruction() without port pressure hold None, which
+                # cannot be converted to a sequence
+                continue
+            cs = ruamel.yaml.comments.CommentedSeq(port_pressure)
+            cs.fa.set_flow_style()
+            instruction_form['port_pressure'] = cs
+
+        # Create YAML object
+        yaml = self._create_yaml_object()
+        return_as_string = not stream
+        if return_as_string:
+            # Create stream object to output string
+            stream = StringIO()
+        yaml.dump({k: v for k, v in self._data.items() if k != 'instruction_forms'}, stream)
+        yaml.dump({'instruction_forms': formatted_instruction_forms}, stream)
+        if return_as_string:
+            return stream.getvalue()
+        return None
+
+    ######################################################
+
+    def _check_for_duplicate(self, name, operands):
+        """Return True if more than one instruction form matches ``name`` and ``operands``."""
+        wanted_name = name.lower()
+        match_count = 0
+        for instruction_form in self._data['instruction_forms']:
+            if instruction_form['name'].lower() != wanted_name:
+                continue
+            if self._match_operands(instruction_form['operands'], operands):
+                match_count += 1
+        return match_count > 1
+
+    def _match_operands(self, i_operands, operands):
+        """
+        Check if all ``operands`` match the operands ``i_operands`` of an instruction form.
+
+        :param i_operands: operand list of an instruction form of the model (or ``None``)
+        :param operands: operand list, a dict holding the list as ``operand_list``, or ``None``
+        :returns: True if both have the same number of operands and all of them match pairwise
+        """
+        if isinstance(operands, dict):
+            operands = operands['operand_list']
+        # set_instruction() stores None for instruction forms without operands; treat it like
+        # an empty operand list instead of failing in len()
+        operands = [] if operands is None else operands
+        i_operands = [] if i_operands is None else i_operands
+        if len(operands) != len(i_operands):
+            return False
+        return all(
+            self._check_operands(i_operand, operand)
+            for i_operand, operand in zip(i_operands, operands)
+        )
+
+    def _check_operands(self, i_operands, operands):
+        """
+        Dispatch the operand check to the implementation for the model's ISA.
+
+        Returns ``None`` implicitly if the ISA is neither ``aarch64`` nor ``x86``.
+        """
+        isa = self._data['isa'].lower()
+        if isa == 'aarch64':
+            return self._check_AArch64_operands(i_operands, operands)
+        if isa == 'x86':
+            return self._check_x86_operands(i_operands, operands)
+
+    def _check_AArch64_operands(self, i_operand, operand):
+        """
+        Check if the AArch64 ``operand`` matches the model operand ``i_operand``.
+
+        ``operand`` is either a model entry itself (it has a ``class`` key) or a parsed operand
+        which is a register, memory address, immediate, identifier or prefetch option.
+        """
+        if 'class' in operand:
+            # compare two DB entries
+            return self._compare_db_entries(i_operand, operand)
+        # register and memory operands need a type-specific comparison
+        if 'register' in operand:
+            return i_operand['class'] == 'register' and self._is_AArch64_reg_type(
+                i_operand, operand['register']
+            )
+        if 'memory' in operand:
+            return i_operand['class'] == 'memory' and self._is_AArch64_mem_type(
+                i_operand, operand['memory']
+            )
+        # immediates are given either directly or nested in an 'immediate' entry
+        nested = operand['immediate'] if 'immediate' in operand else ()
+        for imd_key, imd_type in (('value', 'int'), ('float', 'float'), ('double', 'double')):
+            if imd_key in operand or imd_key in nested:
+                return i_operand['class'] == 'immediate' and i_operand['imd'] == imd_type
+        if 'identifier' in operand or 'identifier' in nested:
+            return i_operand['class'] == 'identifier'
+        # prefetch option
+        if 'prfop' in operand:
+            return i_operand['class'] == 'prfop'
+        # no match
+        return False
+
+    def _check_x86_operands(self, i_operand, operand):
+        """
+        Check if the x86 ``operand`` matches the model operand ``i_operand``.
+
+        ``operand`` is either a model entry itself (it has a ``class`` key) or a parsed operand
+        which is a register, memory address, immediate or identifier.
+
+        :returns: True on match, False otherwise (also for unknown operand kinds)
+        """
+        if 'class' in operand:
+            # compare two DB entries
+            return self._compare_db_entries(i_operand, operand)
+        # register
+        if 'register' in operand:
+            if i_operand['class'] != 'register':
+                return False
+            return self._is_x86_reg_type(i_operand['name'], operand['register'])
+        # memory
+        if 'memory' in operand:
+            if i_operand['class'] != 'memory':
+                return False
+            return self._is_x86_mem_type(i_operand, operand['memory'])
+        # immediate
+        if 'immediate' in operand or 'value' in operand:
+            return i_operand['class'] == 'immediate' and i_operand['imd'] == 'int'
+        # identifier (e.g., labels)
+        if 'identifier' in operand:
+            return i_operand['class'] == 'identifier'
+        # no match; return an explicit bool like the AArch64 counterpart does
+        return False
+
+    def _compare_db_entries(self, operand_1, operand_2):
+        """
+        Compare two model operand entries.
+
+        All attributes of ``operand_1`` except ``source`` and ``destination`` must be present
+        in ``operand_2`` with an equal value.
+        """
+        ignored_keys = ('source', 'destination')
+        for key in [k for k in operand_1 if k not in ignored_keys]:
+            try:
+                other_value = operand_2[key]
+            except KeyError:
+                return False
+            if operand_1[key] != other_value:
+                return False
+        return True
+
+    def _is_AArch64_reg_type(self, i_reg, reg):
+        """
+        Check if register ``reg`` matches the model register entry ``i_reg``.
+
+        The prefixes must be equal; if ``reg`` has a shape, ``i_reg`` must have the same one.
+        """
+        if reg['prefix'] != i_reg['prefix']:
+            return False
+        if 'shape' not in reg:
+            return True
+        return 'shape' in i_reg and reg['shape'] == i_reg['shape']
+
+    def _is_x86_reg_type(self, i_reg_name, reg):
+        """
+        Check if the x86 register ``reg`` is of the register type ``i_reg_name``.
+
+        :param i_reg_name: register type of the model entry, e.g. ``'gpr'`` or ``'xmm'``, or
+                           ``None`` if the entry has no register at this position
+        :param reg: parsed register (mapping with a ``name`` entry) or ``None``
+        :returns: True if the types match, False otherwise
+        """
+        if reg is None:
+            # a missing register (e.g. a memory address without base register) only matches a
+            # model entry without register
+            return i_reg_name is None
+        # differentiate between vector registers (xmm, ymm, zmm) and others (gpr)
+        parser_x86 = ParserX86ATT()
+        if parser_x86.is_vector_register(reg):
+            # is_vector_register() accepts any case, so compare the type case-insensitively too
+            return reg['name'][0:3].lower() == i_reg_name
+        return i_reg_name == 'gpr'
+
+    def _is_AArch64_mem_type(self, i_mem, mem):
+        """
+        Check if the AArch64 memory operand ``mem`` matches the model memory entry ``i_mem``.
+
+        Base, offset, index, scale and pre-/post-indexing are compared one after another and
+        the first mismatch ends the check.
+        """
+        # check base
+        if mem['base']['prefix'] != i_mem['base']:
+            return False
+        # check offset: equal, or an identifier/immediate matching its placeholder
+        offset, i_offset = mem['offset'], i_mem['offset']
+        offset_matches = (
+            offset == i_offset
+            or (offset is not None and 'identifier' in offset and i_offset == 'identifier')
+            or (offset is not None and 'value' in offset and i_offset == 'imd')
+        )
+        if not offset_matches:
+            return False
+        # check index: equal, or a register whose prefix matches the entry
+        index, i_index = mem['index'], i_mem['index']
+        index_matches = index == i_index or (
+            index is not None and 'prefix' in index and index['prefix'] == i_index
+        )
+        if not index_matches:
+            return False
+        # check scale: equal, or both differ from 1
+        if not (mem['scale'] == i_mem['scale'] or (mem['scale'] != 1 and i_mem['scale'] != 1)):
+            return False
+        # check pre- and post-indexing
+        if ('pre_indexed' in mem) != (i_mem['pre-indexed']):
+            return False
+        return ('post_indexed' in mem) == (i_mem['post-indexed'])
+
+    def _is_x86_mem_type(self, i_mem, mem):
+        """
+        Check if the x86 memory operand ``mem`` matches the model memory entry ``i_mem``.
+
+        Base, offset, index and scale are compared one after another and the first mismatch
+        ends the check.
+        """
+        # check base
+        if not self._is_x86_reg_type(i_mem['base'], mem['base']):
+            return False
+        # check offset: equal, an identifier/immediate matching its placeholder, or an
+        # immediate offset of '0' for an entry without offset
+        offset, i_offset = mem['offset'], i_mem['offset']
+        offset_matches = (
+            offset == i_offset
+            or (offset is not None and 'identifier' in offset and i_offset == 'identifier')
+            or (
+                offset is not None
+                and 'value' in offset
+                and (i_offset == 'imd' or (i_offset is None and offset['value'] == '0'))
+            )
+        )
+        if not offset_matches:
+            return False
+        # check index: equal, or a register of the type given in the entry
+        index, i_index = mem['index'], i_mem['index']
+        index_matches = index == i_index or (
+            index is not None and 'name' in index and self._is_x86_reg_type(i_index, index)
+        )
+        if not index_matches:
+            return False
+        # check scale: equal, or both differ from 1
+        return bool(
+            mem['scale'] == i_mem['scale'] or (mem['scale'] != 1 and i_mem['scale'] != 1)
+        )
+
+    def _create_yaml_object(self):
+        """Create the configured ruamel.yaml object used for loading and dumping models."""
+        yaml_obj = ruamel.yaml.YAML()
+        # emit None through the custom representer of this class
+        yaml_obj.representer.add_representer(type(None), self.__represent_none)
+        yaml_obj.default_flow_style = None
+        yaml_obj.width = 120
+        # always report "ignore aliases" so that no YAML anchors/aliases are emitted
+        yaml_obj.representer.ignore_aliases = lambda *args: True
+        return yaml_obj
+
+    def __represent_none(self, yaml_obj, data):
+        """Represent ``None`` as the YAML null scalar ``~``."""
+        return yaml_obj.represent_scalar(u'tag:yaml.org,2002:null', u'~')
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+
+import copy
+from itertools import chain, product
+
+import networkx as nx
+
+from osaca.parser import AttrDict
+from osaca.semantics import MachineModel
+
+
+class KernelDG(nx.DiGraph):
+    def __init__(self, parsed_kernel, parser, hw_model: MachineModel):
+        """
+        Build the dependency graph and the loop-carried dependencies of a kernel.
+
+        :param parsed_kernel: list of instruction forms of the kernel
+        :param parser: parser object providing ``is_reg_dependend_of()``
+        :param hw_model: machine model of the target architecture
+        """
+        # initialize the nx.DiGraph base class so that inherited graph methods are usable on
+        # this object; the kernel's dependency graph itself is kept in ``self.dg``
+        super().__init__()
+        self.kernel = parsed_kernel
+        self.parser = parser
+        self.model = hw_model
+        self.dg = self.create_DG(self.kernel)
+        self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel)
+
+    def create_DG(self, kernel):
+        """
+        Create the dependency graph of ``kernel``.
+
+        Every instruction form becomes a node named by its line number. An instruction form
+        which performs a load without being a load instruction gets an additional load node
+        (line number + 0.1). Edges point to the depending instruction forms found by
+        :meth:`find_depending` and carry the latency as ``latency`` attribute.
+        """
+        dg = nx.DiGraph()
+        for idx, instruction_form in enumerate(kernel):
+            line_no = instruction_form['line_number']
+            dg.add_node(line_no)
+            dg.nodes[line_no]['instruction_form'] = instruction_form
+            # add load as separate node if existent
+            # TODO use INSTR_FLAGS here
+            flags = instruction_form['flags']
+            if 'performs_load' in flags and 'is_load_instruction' not in flags:
+                load_node = line_no + 0.1
+                dg.add_node(load_node)
+                dg.nodes[load_node]['instruction_form'] = instruction_form
+                # the load part of the latency is the weight of the edge load -> instruction
+                load_latency = instruction_form['latency'] - instruction_form['latency_wo_load']
+                dg.add_edge(load_node, line_no, latency=load_latency)
+            # connect to all depending instruction forms further down in the kernel
+            for dep in self.find_depending(instruction_form, kernel[idx + 1:]):
+                if 'latency_wo_load' in instruction_form:
+                    edge_weight = instruction_form['latency_wo_load']
+                else:
+                    edge_weight = instruction_form['latency']
+                dg.add_edge(line_no, dep['line_number'], latency=edge_weight)
+                dg.nodes[dep['line_number']]['instruction_form'] = dep
+        return dg
+
+    def check_for_loopcarried_dep(self, kernel):
+        """
+        Find the loop-carried dependencies of ``kernel``.
+
+        The kernel is duplicated to model a second loop iteration and every path in the
+        dependency graph of the doubled kernel from an instruction to its own copy is taken as
+        loop-carried dependency. As a side effect ``latency_lcd`` is set on the involved
+        instruction forms of ``self.kernel``.
+
+        :returns: dict mapping the line number of the root instruction to a dict with the
+                  root instruction form (``root``) and the instruction forms of the dependency
+                  chain (``dependencies``)
+        """
+        # NOTE(review): kernel[0] below raises IndexError for an empty kernel
+        multiplier = len(kernel) + 1
+        # increase line number for second kernel loop
+        kernel_length = len(kernel)
+        first_line_no = kernel[0].line_number
+        kernel_copy = [AttrDict.convert_dict(d) for d in copy.deepcopy(kernel)]
+        tmp_kernel = kernel + kernel_copy
+        # line numbers of the copy are the original ones times ``multiplier``
+        for i, instruction_form in enumerate(tmp_kernel[kernel_length:]):
+            tmp_kernel[i + kernel_length].line_number = instruction_form.line_number * multiplier
+        # get dependency graph
+        dg = self.create_DG(tmp_kernel)
+
+        # build cyclic loop-carried dependencies
+        # (only integer nodes are used as start nodes, i.e., the X.1 load nodes are skipped)
+        loopcarried_deps = [
+            (node, list(nx.algorithms.simple_paths.all_simple_paths(dg, node, node * multiplier)))
+            for node in dg.nodes
+            if node < first_line_no * multiplier and node == int(node)
+        ]
+        # filter others and create graph
+        # (flatten to one (root, path) tuple per found path)
+        loopcarried_deps = list(
+            chain.from_iterable(
+                [list(product([dep_chain[0]], dep_chain[1])) for dep_chain in loopcarried_deps]
+            )
+        )
+        # adjust line numbers, filter duplicates
+        # and add reference to kernel again
+        loopcarried_deps_dict = {}
+        tmp_list = []
+        for i, dep in enumerate(loopcarried_deps):
+            # keep only the path nodes at or above ``first_line_no * multiplier`` and map them
+            # back to original line numbers
+            nodes = [int(n / multiplier) for n in dep[1] if n >= first_line_no * multiplier]
+            loopcarried_deps[i] = (dep[0], nodes)
+        for dep in loopcarried_deps:
+            # drop a dependency if it is contained in a dependency of another root which also
+            # includes this dependency's root
+            is_subset = False
+            for other_dep in [x for x in loopcarried_deps if x[0] != dep[0]]:
+                if set(dep[1]).issubset(set(other_dep[1])) and dep[0] in other_dep[1]:
+                    is_subset = True
+            if not is_subset:
+                tmp_list.append(dep)
+        loopcarried_deps = tmp_list
+        for dep in loopcarried_deps:
+            nodes = []
+            # reset the LCD latency of all involved instruction forms before accumulating
+            for n in dep[1]:
+                self._get_node_by_lineno(int(n))['latency_lcd'] = 0
+            for n in dep[1]:
+                node = self._get_node_by_lineno(int(n))
+                if int(n) != n and int(n) in dep[1]:
+                    node['latency_lcd'] += node['latency'] - node['latency_wo_load']
+                else:
+                    node['latency_lcd'] += node['latency_wo_load']
+                nodes.append(node)
+            # NOTE(review): several dependencies with the same root overwrite each other here
+            loopcarried_deps_dict[dep[0]] = {
+                'root': self._get_node_by_lineno(dep[0]),
+                'dependencies': nodes,
+            }
+
+        return loopcarried_deps_dict
+
+    def _get_node_by_lineno(self, lineno):
+        """Return the first instruction form of the kernel with line number ``lineno``."""
+        # raises IndexError if no instruction form has this line number
+        return [instr for instr in self.kernel if instr.line_number == lineno][0]
+
+    def get_critical_path(self):
+        """
+        Return the instruction forms on the critical (longest latency) path of the kernel.
+
+        As a side effect ``latency_cp`` is set on every instruction form on the path.
+
+        :raises NotImplementedError: if the dependency graph is cyclic
+        """
+        if not nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
+            # split to DAG
+            raise NotImplementedError('Kernel is cyclic.')
+        longest_path = nx.algorithms.dag.dag_longest_path(self.dg, weight='latency')
+        # reset the CP latency first since load node and instruction share one instruction form
+        for line_number in longest_path:
+            self._get_node_by_lineno(int(line_number))['latency_cp'] = 0
+        # add LD latency to instruction
+        for line_number in longest_path:
+            base_line = int(line_number)
+            node = self._get_node_by_lineno(base_line)
+            is_load_node = line_number != base_line
+            if is_load_node and base_line in longest_path:
+                node['latency_cp'] += self.dg.edges[(line_number, base_line)]['latency']
+            elif (
+                not is_load_node
+                and 'mem_dep' in node
+                and self.dg.has_edge(node['mem_dep']['line_number'], line_number)
+            ):
+                node['latency_cp'] += node['latency']
+            elif 'latency_wo_load' in node:
+                node['latency_cp'] += node['latency_wo_load']
+            else:
+                node['latency_cp'] += node['latency']
+        return [x for x in self.kernel if x['line_number'] in longest_path]
+
+    def get_loopcarried_dependencies(self):
+        """
+        Return the loop-carried dependencies computed at construction time.
+
+        :raises NotImplementedError: if the dependency graph is cyclic
+        """
+        if not nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
+            # split to DAG
+            raise NotImplementedError('Kernel is cyclic.')
+        return self.loopcarried_deps
+
+    def find_depending(self, instruction_form, kernel, include_write=False):
+        """
+        Generate the instruction forms of ``kernel`` which depend on ``instruction_form``.
+
+        For every register written by ``instruction_form`` all instruction forms of ``kernel``
+        reading it are yielded until the register is overwritten. For memory destinations with
+        pre- or post-indexing the same is done for the base register and ``instruction_form``
+        is stored as ``mem_dep`` in the yielded instruction forms.
+
+        :param instruction_form: instruction form to find the dependents of
+        :param kernel: instruction forms to search in
+        :param include_write: if True, also yield the instruction form overwriting the
+                              register; one which reads and overwrites it is yielded twice
+        """
+        if instruction_form.operands is None:
+            return
+        for dst in instruction_form.operands.destination + instruction_form.operands.src_dst:
+            if 'register' in dst:
+                # Check for read of register until overwrite
+                for instr_form in kernel:
+                    if self.is_read(dst.register, instr_form):
+                        yield instr_form
+                        if self.is_written(dst.register, instr_form):
+                            # operand in src_dst list
+                            if include_write:
+                                yield instr_form
+                            break
+                    elif self.is_written(dst.register, instr_form):
+                        if include_write:
+                            yield instr_form
+                        break
+            elif 'memory' in dst:
+                # Check if base register is altered during memory access
+                if 'pre_indexed' in dst.memory or 'post_indexed' in dst.memory:
+                    # Check for read of base register until overwrite
+                    for instr_form in kernel:
+                        if self.is_read(dst.memory.base, instr_form):
+                            instr_form['mem_dep'] = instruction_form
+                            yield instr_form
+                            if self.is_written(dst.memory.base, instr_form):
+                                # operand in src_dst list
+                                if include_write:
+                                    instr_form['mem_dep'] = instruction_form
+                                    yield instr_form
+                                break
+                        elif self.is_written(dst.memory.base, instr_form):
+                            if include_write:
+                                instr_form['mem_dep'] = instruction_form
+                                yield instr_form
+                            break
+
+    def get_dependent_instruction_forms(self, instr_form=None, line_number=None):
+        """
+        Return an iterator over the successor nodes of an instruction in the dependency graph.
+
+        The instruction is identified by ``line_number`` or, if that is not given, by the line
+        number of ``instr_form``. The iterator is empty if the graph has no such node.
+
+        :raises ValueError: if neither ``instr_form`` nor ``line_number`` is given
+        """
+        if not instr_form and not line_number:
+            raise ValueError('Either instruction form or line_number required.')
+        if not line_number:
+            line_number = instr_form['line_number']
+        if not self.dg.has_node(line_number):
+            return iter([])
+        return self.dg.successors(line_number)
+
+    def is_read(self, register, instruction_form):
+        """
+        Check if ``register`` is read by ``instruction_form``.
+
+        A register is read if a source register depends on it or if the base or index register
+        of any memory operand (source or destination) depends on it.
+        """
+        operands = instruction_form.operands
+        if operands is None:
+            return False
+        depends_on = self.parser.is_reg_dependend_of
+        read = False
+        for src in operands.source + operands.src_dst:
+            if 'register' in src:
+                read = depends_on(register, src.register) or read
+            if 'memory' in src:
+                for address_reg in (src.memory.base, src.memory.index):
+                    if address_reg is not None:
+                        read = depends_on(register, address_reg) or read
+        # Check also if read in destination memory address
+        for dst in operands.destination + operands.src_dst:
+            if 'memory' in dst:
+                for address_reg in (dst.memory.base, dst.memory.index):
+                    if address_reg is not None:
+                        read = depends_on(register, address_reg) or read
+        return read
+
+    def is_written(self, register, instruction_form):
+        """
+        Check if ``register`` is written by ``instruction_form``.
+
+        A register is written if a destination register depends on it or if it is the base
+        register of a pre- or post-indexed memory operand (source or destination).
+        """
+        operands = instruction_form.operands
+        if operands is None:
+            return False
+        depends_on = self.parser.is_reg_dependend_of
+        written = False
+        for dst in operands.destination + operands.src_dst:
+            if 'register' in dst:
+                written = depends_on(register, dst.register) or written
+            if 'memory' in dst and (
+                'pre_indexed' in dst.memory or 'post_indexed' in dst.memory
+            ):
+                written = depends_on(register, dst.memory.base) or written
+        # Check also for possible pre- or post-indexing in memory addresses
+        for src in operands.source + operands.src_dst:
+            if 'memory' in src and (
+                'pre_indexed' in src.memory or 'post_indexed' in src.memory
+            ):
+                written = depends_on(register, src.memory.base) or written
+        return written
+
+    def export_graph(self, filepath=None):
+        """
+        Write the dependency graph as DOT file.
+
+        Nodes and edges on the critical path are drawn bold, nodes and edges of loop-carried
+        dependencies are colored, and the edges are labeled with their latency.
+
+        :param filepath: path of the output file, defaults to ``osaca_dg.dot``
+        """
+        # work on a copy so that the styling does not end up in self.dg
+        graph = copy.deepcopy(self.dg)
+        cp = self.get_critical_path()
+        cp_line_numbers = [x['line_number'] for x in cp]
+        lcd = self.get_loopcarried_dependencies()
+        lcd_line_numbers = {}
+        for dep in lcd:
+            lcd_line_numbers[dep] = [x['line_number'] for x in lcd[dep]['dependencies']]
+        # add color scheme
+        graph.graph['node'] = {'colorscheme': 'accent8'}
+        graph.graph['edge'] = {'colorscheme': 'accent8'}
+
+        # create LCD edges
+        # (one back edge per dependency from its last to its first instruction)
+        for dep in lcd_line_numbers:
+            min_line_number = min(lcd_line_numbers[dep])
+            max_line_number = max(lcd_line_numbers[dep])
+            graph.add_edge(max_line_number, min_line_number)
+            graph.edges[max_line_number, min_line_number]['latency'] = [
+                x for x in lcd[dep]['dependencies'] if x['line_number'] == max_line_number
+            ][0]['latency_lcd']
+
+        # add label to edges
+        for e in graph.edges:
+            graph.edges[e]['label'] = graph.edges[e]['latency']
+
+        # add CP values to graph
+        for n in cp:
+            graph.nodes[n['line_number']]['instruction_form']['latency_cp'] = n['latency_cp']
+
+        # color CP and LCD
+        for n in graph.nodes:
+            if n in cp_line_numbers:
+                # graph.nodes[n]['color'] = 1
+                graph.nodes[n]['style'] = 'bold'
+                graph.nodes[n]['penwidth'] = 4
+            for col, dep in enumerate(lcd):
+                if n in lcd_line_numbers[dep]:
+                    if 'style' not in graph.nodes[n]:
+                        graph.nodes[n]['style'] = 'filled'
+                    else:
+                        graph.nodes[n]['style'] += ',filled'
+                    # one color index of the scheme per dependency, starting at 2
+                    graph.nodes[n]['fillcolor'] = 2 + col
+
+        # color edges
+        for e in graph.edges:
+            if (
+                graph.nodes[e[0]]['instruction_form']['line_number'] in cp_line_numbers
+                and graph.nodes[e[1]]['instruction_form']['line_number'] in cp_line_numbers
+                and e[0] < e[1]
+            ):
+                # only draw the edge bold if no other CP instruction lies between its ends
+                bold_edge = True
+                for i in range(e[0] + 1, e[1]):
+                    if i in cp_line_numbers:
+                        bold_edge = False
+                if bold_edge:
+                    graph.edges[e]['style'] = 'bold'
+                    graph.edges[e]['penwidth'] = 3
+            for dep in lcd_line_numbers:
+                if (
+                    graph.nodes[e[0]]['instruction_form']['line_number'] in lcd_line_numbers[dep]
+                    and graph.nodes[e[1]]['instruction_form']['line_number']
+                    in lcd_line_numbers[dep]
+                ):
+                    graph.edges[e]['color'] = graph.nodes[e[1]]['fillcolor']
+
+        # rename node from [idx] to [idx mnemonic] and add shape
+        mapping = {}
+        for n in graph.nodes:
+            if int(n) != n:
+                # non-integer nodes are the separate load nodes
+                mapping[n] = '{}: LOAD'.format(int(n))
+                graph.nodes[n]['fontname'] = 'italic'
+                graph.nodes[n]['fontsize'] = 11.0
+            else:
+                node = graph.nodes[n]['instruction_form']
+                if node['instruction'] is not None:
+                    mapping[n] = '{}: {}'.format(n, node['instruction'])
+                else:
+                    # lines without instruction are named by their kind
+                    label = 'label' if node['label'] else None
+                    label = 'directive' if node['directive'] else label
+                    label = 'comment' if node['comment'] and label is None else label
+                    mapping[n] = '{}: {}'.format(n, label)
+                    graph.nodes[n]['fontname'] = 'italic'
+                    graph.nodes[n]['fontsize'] = 11.0
+                graph.nodes[n]['shape'] = 'rectangle'
+
+        nx.relabel.relabel_nodes(graph, mapping, copy=False)
+        if filepath:
+            nx.drawing.nx_agraph.write_dot(graph, filepath)
+        else:
+            nx.drawing.nx_agraph.write_dot(graph, 'osaca_dg.dot')
--- a/osaca/semantics/marker_utils.py
+++ b/osaca/semantics/marker_utils.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+from osaca.parser import ParserAArch64v81, ParserX86ATT
+
+
+def reduce_to_section(kernel, isa):
+    """
+    Cut a parsed kernel down to the lines enclosed by the start and the end marker.
+
+    :param kernel: list of parsed lines that is searched for the markers
+    :param str isa: ISA of the kernel, either 'x86' or 'aarch64' (case-insensitive)
+    :returns: slice of ``kernel`` from the first line after the start marker up to, but not
+        including, the line of the end marker
+    :raises ValueError: if ``isa`` is not supported
+    :raises LookupError: if the start or the end marker could not be found
+    """
+    isa_name = isa.lower()
+    if isa_name not in ('x86', 'aarch64'):
+        raise ValueError('ISA not supported.')
+    # pick the marker search that belongs to the ISA
+    finder = find_marked_kernel_x86ATT if isa_name == 'x86' else find_marked_kernel_AArch64
+    first, last = finder(kernel)
+    if first == -1:
+        raise LookupError('Could not find START MARKER. Make sure it is inserted!')
+    if last == -1:
+        raise LookupError('Could not find END MARKER. Make sure it is inserted!')
+    return kernel[first:last]
+
+
+def find_marked_kernel_AArch64(lines):
+    """
+    Find the kernel markers in parsed AArch64 assembly.
+
+    A marker is a ``mov`` of the immediate 111 (start) or 222 (end) into the register with
+    the full name ``x1``, followed by ``byte`` directive(s) starting with 213, 3, 32, 31.
+
+    :param lines: list of parsed assembly lines
+    :returns: tuple ``(start, end)`` as returned by ``find_marked_kernel``
+    """
+    return find_marked_kernel(
+        lines,
+        parser=ParserAArch64v81(),
+        mov_instr=['mov'],
+        mov_reg='x1',
+        mov_vals=[111, 222],
+        nop_bytes=['213', '3', '32', '31'],
+        reverse=True,
+    )
+
+
+def find_marked_kernel_x86ATT(lines):
+    """
+    Find the kernel markers in parsed x86 AT&T assembly.
+
+    A marker is a ``mov``/``movl`` of the immediate 111 (start) or 222 (end) into the register
+    with the full name ``ebx``, followed by ``byte`` directive(s) starting with 100, 103, 144.
+
+    :param lines: list of parsed assembly lines
+    :returns: tuple ``(start, end)`` as returned by ``find_marked_kernel``
+    """
+    return find_marked_kernel(
+        lines,
+        parser=ParserX86ATT(),
+        mov_instr=['mov', 'movl'],
+        mov_reg='ebx',
+        mov_vals=[111, 222],
+        nop_bytes=['100', '103', '144'],
+    )
+
+
+def find_marked_kernel(lines, parser, mov_instr, mov_reg, mov_vals, nop_bytes, reverse=False):
+    """
+    Find the start and the end marker of a kernel in a list of parsed assembly lines.
+
+    A marker consists of a move of an immediate into ``mov_reg`` which is directly followed by
+    ``byte`` directive(s) whose parameters begin with ``nop_bytes``. The immediate
+    ``mov_vals[0]`` identifies the start marker, ``mov_vals[1]`` the end marker.
+
+    :param lines: parsed lines offering the attributes ``instruction``, ``operands`` and
+        ``directive``
+    :param parser: object providing ``normalize_imd()`` and ``get_full_reg_name()``
+    :param mov_instr: mnemonics accepted as move instruction of a marker
+    :param mov_reg: full name of the register the marker value is moved into
+    :param mov_vals: sequence ``[start_value, end_value]`` of the marker immediates
+    :param nop_bytes: byte values expected right after the move
+    :param reverse: if ``True`` operand 0 is the destination and operand 1 the source,
+        otherwise it is the other way round
+    :returns: tuple ``(index_start, index_end)``; ``index_start`` is the index of the first
+        line after the start marker, ``index_end`` the index of the move of the end marker.
+        An index is -1 if the corresponding marker was not found.
+    """
+    index_start = -1
+    index_end = -1
+    src_idx, dst_idx = (1, 0) if reverse else (0, 1)
+    for i, line in enumerate(lines):
+        try:
+            # a marker needs a following line, so a move in the very last line can never
+            # start one (and must not be looked up beyond the end of the list)
+            if (
+                line.instruction in mov_instr
+                and i + 1 < len(lines)
+                and lines[i + 1].directive is not None
+            ):
+                source = line.operands[src_idx]
+                destination = line.operands[dst_idx]
+                if 'immediate' in source:
+                    value = parser.normalize_imd(source.immediate)
+                    is_start = value == mov_vals[0]
+                    # instruction pair matches, check for operands
+                    if (
+                        (is_start or value == mov_vals[1])
+                        and 'register' in destination
+                        and parser.get_full_reg_name(destination.register) == mov_reg
+                    ):
+                        # operands of the move match, check the bytes behind it
+                        match, line_count = match_bytes(lines, i + 1, nop_bytes)
+                        if match and is_start:
+                            # first line after the marker
+                            index_start = i + 1 + line_count
+                        elif match:
+                            # line of the marker itself
+                            index_end = i
+        except TypeError:
+            # best effort: lines that cannot be inspected are printed and skipped
+            print(i, line)
+        if index_start != -1 and index_end != -1:
+            break
+    return index_start, index_end
+
+
+def match_bytes(lines, index, byte_list):
+    """
+    Check whether the ``byte`` directives starting at ``index`` begin with ``byte_list``.
+
+    The bytes may be given in one line or be spread over several consecutive lines.
+
+    :param lines: parsed lines offering the attribute ``directive``
+    :param index: index of the first line to inspect
+    :param byte_list: list of byte values to look for
+    :returns: ``(True, n)`` with ``n`` being the number of consecutive ``byte`` directive
+        lines if the collected bytes start with ``byte_list``, ``(False, -1)`` otherwise
+    """
+    collected = []
+    cursor = index
+    # gather the parameters of all directly consecutive byte directives
+    while cursor < len(lines):
+        directive = lines[cursor].directive
+        if directive is None or directive.name != 'byte':
+            break
+        collected += directive.parameters
+        cursor += 1
+    if collected[: len(byte_list)] != byte_list:
+        return False, -1
+    return True, cursor - index
--- a/osaca/semantics/semantics_appender.py
+++ b/osaca/semantics/semantics_appender.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+
+import warnings
+from functools import reduce
+
+from osaca import utils
+from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
+from osaca.semantics import MachineModel
+
+
+class INSTR_FLAGS:
+    """
+    String constants stored in the ``flags`` list of an instruction form to mark unknown
+    values or special instructions.
+    """
+
+    # memory accesses derived from the operands of the instruction form
+    HAS_LD = 'performs_load'
+    HAS_ST = 'performs_store'
+    # instruction form with a load that was found in the machine model
+    LD = 'is_load_instruction'
+    # load whose pressure on the data ports has been set to zero
+    HIDDEN_LD = 'hidden_load'
+    # throughput respectively latency is not known
+    TP_UNKWN = 'tp_unknown'
+    LT_UNKWN = 'lt_unknown'
+    # port pressure is zero on all ports
+    NOT_BOUND = 'not_bound'
+
+
+class SemanticsAppender(object):
+    def __init__(self, machine_model: MachineModel, path_to_yaml=None):
+        """
+        Set up the appender for the ISA of the given machine model.
+
+        :param machine_model: machine model used to look up instruction data
+        :param path_to_yaml: not used by this constructor
+        """
+        self._machine_model = machine_model
+        self._isa = machine_model.get_ISA().lower()
+        # the ISA description is loaded from 'isa/<isa>.yml'
+        isa_yaml = utils.find_file('isa/' + self._isa + '.yml')
+        self._isa_model = MachineModel(path_to_yaml=isa_yaml)
+        parser_class = {'x86': ParserX86ATT, 'aarch64': ParserAArch64v81}.get(self._isa)
+        # NOTE(review): for any other ISA ``self._parser`` is not set at all
+        if parser_class is not None:
+            self._parser = parser_class()
+
+    # SUMMARY FUNCTION
+    def add_semantics(self, kernel):
+        """
+        Annotate all instruction forms of a kernel in place.
+
+        Every form first gets its operands classified and then its throughput and latency
+        assigned. If the machine model reports hidden loads, they are marked afterwards.
+
+        :param kernel: iterable of instruction forms
+        """
+        for form in kernel:
+            self.assign_src_dst(form)
+            self.assign_tp_lt(form)
+        if not self._machine_model.has_hidden_loads():
+            return
+        self.set_hidden_loads(kernel)
+
+    def set_hidden_loads(self, kernel):
+        """
+        Mark loads of a kernel as hidden and remove their pressure on the data ports.
+
+        Instruction forms that load and store at once are not considered. If there are at
+        most as many loads as stores, every load gets hidden. Otherwise one load is hidden per
+        store, namely the not yet hidden load with the smallest line number distance to it
+        (the lower line number wins a tie).
+
+        :param kernel: list of instruction forms providing ``flags``, ``line_number`` and
+            ``port_pressure``; the affected forms are modified in place
+        """
+        with_load = [form for form in kernel if INSTR_FLAGS.HAS_LD in form['flags']]
+        with_store = [form for form in kernel if INSTR_FLAGS.HAS_ST in form['flags']]
+        # line numbers of instruction forms doing both, load and store
+        load_lines = set(form['line_number'] for form in with_load)
+        store_lines = set(form['line_number'] for form in with_store)
+        shared_ldst = list(load_lines.intersection(store_lines))
+        loads = [form for form in with_load if form['line_number'] not in shared_ldst]
+        stores = [form for form in with_store if form['line_number'] not in shared_ldst]
+
+        def hide(load):
+            load['flags'] += [INSTR_FLAGS.HIDDEN_LD]
+            load['port_pressure'] = self._nullify_data_ports(load['port_pressure'])
+
+        if len(stores) == 0 or len(loads) == 0:
+            # nothing to do
+            return
+        if len(loads) <= len(stores):
+            for load in loads:
+                hide(load)
+            return
+        for store in stores:
+            # (distance, line number) of the closest load that is still visible
+            _, closest_line = min(
+                [
+                    (abs(cand['line_number'] - store['line_number']), cand['line_number'])
+                    for cand in loads
+                    if INSTR_FLAGS.HIDDEN_LD not in cand['flags']
+                ]
+            )
+            hide([form for form in kernel if form['line_number'] == closest_line][0])
+
+    # get parser result and assign throughput and latency value to instruction form
+    # mark instruction form with semantic flags
+    def assign_tp_lt(self, instruction_form):
+        """
+        Assign throughput, latency and port pressure to an instruction form and flag it.
+
+        The keys ``port_pressure``, ``throughput``, ``latency``, ``latency_wo_load``,
+        ``latency_cp`` and ``latency_lcd`` are set and ``flags`` is created or extended.
+
+        - Forms without instruction (label, comment, ...) get zero for all values.
+        - Values of instructions found in the machine model are taken from there; a missing
+          throughput or latency is assumed to be 0 and flagged as unknown.
+        - For instructions that are not found but perform a load, the entry of the
+          register-only variant is combined with the load throughput and latency.
+        - Everything else is assumed to be 0 and flagged as unknown.
+
+        :param instruction_form: instruction form to annotate; modified in place
+        """
+        flags = []
+        port_number = len(self._machine_model['ports'])
+        if instruction_form['instruction'] is None:
+            # No instruction (label, comment, ...) --> ignore
+            throughput = 0.0
+            latency = 0.0
+            latency_wo_load = latency
+            instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
+        else:
+            # 'flags' is optional (see the end of this method), e.g., it is not necessarily
+            # present for instructions without operands, so do not index it blindly
+            has_load = (
+                'flags' in instruction_form and INSTR_FLAGS.HAS_LD in instruction_form['flags']
+            )
+            instruction_data = self._machine_model.get_instruction(
+                instruction_form['instruction'], instruction_form['operands']
+            )
+            if instruction_data:
+                # instruction form in DB
+                throughput = instruction_data['throughput']
+                port_pressure = self._machine_model.average_port_pressure(
+                    instruction_data['port_pressure']
+                )
+                # explicit check instead of assert statements, which are dropped with -O
+                if isinstance(port_pressure, list) and len(port_pressure) == port_number:
+                    instruction_form['port_pressure'] = port_pressure
+                    if sum(port_pressure) == 0 and throughput is not None:
+                        # port pressure on all ports 0 --> not bound to a port
+                        flags.append(INSTR_FLAGS.NOT_BOUND)
+                else:
+                    warnings.warn(
+                        'Port pressure could not be imported correctly from database. '
+                        + 'Please check entry for:\n {}'.format(instruction_form)
+                    )
+                    instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
+                    flags.append(INSTR_FLAGS.TP_UNKWN)
+                if throughput is None:
+                    # assume 0 cy and mark as unknown
+                    throughput = 0.0
+                    flags.append(INSTR_FLAGS.TP_UNKWN)
+                latency = instruction_data['latency']
+                latency_wo_load = latency
+                if latency is None:
+                    # assume 0 cy and mark as unknown
+                    latency = 0.0
+                    latency_wo_load = latency
+                    flags.append(INSTR_FLAGS.LT_UNKWN)
+                if has_load:
+                    flags.append(INSTR_FLAGS.LD)
+            else:
+                # instruction could not be found in DB
+                assign_unknown = True
+                # check for equivalent register-operands DB entry if LD
+                if has_load:
+                    # --> combine LD and reg form of instruction form
+                    operands = self.substitute_mem_address(instruction_form['operands'])
+                    instruction_data_reg = self._machine_model.get_instruction(
+                        instruction_form['instruction'], operands
+                    )
+                    if instruction_data_reg:
+                        assign_unknown = False
+                        reg_types = [
+                            self._parser.get_reg_type(op['register'])
+                            for op in operands['operand_list']
+                            if 'register' in op
+                        ]
+                        # port pressure caused by loading the first memory source operand
+                        load_port_pressure = self._machine_model.average_port_pressure(
+                            self._machine_model.get_load_throughput(
+                                [
+                                    x['memory']
+                                    for x in instruction_form['operands']['source']
+                                    if 'memory' in x
+                                ][0]
+                            )
+                        )
+                        if 'load_throughput_multiplier' in self._machine_model:
+                            multiplier = self._machine_model['load_throughput_multiplier'][
+                                reg_types[0]
+                            ]
+                            load_port_pressure = [pp * multiplier for pp in load_port_pressure]
+                        throughput = max(
+                            max(load_port_pressure), instruction_data_reg['throughput']
+                        )
+                        latency = (
+                            self._machine_model.get_load_latency(reg_types[0])
+                            + instruction_data_reg['latency']
+                        )
+                        latency_wo_load = instruction_data_reg['latency']
+                        # port-wise sum of the load and the register form pressure
+                        instruction_form['port_pressure'] = [
+                            sum(x)
+                            for x in zip(
+                                load_port_pressure,
+                                self._machine_model.average_port_pressure(
+                                    instruction_data_reg['port_pressure']
+                                ),
+                            )
+                        ]
+                if assign_unknown:
+                    # --> mark as unknown and assume 0 cy for latency/throughput
+                    throughput = 0.0
+                    latency = 0.0
+                    latency_wo_load = latency
+                    instruction_form['port_pressure'] = [0.0 for i in range(port_number)]
+                    flags += [INSTR_FLAGS.TP_UNKWN, INSTR_FLAGS.LT_UNKWN]
+        # remove duplicate flags
+        flags = list(set(flags))
+        if 'flags' not in instruction_form:
+            instruction_form['flags'] = flags
+        else:
+            instruction_form['flags'] += flags
+        instruction_form['throughput'] = throughput
+        instruction_form['latency'] = latency
+        instruction_form['latency_wo_load'] = latency_wo_load
+        # for later CP and loop-carried dependency analysis
+        instruction_form['latency_cp'] = 0
+        instruction_form['latency_lcd'] = 0
+
+    def substitute_mem_address(self, operands):
+        """
+        Replace all memory operands by a register operand of the instruction's register type.
+
+        The register type is taken from the first register operand in
+        ``operands['operand_list']``. A warning is issued if the register operands do not
+        share one register type.
+
+        :param operands: operand dict with the keys ``source``, ``destination``, ``src_dst``
+            and ``operand_list``
+        :returns: new dict with the same four keys; operands without memory reference are
+            passed through unchanged
+        """
+        regs = [op for op in operands['operand_list'] if 'register' in op]
+        if (
+            len(regs) > 1
+            and len(set([self._parser.get_reg_type(x['register']) for x in regs])) != 1
+        ):
+            warnings.warn('Load type could not be identified clearly.')
+        reg_type = self._parser.get_reg_type(regs[0]['register'])
+
+        def substitute(operand_group):
+            # swap the memory operands of one group for registers
+            return [
+                self.convert_mem_to_reg(operand, reg_type) if 'memory' in operand else operand
+                for operand in operand_group
+            ]
+
+        # every group is derived from its own entry, 'src_dst' included
+        return {
+            'source': substitute(operands['source']),
+            'destination': substitute(operands['destination']),
+            'src_dst': substitute(operands['src_dst']),
+            'operand_list': substitute(operands['operand_list']),
+        }
+
+    def convert_mem_to_reg(self, memory, reg_type, reg_id='0'):
+        """
+        Create the register operand that stands in for a memory operand.
+
+        :param memory: memory operand to replace; its content is not inspected
+        :param reg_type: register type, used as name prefix (x86) or as ``prefix`` (AArch64)
+        :param reg_id: register number as string
+        :returns: dict with the single key ``register``
+        """
+        if self._isa == 'x86':
+            description = {'name': reg_type + reg_id}
+        elif self._isa == 'aarch64':
+            description = {'prefix': reg_type, 'name': reg_id}
+        return {'register': description}
+
+    # get parser result and assign operands to
+    # - source
+    # - destination
+    # - source/destination
+    def assign_src_dst(self, instruction_form):
+        """
+        Classify the operands of an instruction form and flag its memory accesses.
+
+        The operand roles are taken from the ISA model if it has an entry for the instruction
+        form. Otherwise the regular operand order of the ISA is applied (one destination,
+        all others source). Afterwards ``instruction_form['operands']`` holds the keys
+        ``source``, ``destination``, ``src_dst`` and ``operand_list`` (the original operands)
+        and ``instruction_form['flags']`` contains the load/store flags.
+
+        :param instruction_form: instruction form to annotate; modified in place
+        """
+        # provide the flag list for every form, also for those without operands, so that it
+        # can be relied on after this method
+        if 'flags' not in instruction_form:
+            instruction_form['flags'] = []
+        # if the instruction form doesn't have operands, there's nothing else to do
+        if instruction_form['operands'] is None:
+            return
+        operands = instruction_form['operands']
+        # check if instruction form is in ISA yaml, otherwise apply standard operand assignment
+        # (one dest, others source)
+        isa_data = self._isa_model.get_instruction(instruction_form['instruction'], operands)
+        op_dict = {'source': [], 'destination': [], 'src_dst': []}
+        if isa_data is None:
+            # no irregular operand structure, apply default
+            op_dict['source'] = self._get_regular_source_operands(instruction_form)
+            op_dict['destination'] = self._get_regular_destination_operands(instruction_form)
+        else:
+            # load src/dst structure from isa_data
+            for i, op in enumerate(isa_data['operands']):
+                if op['source'] and op['destination']:
+                    op_dict['src_dst'].append(operands[i])
+                elif op['source']:
+                    op_dict['source'].append(operands[i])
+                elif op['destination']:
+                    op_dict['destination'].append(operands[i])
+        # store operand list in dict and reassign operand key/value pair
+        op_dict['operand_list'] = operands
+        instruction_form['operands'] = AttrDict.convert_dict(op_dict)
+        # assign LD/ST flags
+        if self._has_load(instruction_form):
+            instruction_form['flags'] += [INSTR_FLAGS.HAS_LD]
+        if self._has_store(instruction_form):
+            instruction_form['flags'] += [INSTR_FLAGS.HAS_ST]
+
+    def _nullify_data_ports(self, port_pressure):
+        """
+        Set the pressure on all data ports of the machine model to zero.
+
+        :param port_pressure: list of port pressures, ordered like the ports of the machine
+            model; modified in place
+        :returns: the modified ``port_pressure`` list
+        """
+        for data_port in self._machine_model.get_data_ports():
+            # position of the data port within the port list
+            position = self._machine_model.get_ports().index(data_port)
+            port_pressure[position] = 0.0
+        return port_pressure
+
+    def _has_load(self, instruction_form):
+        """
+        Tell whether an instruction form reads from memory.
+
+        :param instruction_form: instruction form with classified operands
+        :returns: ``True`` if any source or source/destination operand is a memory operand
+        """
+        operands = instruction_form['operands']
+        read_operands = operands['source'] + operands['src_dst']
+        return any('memory' in operand for operand in read_operands)
+
+    def _has_store(self, instruction_form):
+        """
+        Tell whether an instruction form writes to memory.
+
+        :param instruction_form: instruction form with classified operands
+        :returns: ``True`` if any destination or source/destination operand is a memory
+            operand
+        """
+        operands = instruction_form['operands']
+        written_operands = operands['destination'] + operands['src_dst']
+        return any('memory' in operand for operand in written_operands)
+
+    def _get_regular_source_operands(self, instruction_form):
+        """
+        Return the source operands according to the regular operand order of the ISA.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: list of source operands, or ``None`` for an ISA other than x86 and AArch64
+        """
+        handlers = {
+            'x86': self._get_regular_source_x86ATT,
+            'aarch64': self._get_regular_source_AArch64,
+        }
+        handler = handlers.get(self._isa)
+        return handler(instruction_form) if handler is not None else None
+
+    def _get_regular_destination_operands(self, instruction_form):
+        """
+        Return the destination operand according to the regular operand order of the ISA.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: sequence with the destination operand, or ``None`` for an ISA other than
+            x86 and AArch64
+        """
+        handlers = {
+            'x86': self._get_regular_destination_x86ATT,
+            'aarch64': self._get_regular_destination_AArch64,
+        }
+        handler = handlers.get(self._isa)
+        return handler(instruction_form) if handler is not None else None
+
+    def _get_regular_source_x86ATT(self, instruction_form):
+        """
+        Return the regular source operands of an x86 AT&T instruction form.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: new list with all but the last operand
+        """
+        operands = instruction_form['operands']
+        return list(operands[:-1])
+
+    def _get_regular_source_AArch64(self, instruction_form):
+        """
+        Return the regular source operands of an AArch64 instruction form.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: new list with all but the first operand
+        """
+        operands = instruction_form['operands']
+        return list(operands[1:])
+
+    def _get_regular_destination_x86ATT(self, instruction_form):
+        """
+        Return the regular destination operand of an x86 AT&T instruction form.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: slice holding the last operand (empty if there are no operands)
+        """
+        operands = instruction_form['operands']
+        return operands[-1:]
+
+    def _get_regular_destination_AArch64(self, instruction_form):
+        """
+        Return the regular destination operand of an AArch64 instruction form.
+
+        :param instruction_form: instruction form whose ``operands`` entry is a sequence
+        :returns: slice holding the first operand (empty if there are no operands)
+        """
+        operands = instruction_form['operands']
+        return operands[:1]
+
+    @staticmethod
+    def get_throughput_sum(kernel):
+        """
+        Sum up the port pressure of all instruction forms of a kernel for each port.
+
+        :param kernel: iterable of instruction forms providing a ``port_pressure`` list
+        :returns: list with the summed pressure per port, rounded to two decimals; an empty
+            list if the kernel contains no instruction forms
+        """
+        pressures = [instr['port_pressure'] for instr in kernel]
+        if not pressures:
+            # reduce() without initial value raises a TypeError for an empty sequence
+            return []
+        tp_sum = reduce((lambda x, y: [sum(z) for z in zip(x, y)]), pressures)
+        return [round(x, 2) for x in tp_sum]
--- a/osaca/testcase.py
+++ b/osaca/testcase.py
@@ -1,410 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-from subprocess import call
-from math import ceil
-
-from osaca.param import Register, MemAddr, Parameter
-#from param import Register, MemAddr, Parameter
-
-
-class Testcase(object):
-    # ------------------Constant variables--------------------------
-    # Lookup tables for regs
-    gprs64 = ['rax', 'rbx', 'rcx', 'rdx', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15']
-    gprs32 = ['eax', 'ebx', 'ecx', 'edx', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d']
-    gprs16 = ['ax', 'bx', 'cx', 'dx', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w']
-    gprs8 = ['al', 'bl', 'cl', 'dl', 'r9l', 'r10l', 'r11l', 'r12l', 'r13l', 'r14l', 'r15l']
-    fpus = ['st0', 'st1', 'st2', 'st3', 'st4', 'st5', 'st6', 'st7']
-    mmxs = ['mm0', 'mm1', 'mm2', 'mm3', 'mm4', 'mm5', 'mm6', 'mm7']
-    ks = ['k0', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7']
-    bnds = ['bnd0', 'bnd1', 'bnd2', 'bnd3', 'bnd4', 'bnd5', 'bnd6', 'bnd7']
-    xmms = ['xmm0', 'xmm1', 'xmm2', 'xmm3', 'xmm4', 'xmm5', 'xmm6', 'xmm7', 'xmm8', 'xmm9',
-            'xmm10', 'xmm11', 'xmm12', 'xmm13', 'xmm14', 'xmm15']
-    ymms = ['ymm0', 'ymm1', 'ymm2', 'ymm3', 'ymm4', 'ymm5', 'ymm6', 'ymm7', 'ymm8', 'ymm9',
-            'ymm10', 'ymm11', 'ymm12', 'ymm13', 'ymm14', 'ymm15']
-    zmms = ['zmm0', 'zmm1', 'zmm2', 'zmm3', 'zmm4', 'zmm5', 'zmm6', 'zmm7', 'zmm8', 'zmm9',
-            'zmm10', 'zmm11', 'zmm12', 'zmm13', 'zmm14', 'zmm15']
-    # Lookup table for memory
-    mems = ['[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]', '[rip+PI]',
-            '[rip+PI]']
-    # Lookup table for immediates
-    imds = ['1', '2', '13', '22', '8', '78', '159', '222', '3', '9', '5', '55', '173', '317',
-            '254', '255']
-    # TODO Differentiate between AVX512 (with additional xmm16-31) and the rest
-    #       ...
-    #       ...
-    # end TODO
-
-    ops = {'gpr64': gprs64, 'gpr32': gprs32, 'gpr16': gprs16, 'gpr8': gprs8, 'fpu': fpus,
-           'mmx': mmxs, 'k': ks, 'bnd': bnds, 'xmm': xmms, 'ymm': ymms, 'zmm': zmms, 'mem': mems,
-           'imd': imds}
-
-    # Create Single Precision 1.0
-    sp1 = ('\t\t# create SP 1.0\n'
-           '\t\tvpcmpeqw xmm0, xmm0, xmm0\n'
-           '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n'
-           '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading '
-           'mantissa bit is zero\n'
-           '\t\t# copy SP 1.0\n')
-    # Create Double Precision 1.0
-    dp1 = ('\t\t# create DP 1.0\n'
-           '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n'
-           '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n'
-           '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading '
-           'mantissa bit is zero\n')
-    # Create epilogue
-    done = ('done:\n'
-            '\t\tmov\trsp, rbp\n'
-            '\t\tpop\trbp\n'
-            '\t\tret\n'
-            '.size latency, .-latency')
-    # ----------------------------------------------------------------
-
-    # Constructor
-    def __init__(self, _mnemonic, _param_list, _num_instr='32'):
-        self.instr = _mnemonic.lower()
-        self.param_list = _param_list
-        # num_instr must be an even number
-        self.num_instr = str(ceil(int(_num_instr)/2)*2)
-        # Check for the number of operands and initialise the GPRs if necessary
-        self.op_a, self.op_b, self.op_c, self.gprPush, self.gprPop, self.zeroGPR, self.copy = \
-            self.__define_operands()
-        self.num_operands = len(self.param_list)
-
-        # Create asm header
-        self.def_instr, self.ninstr, self.init, self.expand = self.__define_header()
-        # Create latency and throughput loop
-        self.loop_lat = self.__define_loop_lat()
-        self.loop_thrpt = self.__define_loop_thrpt()
-        # Create extension for testcase name
-        sep0 = '-' if (self.num_operands > 0) else ''
-        sep1 = '_' if (self.num_operands > 1) else ''
-        sep2 = '_' if (self.num_operands > 2) else ''
-        self.extension = (sep0 + (self.op_a if ('gpr' not in self.op_a) else 'r' + self.op_a[3:])
-                          + sep1 + (self.op_b if ('gpr' not in self.op_b) else 'r' + self.op_b[3:])
-                          + sep2 + (self.op_c if ('gpr' not in self.op_c) else 'r' + self.op_c[3:]))
-
-    def write_testcase(self, tp=True, lt=True):
-        """
-        Write testcase for class attributes in a file.
-
-        Parameters
-        ----------
-        tp : bool
-            Controls if throughput testcase should be written
-            (default True)
-
-        lt : bool
-            Controls if latency testcase should be written
-            (default True)
-        """
-        osaca_dir = os.path.expanduser('~') + '/.osaca/'
-        if lt:
-            # Write latency file
-            call(['mkdir', '-p', osaca_dir + 'benchmarks'])
-            f = open(osaca_dir + 'benchmarks/'+self.instr+self.extension+'.S', 'w')
-            data = (self.def_instr + self.ninstr + self.init + self.dp1 + self.expand + self.gprPush
-                    + self.zeroGPR + self.copy + self.loop_lat + self.gprPop + self.done)
-            f.write(data)
-            f.close()
-        if tp:
-            # Write throughput file
-            call(['mkdir', '-p', osaca_dir + 'benchmarks'])
-            f = open(osaca_dir + 'benchmarks/' + self.instr + self.extension
-                     + '-TP.S', 'w')
-            data = (self.def_instr + self.ninstr + self.init + self.dp1 + self.expand + self.gprPush
-                    + self.zeroGPR + self.copy + self.loop_thrpt + self.gprPop + self.done)
-            f.write(data)
-            f.close()
-
-    # Check operands
-    def __define_operands(self):
-        """
-        Check for the number of operands and initialise the GPRs if necessary.
-
-        Returns
-        -------
-        (str, str, str, str, str, str)
-            String tuple containing types of operands and if needed push/pop operations, the
-            initialisation of general purpose regs and the copy if registers.
-        """
-        operands = self.param_list
-        op_a, op_b, op_c = ('', '', '')
-        gpr_push, gpr_pop, zero_gpr = ('', '', '')
-        if len(operands) > 0:
-            if isinstance(operands[0], Register):
-                op_a = operands[0].reg_type.lower()
-            elif isinstance(operands[0], MemAddr):
-                op_a = 'mem'
-            elif isinstance(operands[0], Parameter) and str(operands[0]) == 'IMD':
-                op_a = 'imd'
-            if op_a == 'gpr':
-                gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
-                op_a += str(operands[0].size)
-        if len(operands) > 1:
-            if isinstance(operands[1], Register):
-                op_b = operands[1].reg_type.lower()
-            elif isinstance(operands[1], MemAddr):
-                op_b = 'mem'
-            elif isinstance(operands[1], Parameter) and str(operands[1]) == 'IMD':
-                op_b = 'imd'
-            if op_b == 'gpr':
-                op_b += str(operands[1].size)
-                if 'gpr' not in op_a:
-                    gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
-        if len(operands) == 3:
-            if isinstance(operands[2], Register):
-                op_c = operands[2].reg_type.lower()
-            elif isinstance(operands[2], MemAddr):
-                op_c = 'mem'
-            elif isinstance(operands[2], Parameter) and str(operands[2]) == 'IMD':
-                op_c = 'imd'
-            if op_c == 'gpr':
-                op_c += str(operands[2].size)
-                if ('gpr' not in op_a) and ('gpr' not in op_b):
-                    gpr_push, gpr_pop, zero_gpr = self.__initialise_gprs()
-        if len(operands) == 1 and isinstance(operands[0], Register):
-            copy = self.__copy_regs(operands[0])
-        elif len(operands) > 1 and isinstance(operands[1], Register):
-            copy = self.__copy_regs(operands[1])
-        elif len(operands) > 2 and isinstance(operands[2], Register):
-            copy = self.__copy_regs(operands[1])
-        else:
-            copy = ''
-        return op_a, op_b, op_c, gpr_push, gpr_pop, zero_gpr, copy
-
-    def __initialise_gprs(self):
-        """
-        Initialize eleven general purpose registers and set them to zero.
-
-        Returns
-        -------
-        (str, str, str)
-            String tuple for push, pop and initalisation operations
-        """
-
-        gpr_push = ''
-        gpr_pop = ''
-        zero_gpr = ''
-        for reg in self.gprs64:
-            gpr_push += '\t\tpush    {}\n'.format(reg)
-        for reg in reversed(self.gprs64):
-            gpr_pop += '\t\tpop     {}\n'.format(reg)
-        for reg in self.gprs64:
-            zero_gpr += '\t\txor     {}, {}\n'.format(reg, reg)
-        return gpr_push, gpr_pop, zero_gpr
-
-
-    # Copy created values in specific register
-    def __copy_regs(self, reg):
-        """
-        Copy created values in specific register.
-
-        Parameters
-        ----------
-        reg : Register
-            Register for copying the value
-
-        Returns
-        -------
-        str
-            String containing the copy instructions
-        """
-        copy = '\t\t# copy DP 1.0\n'
-        # Different handling for GPR, MMX and SSE/AVX registers
-        if reg.reg_type == 'GPR':
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][1])
-            copy += '\t\t# Create DP 2.0\n'
-            copy += '\t\tadd {}, {}\n'.format(self.ops['gpr64'][1], self.ops['gpr64'][0])
-            copy += '\t\t# Create DP 0.5\n'
-            copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0])
-            copy += '\t\tmovq {}, {}\n'.format(self.ops['gpr64'][2], self.ops['gpr64'][0])
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
-        elif reg.reg_type == 'MMX':
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][0])
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['mmx'][1])
-            copy += '\t\tvmovq {}, xmm0\n'.format(self.ops['gpr64'][0])
-            copy += '\t\t# Create DP 2.0\n'
-            copy += '\t\tadd {}, {}\n'.format(self.ops['mmx'][1], self.ops['mmx'][0])
-            copy += '\t\t# Create DP 0.5\n'
-            copy += '\t\tdiv {}\n'.format(self.ops['gpr64'][0])
-            copy += '\t\tmovq {}, {}\n'.format(self.ops['mmx'][2], self.ops['gpr64'][0])
-        elif reg.reg_type == 'XMM' or reg.reg_type == 'YMM' or reg.reg_type == 'ZMM':
-            key = reg.reg_type.lower()
-            copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][0], self.ops[key][0])
-            copy += '\t\tvmovaps {}, {}\n'.format(self.ops[key][1], self.ops[key][0])
-            copy += '\t\t# Create DP 2.0\n'
-            copy += '\t\tvaddpd {}, {}, {}\n'.format(self.ops[key][1], self.ops[key][1],
-                                                     self.ops[key][1])
-            copy += '\t\t# Create DP 0.5\n'
-            copy += '\t\tvdivpd {}, {}, {}\n'.format(self.ops[key][2], self.ops[key][0],
-                                                     self.ops[key][1])
-        else:
-            copy = ''
-        return copy
-
-    def __define_header(self):
-        """
-        Define header.
-
-        Returns
-        -------
-        (str, str, str, str)
-            String tuple containing the header, value initalisations and extensions
-        """
-        def_instr = '#define INSTR '+self.instr+'\n'
-        ninstr = '#define NINST '+self.num_instr+'\n'
-        pi = ('PI:\n'
-              '.long  0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, '   # 128 bit
-              '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, '          # 256 bit
-              '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9, '          # 384 bit
-              '0xf01b866e, 0x400921f9, 0xf01b866e, 0x400921f9\n')         # 512 bit
-        init = ('#define N edi\n'
-                '#define i r8d\n\n\n'
-                '.intel_syntax noprefix\n'
-                '.globl ninst\n'
-                '.data\n'
-                'ninst:\n'
-                '.long NINST\n'
-                '.align 32\n'
-                + pi +
-                '.text\n'
-                '.globl latency\n'
-                '.type latency, @function\n'
-                '.align 32\n'
-                'latency:\n'
-                '\t\tpush      rbp\n'
-                '\t\tmov       rbp, rsp\n'
-                '\t\txor       i, i\n'
-                '\t\ttest      N, N\n'
-                '\t\tjle       done\n')
-        # Expand to AVX(512) if necessary
-        expand = ''
-        if self.op_a == 'ymm' or self.op_b == 'ymm' or self.op_c == 'ymm':
-            expand = ('\t\t# expand from SSE to AVX\n'
-                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n')
-        if self.op_a == 'zmm' or self.op_b == 'zmm' or self.op_c == 'zmm':
-            expand = ('\t\t# expand from SSE to AVX\n'
-                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n'
-                      '\t\t# expand from AVX to AVX512\n'
-                      '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n')
-        return def_instr, ninstr, init, expand
-
-    def __define_loop_lat(self):
-        """
-        Create latency loop.
-
-        Returns
-        -------
-        str
-            Latency loop as string
-        """
-        loop_lat = ('loop:\n'
-                    '\t\tinc      i\n')
-        if self.num_operands == 0:
-            for i in range(0, int(self.num_instr)):
-                loop_lat += '\t\tINSTR\n'
-        if self.num_operands == 1:
-            for i in range(0, int(self.num_instr)):
-                loop_lat += '\t\tINSTR    {}\n'.format(self.ops[self.op_a][0])
-        elif self.num_operands == 2 and self.op_a == self.op_b:
-            for i in range(0, int(self.num_instr), 2):
-                loop_lat += '\t\tINSTR    {}, {}\n'.format(self.ops[self.op_a][0],
-                                                           self.ops[self.op_b][1])
-                loop_lat += '\t\tINSTR    {}, {}\n'.format(self.ops[self.op_b][1],
-                                                           self.ops[self.op_b][0])
-        elif self.num_operands == 2 and self.op_a != self.op_b:
-            for i in range(0, int(self.num_instr), 2):
-                loop_lat += '\t\tINSTR    {}, {}\n'.format(self.ops[self.op_a][0],
-                                                           self.ops[self.op_b][0])
-                loop_lat += '\t\tINSTR    {}, {}\n'.format(self.ops[self.op_a][0],
-                                                           self.ops[self.op_b][0])
-        elif self.num_operands == 3 and self.op_a == self.op_b:
-            for i in range(0, int(self.num_instr), 2):
-                loop_lat += '\t\tINSTR    {}, {}, {}\n'.format(self.ops[self.op_a][0],
-                                                               self.ops[self.op_b][1],
-                                                               self.ops[self.op_c][0])
-                loop_lat += '\t\tINSTR    {}, {}, {}\n'.format(self.ops[self.op_a][1],
-                                                               self.ops[self.op_b][0],
-                                                               self.ops[self.op_c][0])
-        elif self.num_operands == 3 and self.op_a == self.op_c:
-            for i in range(0, int(self.num_instr), 2):
-                loop_lat += '\t\tINSTR    {}, {}, {}\n'.format(self.ops[self.op_a][0],
-                                                               self.ops[self.op_b][0],
-                                                               self.ops[self.op_c][0])
-                loop_lat += '\t\tINSTR    {}, {}, {}\n'.format(self.ops[self.op_a][1],
-                                                               self.ops[self.op_b][0],
-                                                               self.ops[self.op_c][0])
-        loop_lat += ('\t\tcmp      i, N\n'
-                     '\t\tjl       loop\n')
-        return loop_lat
-
-    def __define_loop_thrpt(self):
-        """
-        Create throughput loop.
-
-        Returns
-        -------
-        str
-            Throughput loop as string
-        """
-        loop_thrpt = ('loop:\n'
-                      '\t\tinc      i\n')
-        ext = ''
-        ext1 = False
-        ext2 = False
-        if self.num_operands == 2:
-            ext1 = True
-        if self.num_operands == 3:
-            ext1 = True
-            ext2 = True
-        for i in range(0, int(self.num_instr)):
-            if self.num_operands == 0:
-                loop_thrpt += '\t\tINSTR\n'
-                continue
-            if ext1:
-                ext = ', {}'.format(self.ops[self.op_b][i % 3])
-            if ext2:
-                ext += ', {}'.format(self.ops[self.op_c][i % 3])
-            reg_num = (i % (len(self.ops[self.op_a]) - 3)) + 3
-            loop_thrpt += '\t\tINSTR    {}{}\n'.format(self.ops[self.op_a][reg_num], ext)
-        loop_thrpt += ('\t\tcmp      i, N\n'
-                       '\t\tjl       loop\n')
-        return loop_thrpt
-
-    def is_in_dir(self):
-        """
-        Check if testcases with the same name already exist in testcase
-        directory.
-
-        Returns
-        -------
-        (bool, bool)
-            True    if file is in directory
-            False   if file is not in directory
-            While the first value stands for the throughput testcase
-            and the second value stands for the latency testcase
-        """
-        tp = False
-        lt = False
-        name = self.instr+self.extension
-        for root, dirs, files in os.walk(os.path.dirname(__file__)+'/benchmarks'):
-            if (name + '-tp.S') in files:
-                tp = True
-            if name+'.S' in files:
-                lt = True
-        return tp, lt
-
-    def get_entryname(self):
-        """
-        Return the name of the entry the instruction form would be the data file
-
-        Returns
-        -------
-        str
-            The composited string out of instruction mnemonic and operands
-        """
-        name = self.instr+self.extension
-        return name
--- a/osaca/utils.py
+++ b/osaca/utils.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+import os.path
+
+
+def find_file(name):
+    """Check for existence of name in user or package data folders and return path."""
+    search_paths = [os.path.expanduser('~/.osaca/data'),  # searched first
+                    os.path.join(os.path.dirname(__file__), 'data')]
+    for directory in search_paths:  # not named `dir`: that would shadow the builtin
+        path = os.path.join(directory, name)
+        if os.path.exists(path):
+            return path
+    raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, search_paths))
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [pep8]
-max-line-length=100
+max-line-length=99

 [metadata]
 license-file=LICENSE
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,7 @@ setup(
        # that you indicate wheter you support Python2, Python 3 or both.
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
    ],

    # What doesd your project relate to?
@@ -89,9 +90,10 @@ setup(
    # requirements files see:
    # https://packaging.python.org/en/latest/requirements.html
    install_requires=[
-        'numpy',
-        'pandas',
        'kerncraft',
+        'networkx',
+        'pyparsing',
+        'pygraphviz',
    ],
    python_requires='>=3.5',

--- a/tests/all_tests.py
+++ b/tests/all_tests.py
@@ -1,14 +1,19 @@
-#!/usr//bin/env python
+#!/usr/bin/env python3

 import sys
-
 import unittest

-
 sys.path[0:0] = ['.', '..']
 suite = unittest.TestLoader().loadTestsFromNames(
    [
-        'test_osaca'
+        'test_base_parser',
+        'test_parser_x86att',
+        'test_parser_AArch64v81',
+        'test_marker_utils',
+        'test_semantics',
+        'test_frontend',
+        'test_db_interface',
+        'test_kerncraftAPI',
    ]
 )

--- a/tests/test_base_parser.py
+++ b/tests/test_base_parser.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+Unit tests for base assembly parser
+"""
+
+import os
+import unittest
+
+from osaca.parser import AttrDict, BaseParser
+
+
+class TestBaseParser(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        try:
+            cls.parser = BaseParser()
+        except NotImplementedError:
+            pass  # NOTE(review): cls.parser stays unset if this is hit -- confirm intended
+        with open(cls._find_file('triad-x86-iaca.s')) as f:
+            cls.triad_code = f.read()
+
+    ##################
+    # Test
+    ##################
+
+    def test_parse_file(self):
+        with self.assertRaises(NotImplementedError):
+            self.parser.parse_file(self.triad_code)
+
+    def test_parse_line(self):
+        line_instruction = '\t\tlea       2(%rax,%rax), %ecx #12.9'
+        with self.assertRaises(NotImplementedError):
+            self.parser.parse_line(line_instruction)
+
+    def test_parse_instruction(self):
+        instr1 = '\t\tvcvtsi2ss %edx, %xmm2, %xmm2\t\t\t#12.27'
+        with self.assertRaises(NotImplementedError):
+            self.parser.parse_instruction(instr1)
+
+    def test_register_funcs(self):
+        reg_a1 = AttrDict({'name': 'rax'})
+        reg_a2 = AttrDict({'name': 'eax'})
+        register_string = 'v1.2d'
+        with self.assertRaises(NotImplementedError):
+            self.parser.is_reg_dependend_of(reg_a1, reg_a2)
+        with self.assertRaises(NotImplementedError):
+            self.parser.parse_register(register_string)
+        with self.assertRaises(NotImplementedError):
+            self.parser.is_gpr(reg_a1)
+        with self.assertRaises(NotImplementedError):
+            self.parser.is_vector_register(reg_a1)
+        with self.assertRaises(NotImplementedError):
+            self.parser.process_operand(reg_a1)
+        with self.assertRaises(NotImplementedError):
+            self.parser.get_full_reg_name(reg_a1)
+
+    def test_normalize_imd(self):
+        imd_hex_1 = {'value': '0x4f'}
+        with self.assertRaises(NotImplementedError):
+            self.parser.normalize_imd(imd_hex_1)
+
+    ##################
+    # Helper functions
+    ##################
+
+    @staticmethod
+    def _find_file(name):
+        testdir = os.path.dirname(__file__)
+        name = os.path.join(testdir, 'test_files', name)
+        assert os.path.exists(name)
+        return name
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestBaseParser)
+    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/test_db_interface.py
+++ b/tests/test_db_interface.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+Unit tests for DB interface
+"""
+
+import unittest
+
+from osaca.db_interface import sanity_check
+from osaca.semantics import MachineModel
+
+
+class TestDBInterface(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        sample_entry = {
+            'name': 'DoItRightAndDoItFast',
+            'operands': [
+                {'class': 'memory', 'offset': 'imd', 'base': 'gpr', 'index': 'gpr', 'scale': 8},
+                {'class': 'register', 'name': 'xmm'},
+            ],
+            'throughput': 1.25,
+            'latency': 125,
+            'uops': 6,
+        }
+        cls.entry_csx = sample_entry.copy()
+        cls.entry_tx2 = sample_entry.copy()
+        cls.entry_zen1 = sample_entry.copy()
+        # dict.copy() is shallow: the nested 'operands' list is shared by all three entries
+        # self.entry_csx['port_pressure'] = [1.25, 0, 1.25, 0.5, 0.5, 0.5, 0.5, 0, 1.25, 1.25, 0]
+        cls.entry_csx['port_pressure'] = [[5, '0156'], [1, '23'], [1, ['2D', '3D']]]
+        # self.entry_tx2['port_pressure'] = [2.5, 2.5, 0, 0, 0.5, 0.5]
+        cls.entry_tx2['port_pressure'] = [[5, '01'], [1, '45']]
+        cls.entry_tx2['operands'] = [sample_entry['operands'][0],  # rebind instead of mutating
+                                     {'class': 'register', 'prefix': 'x'}]
+        # self.entry_zen1['port_pressure'] = [1, 1, 1, 1, 0, 1, 0, 0, 0, 0.5, 1, 0.5, 1]
+        cls.entry_zen1['port_pressure'] = [[4, '0123'], [1, '4'], [1, '89'], [2, ['8D', '9D']]]
+
+    ###########
+    # Tests
+    ###########
+
+    def test_add_single_entry(self):
+        mm_csx = MachineModel('csx')
+        mm_tx2 = MachineModel('tx2')
+        mm_zen1 = MachineModel('zen1')
+        num_entries_csx = len(mm_csx['instruction_forms'])
+        num_entries_tx2 = len(mm_tx2['instruction_forms'])
+        num_entries_zen1 = len(mm_zen1['instruction_forms'])
+
+        mm_csx.set_instruction_entry(self.entry_csx)
+        mm_tx2.set_instruction_entry(self.entry_tx2)
+        mm_zen1.set_instruction_entry({'name': 'empty_operation'})
+
+        num_entries_csx = len(mm_csx['instruction_forms']) - num_entries_csx
+        num_entries_tx2 = len(mm_tx2['instruction_forms']) - num_entries_tx2
+        num_entries_zen1 = len(mm_zen1['instruction_forms']) - num_entries_zen1
+
+        self.assertEqual(num_entries_csx, 1)
+        self.assertEqual(num_entries_tx2, 1)
+        self.assertEqual(num_entries_zen1, 1)
+
+    def test_invalid_add(self):
+        entry = {}
+        with self.assertRaises(KeyError):
+            MachineModel('csx').set_instruction_entry(entry)
+        with self.assertRaises(TypeError):
+            MachineModel('csx').set_instruction()
+
+    def test_sanity_check(self):
+        # non-verbose
+        sanity_check('csx', verbose=False)
+        sanity_check('tx2', verbose=False)
+        sanity_check('zen1', verbose=False)
+        # verbose
+        sanity_check('csx', verbose=True)
+        sanity_check('tx2', verbose=True)
+        sanity_check('zen1', verbose=True)
+
+    ##################
+    # Helper functions
+    ##################
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestDBInterface)
+    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/test_files/hidden_load_machine_model.yml
+++ b/tests/test_files/hidden_load_machine_model.yml
@@ -0,0 +1,539 @@
+osaca_version: 0.3.1
+micro_architecture: AMD Zen (family 17h)
+arch_code: ZEN1
+isa: x86
+load_latency: {gpr: 4.0, xmm: 4.0, ymm: 4.0}
+load_throughput_multiplier: {gpr: 1.0, xmm: 1.0, ymm: 2.0}
+load_throughput:
+- {base: gpr, index: ~, offset: ~, scale: 1, port_pressure:     [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: ~, scale: 8, port_pressure:     [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 1, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: ~, offset: imd, scale: 8, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 1, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: ~, scale: 8, port_pressure:   [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 1, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
+- {base: gpr, index: gpr, offset: imd, scale: 8, port_pressure: [[1, '89'], [1, ['8D','9D']]]}
+hidden_loads: true
+ports: ['0', '1', '2', '3', 3DV, '4', '5', '6', '7', '8', '9', 8D, 9D, ST]
+port_model_scheme: |
+  ┌--------------------------------------┐  ┌-----------------------------------------------┐
+  |       96 entries OoO scheduler       |  |           84 entries OoO scheduler            |
+  └--------------------------------------┘  └-----------------------------------------------┘
+     0 |        1 |       2 |       3 |        4 |     5 |     6 |     7 |      8 |     9 |
+       ▼         ▼         ▼         ▼          ▼       ▼       ▼       ▼        ▼       ▼
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐  ┌------┐ ┌-----┐ ┌-----┐ ┌------┐ ┌-----┐ ┌-----┐
+   |SSE ALU| |SSE ALU| |SSE ALU| |SSE ALU|  | ALU  | | ALU | | ALU | | ALU  | | AGU | | AGU |
+   └-------┘ └-------┘ └-------┘ └-------┘  └------┘ └-----┘ └-----┘ └------┘ └-----┘ └-----┘
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐  ┌------┐ ┌-----┐ ┌-----┐ ┌------┐    |       |
+   |SSE MUL| |SSE MUL| |SSE ADD| |SSE ADD|  |BRANCH| | MUL | | MUL | |BRANCH|    ▼       ▼
+   └-------┘ └-------┘ └-------┘ └-------┘  └------┘ └-----┘ └-----┘ └------┘ ┌-------------┐ 
+   ┌-------┐ ┌-------┐ ┌-------┐ ┌-------┐                                    |    LOAD     |
+   |SSE FMA| |SSE FMA| |  SSE  | |SSE DIV|                                    └-------------┘
+   └-------┘ └-------┘ |  SHUF | └-------┘                                    ┌-------------┐ 
+             ┌-------┐ └-------┘                                              |    LOAD     |
+             |  SSE  |                                                        └-------------┘
+             |  SHUF |                                                        ┌-------------┐
+             └-------┘                                                        |    STORE    |
+                                                                              └-------------┘
+instruction_forms:
+- name: add
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: add
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: addl
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: addq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: cmpl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: cmpq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: ~  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: incq
+  operands:
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567     
+  port_pressure: [[1, '4567']]
+- name: ja
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: jb
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: jne
+  operands:
+  - class: identifier
+  throughput: 0.0
+  latency: ~
+  port_pressure: []
+- name: leaq
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: ~
+    scale: 1
+  - class: register
+    name: gpr
+  throughput: 0.5
+  latency: ~  # 	1*p89
+  port_pressure: [[1, '89']]
+- name: movl
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: mulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: mulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: rcpss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #1.0
+  latency: 5.0
+  port_pressure: []
+- name: sqrtsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #8.0
+  latency: 23.0
+  port_pressure: []
+- name: sqrtss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: ~     #5.0
+  latency: 17.0
+  port_pressure: []
+- name: subq
+  operands:
+  - class: register
+    name: gpr
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: subq
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+- name: vaddpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 3.0  # 	2*p23
+  port_pressure: [[2, '23']]
+- name: vaddsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p23
+  port_pressure: [[1, '23']]
+- name: vaddss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p23
+  port_pressure: [[1, '23']]
+- name: vdivsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 4.0
+  latency: 13.0  # 	1*p3+4*p3DV
+  port_pressure: [[1, '3'], [4.0, [3DV]]]
+- name: vdivss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 3.0
+  latency: 10.0
+  port_pressure: [[1, '3'], [3.0, [3DV]]]
+- name: vfmadd213pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vfmadd231pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vfmadd132pd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vmulsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulss
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 3.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulpd
+  operands:
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01+1*p89+1*p8D9D
+  port_pressure: [[1, '01'], [1, '89'], [1, [8D, 9D]]]
+- name: vmulpd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: vmulpd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 1.0
+  latency: 4.0  # 	2*p01
+  port_pressure: [[2, '01']]
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovapd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovaps
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovaps
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 2.0
+  latency: 3.0  # 	2*p89+2*pST
+  port_pressure: [[2, '89'], [2, [ST]]]
+- name: vmovupd
+  operands:
+  - class: register
+    name: ymm
+  - class: register
+    name: ymm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  - class: register
+    name: xmm
+  throughput: 0.5
+  latency: 4.0  # 	1*p89+1*p8D9D
+  port_pressure: [[1, '89'], [1, [8D, 9D]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: register
+    name: xmm
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: ~
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: ~
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: ~
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
+- name: vmovsd
+  operands:
+  - class: register
+    name: xmm
+  - class: memory
+    base: gpr
+    offset: imd
+    index: gpr
+    scale: 1
+  throughput: 1.0
+  latency: 4.0  # 	1*p89+1*pST
+  port_pressure: [[1, '89'], [1, [ST]]]
--- a/tests/test_files/kernel-AArch64.s
+++ b/tests/test_files/kernel-AArch64.s
@@ -0,0 +1,27 @@
+// mov x1, #111
+// .byte 213,3,32,31
+.LBB0_32:
+    ldp q4, q5, [x9, #-32]
+    ldp q6, q7, [x9], #64
+    ldp q16, q17, [x11, #-32]!
+    ldp q18, q19, [x11], #64
+    fmul    v4.2d, v4.2d, v16.2d
+    fmul    v5.2d, v5.2d, v17.2d
+    fmul    v6.2d, v6.2d, v18.2d
+    fmul    v7.2d, v7.2d, v19.2d
+    ldp q0, q1, [x8, #-32]
+    ldp q2, q3, [x8], #64
+    fadd    v0.2d, v0.2d, v4.2d
+    fadd    v1.2d, v1.2d, v5.2d
+    stp q0, q1, [x10, #-32]
+    fadd    v2.2d, v2.2d, v6.2d
+    fadd    v3.2d, v3.2d, v7.2d
+    stp q2, q3, [x10]
+    add x10, x10, #64           // =64
+    adds    x12, x12, #1            // =1
+    fmov    s0, -1.0e+0
+    fmov    s1, #2.0e+2f
+    prfm    pldl1keep, [x26, #2112]
+    b.ne    .LBB0_32
+// mov x1, #222
+// .byte 213,3,32,31
--- a/tests/test_files/kernel-x86.s
+++ b/tests/test_files/kernel-x86.s
@@ -0,0 +1,13 @@
+#movl    $111,%ebx
+#.byte   100,103,144
+.L10:	
+    vmovapd	(%r15,%rax), %ymm0
+	vmovapd	(%r12,%rax), %ymm3
+	addl	$1, %ecx
+	vfmadd132pd	0(%r13,%rax), %ymm3, %ymm0
+	vmovapd	%ymm0, (%r14,%rax)
+	addq	$32, %rax
+	cmpl	%ecx, %r10d
+	ja	.L10
+#movl    $222,%ebx
+#.byte   100,103,144
--- a/tests/test_files/triad-arm-iaca.s
+++ b/tests/test_files/triad-arm-iaca.s
@@ -0,0 +1,645 @@
+	.text
+	.file	"triad.c"
+	.section	.rodata.cst8,"aM",@progbits,8
+	.p2align	3               // -- Begin function triad
+.LCPI0_0:
+	.xword	4596373779694328218     // double 0.20000000000000001
+.LCPI0_1:
+	.xword	4652007308841189376     // double 1000
+.LCPI0_2:
+	.xword	4517329193108106637     // double 9.9999999999999995E-7
+.LCPI0_3:
+	.xword	4629700416936869888     // double 32
+.LCPI0_4:
+	.xword	4562146422526312448     // double 9.765625E-4
+	.text
+	.globl	triad
+	.p2align	6
+	.type	triad,@function
+triad:                                  // @triad
+	.cfi_startproc
+// %bb.0:
+	sub	sp, sp, #224            // =224
+	str	d8, [sp, #112]          // 8-byte Folded Spill
+	stp	x28, x27, [sp, #128]    // 16-byte Folded Spill
+	stp	x26, x25, [sp, #144]    // 16-byte Folded Spill
+	stp	x24, x23, [sp, #160]    // 16-byte Folded Spill
+	stp	x22, x21, [sp, #176]    // 16-byte Folded Spill
+	stp	x20, x19, [sp, #192]    // 16-byte Folded Spill
+	stp	x29, x30, [sp, #208]    // 16-byte Folded Spill
+	add	x29, sp, #208           // =208
+	.cfi_def_cfa w29, 16
+	.cfi_offset w30, -8
+	.cfi_offset w29, -16
+	.cfi_offset w19, -24
+	.cfi_offset w20, -32
+	.cfi_offset w21, -40
+	.cfi_offset w22, -48
+	.cfi_offset w23, -56
+	.cfi_offset w24, -64
+	.cfi_offset w25, -72
+	.cfi_offset w26, -80
+	.cfi_offset w27, -88
+	.cfi_offset w28, -96
+	.cfi_offset b8, -112
+	mov	w19, w0
+	orr	w0, wzr, #0x40
+	sbfiz	x23, x19, #3, #32
+	mov	x1, x23
+	bl	aligned_alloc
+	mov	x20, x0
+	orr	w0, wzr, #0x40
+	mov	x1, x23
+	bl	aligned_alloc
+	str	x0, [sp, #88]           // 8-byte Folded Spill
+	orr	w0, wzr, #0x40
+	mov	x1, x23
+	bl	aligned_alloc
+	mov	x22, x0
+	orr	w0, wzr, #0x40
+	mov	x1, x23
+	bl	aligned_alloc
+	mov	x23, x0
+	cmp	w19, #0                 // =0
+	b.le	.LBB0_3
+// %bb.1:
+	mov	w24, w19
+	cmp	w19, #7                 // =7
+	b.hi	.LBB0_9
+// %bb.2:
+	mov	x8, xzr
+	b	.LBB0_17
+.LBB0_3:
+	adrp	x8, .LCPI0_0
+	orr	w25, wzr, #0x1
+	ldr	d8, [x8, :lo12:.LCPI0_0]
+	.p2align	6
+.LBB0_4:                                // =>This Loop Header: Depth=1
+                                        //     Child Loop BB0_5 Depth 2
+	sub	x0, x29, #88            // =88
+	add	x1, sp, #96             // =96
+	bl	timing
+	mov	w21, w25
+	cbz	w25, .LBB0_8
+	.p2align	6
+.LBB0_5:                                //   Parent Loop BB0_4 Depth=1
+                                        // =>  This Inner Loop Header: Depth=2
+	ldr	d0, [x20]
+	fcmp	d0, #0.0
+	b.le	.LBB0_7
+// %bb.6:                               //   in Loop: Header=BB0_5 Depth=2
+	mov	x0, x20
+	bl	dummy
+.LBB0_7:                                //   in Loop: Header=BB0_5 Depth=2
+	subs	w21, w21, #1            // =1
+	b.ne	.LBB0_5
+.LBB0_8:                                //   in Loop: Header=BB0_4 Depth=1
+	add	x0, sp, #104            // =104
+	add	x1, sp, #96             // =96
+	bl	timing
+	ldr	d0, [sp, #104]
+	ldur	d1, [x29, #-88]
+	fsub	d1, d0, d1
+	lsl	w25, w25, #1
+	fcmp	d1, d8
+	b.mi	.LBB0_4
+	b	.LBB0_38
+.LBB0_9:
+	and	x8, x24, #0xfffffff8
+	sub	x10, x8, #8             // =8
+	lsr	x11, x10, #3
+	add	w9, w11, #1             // =1
+	and	x9, x9, #0x3
+	cmp	x10, #24                // =24
+	b.hs	.LBB0_11
+// %bb.10:
+	orr	w13, wzr, #0x20
+	cbnz	x9, .LBB0_14
+	b	.LBB0_16
+.LBB0_11:
+	mov	x16, #28286
+	movk	x16, #29109, lsl #16
+	ldr	x15, [sp, #88]          // 8-byte Folded Reload
+	movk	x16, #34426, lsl #32
+	movk	x16, #16000, lsl #48
+	dup	v0.2d, x16
+	mvn	x11, x11
+	mov	x10, xzr
+	add	x11, x9, x11
+	add	x12, x23, #128          // =128
+	add	x13, x20, #128          // =128
+	add	x14, x22, #128          // =128
+	add	x15, x15, #128          // =128
+	.p2align	6
+.LBB0_12:                               // =>This Inner Loop Header: Depth=1
+	stp	q0, q0, [x12]
+	stp	q0, q0, [x12, #-128]
+	stp	q0, q0, [x12, #32]
+	stp	q0, q0, [x12, #-96]
+	stp	q0, q0, [x14]
+	add	x10, x10, #32           // =32
+	stp	q0, q0, [x14, #-128]
+	stp	q0, q0, [x14, #32]
+	stp	q0, q0, [x14, #-96]
+	stp	q0, q0, [x15]
+	stp	q0, q0, [x15, #-128]
+	stp	q0, q0, [x15, #32]
+	stp	q0, q0, [x15, #-96]
+	stp	q0, q0, [x13]
+	stp	q0, q0, [x13, #-128]
+	stp	q0, q0, [x13, #32]
+	stp	q0, q0, [x13, #-96]
+	stp	q0, q0, [x12, #64]
+	stp	q0, q0, [x12, #-64]
+	stp	q0, q0, [x12, #96]
+	stp	q0, q0, [x12, #-32]
+	add	x12, x12, #256          // =256
+	stp	q0, q0, [x14, #64]
+	stp	q0, q0, [x14, #-64]
+	stp	q0, q0, [x14, #96]
+	stp	q0, q0, [x14, #-32]
+	add	x14, x14, #256          // =256
+	stp	q0, q0, [x15, #64]
+	stp	q0, q0, [x15, #-64]
+	stp	q0, q0, [x15, #96]
+	stp	q0, q0, [x15, #-32]
+	add	x15, x15, #256          // =256
+	stp	q0, q0, [x13, #64]
+	stp	q0, q0, [x13, #-64]
+	stp	q0, q0, [x13, #96]
+	stp	q0, q0, [x13, #-32]
+	add	x13, x13, #256          // =256
+	adds	x11, x11, #4            // =4
+	b.ne	.LBB0_12
+// %bb.13:
+	lsl	x10, x10, #3
+	orr	x13, x10, #0x20
+	cbz	x9, .LBB0_16
+.LBB0_14:
+	ldr	x14, [sp, #88]          // 8-byte Folded Reload
+	add	x10, x23, x13
+	add	x11, x22, x13
+	add	x12, x20, x13
+	add	x13, x14, x13
+	mov	x14, #28286
+	movk	x14, #29109, lsl #16
+	movk	x14, #34426, lsl #32
+	movk	x14, #16000, lsl #48
+	dup	v0.2d, x14
+	neg	x9, x9
+	.p2align	6
+.LBB0_15:                               // =>This Inner Loop Header: Depth=1
+	stp	q0, q0, [x10]
+	stp	q0, q0, [x11]
+	stp	q0, q0, [x10, #-32]
+	stp	q0, q0, [x13]
+	stp	q0, q0, [x11, #-32]
+	add	x10, x10, #64           // =64
+	stp	q0, q0, [x12]
+	stp	q0, q0, [x13, #-32]
+	add	x11, x11, #64           // =64
+	stp	q0, q0, [x12, #-32]
+	add	x12, x12, #64           // =64
+	add	x13, x13, #64           // =64
+	adds	x9, x9, #1              // =1
+	b.ne	.LBB0_15
+.LBB0_16:
+	cmp	x8, x24
+	b.eq	.LBB0_19
+.LBB0_17:
+	ldr	x10, [sp, #88]          // 8-byte Folded Reload
+	mov	x13, #28286
+	movk	x13, #29109, lsl #16
+	lsl	x12, x8, #3
+	movk	x13, #34426, lsl #32
+	add	x9, x20, x12
+	movk	x13, #16000, lsl #48
+	add	x10, x10, x12
+	add	x11, x22, x12
+	add	x12, x23, x12
+	sub	x8, x24, x8
+	.p2align	6
+.LBB0_18:                               // =>This Inner Loop Header: Depth=1
+	str	x13, [x12], #8
+	str	x13, [x11], #8
+	str	x13, [x10], #8
+	str	x13, [x9], #8
+	subs	x8, x8, #1              // =1
+	b.ne	.LBB0_18
+.LBB0_19:
+	ldr	x10, [sp, #88]          // 8-byte Folded Reload
+	add	x8, x20, #256           // =256
+	and	x26, x24, #0xfffffff8
+	str	x8, [sp, #40]           // 8-byte Folded Spill
+	add	x8, x23, #256           // =256
+	sub	x27, x26, #8            // =8
+	str	x8, [sp, #32]           // 8-byte Folded Spill
+	add	x8, x22, #256           // =256
+	orr	w25, wzr, #0x1
+	str	x8, [sp, #24]           // 8-byte Folded Spill
+	add	x8, x10, #256           // =256
+	str	x8, [sp, #16]           // 8-byte Folded Spill
+	lsr	x8, x27, #3
+	add	w9, w8, #1              // =1
+	mvn	x8, x8
+	and	x28, x9, #0x7
+	add	x8, x28, x8
+	str	x8, [sp, #8]            // 8-byte Folded Spill
+	neg	x8, x28
+	str	x8, [sp, #80]           // 8-byte Folded Spill
+	add	x8, x10, #32            // =32
+	str	x8, [sp, #72]           // 8-byte Folded Spill
+	add	x8, x22, #32            // =32
+	str	x8, [sp, #64]           // 8-byte Folded Spill
+	add	x8, x20, #32            // =32
+	str	x8, [sp, #56]           // 8-byte Folded Spill
+	add	x8, x23, #32            // =32
+	str	x8, [sp, #48]           // 8-byte Folded Spill
+	adrp	x8, .LCPI0_0
+	ldr	d8, [x8, :lo12:.LCPI0_0]
+	.p2align	6
+.LBB0_20:                               // =>This Loop Header: Depth=1
+                                        //     Child Loop BB0_22 Depth 2
+                                        //       Child Loop BB0_29 Depth 3
+                                        //       Child Loop BB0_32 Depth 3
+                                        //       Child Loop BB0_35 Depth 3
+	sub	x0, x29, #88            // =88
+	add	x1, sp, #96             // =96
+	bl	timing
+	cbz	w25, .LBB0_37
+// %bb.21:                              //   in Loop: Header=BB0_20 Depth=1
+	mov	w21, wzr
+	.p2align	6
+.LBB0_22:                               //   Parent Loop BB0_20 Depth=1
+                                        // =>  This Loop Header: Depth=2
+                                        //       Child Loop BB0_29 Depth 3
+                                        //       Child Loop BB0_32 Depth 3
+                                        //       Child Loop BB0_35 Depth 3
+	ldr	d0, [x20]
+	fcmp	d0, #0.0
+	b.le	.LBB0_24
+// %bb.23:                              //   in Loop: Header=BB0_22 Depth=2
+	mov	x0, x20
+	bl	dummy
+.LBB0_24:                               //   in Loop: Header=BB0_22 Depth=2
+	cmp	w19, #7                 // =7
+	b.hi	.LBB0_26
+// %bb.25:                              //   in Loop: Header=BB0_22 Depth=2
+	mov	x12, xzr
+	b	.LBB0_34
+	.p2align	6
+.LBB0_26:                               //   in Loop: Header=BB0_22 Depth=2
+	cmp	x27, #56                // =56
+	b.hs	.LBB0_28
+// %bb.27:                              //   in Loop: Header=BB0_22 Depth=2
+	mov	x8, xzr
+	cbnz	x28, .LBB0_31
+	b	.LBB0_33
+	.p2align	6
+.LBB0_28:                               //   in Loop: Header=BB0_22 Depth=2
+	ldp	x9, x10, [sp, #16]      // 8-byte Folded Reload
+	ldp	x11, x12, [sp, #32]     // 8-byte Folded Reload
+	ldr	x13, [sp, #8]           // 8-byte Folded Reload
+	mov	x8, xzr
+	.p2align	6
+    mov x1, #111                // OSACA START
+    .byte 213,3,32,31           // OSACA START
+.LBB0_29:                               //   Parent Loop BB0_20 Depth=1
+                                        //     Parent Loop BB0_22 Depth=2
+                                        // =>    This Inner Loop Header: Depth=3
+	ldp	q2, q5, [x10, #-256]
+	ldp	q6, q7, [x10, #-224]
+	ldp	q16, q17, [x11, #-256]
+	ldp	q18, q19, [x11, #-224]
+	fmul	v2.2d, v2.2d, v16.2d
+	fmul	v5.2d, v5.2d, v17.2d
+	fmul	v6.2d, v6.2d, v18.2d
+	ldp	q0, q1, [x9, #-256]
+	ldp	q3, q4, [x9, #-224]
+	fmul	v7.2d, v7.2d, v19.2d
+	fadd	v0.2d, v0.2d, v2.2d
+	fadd	v2.2d, v1.2d, v5.2d
+	stp	q0, q2, [x12, #-256]
+	fadd	v1.2d, v3.2d, v6.2d
+	ldp	q6, q17, [x10, #-192]
+	ldp	q18, q19, [x10, #-160]
+	ldp	q20, q21, [x11, #-192]
+	ldp	q22, q23, [x11, #-160]
+	fmul	v6.2d, v6.2d, v20.2d
+	fmul	v17.2d, v17.2d, v21.2d
+	fmul	v18.2d, v18.2d, v22.2d
+	fadd	v3.2d, v4.2d, v7.2d
+	stp	q1, q3, [x12, #-224]
+	ldp	q4, q5, [x9, #-192]
+	ldp	q7, q16, [x9, #-160]
+	fmul	v19.2d, v19.2d, v23.2d
+	fadd	v4.2d, v4.2d, v6.2d
+	fadd	v6.2d, v5.2d, v17.2d
+	stp	q4, q6, [x12, #-192]
+	fadd	v5.2d, v7.2d, v18.2d
+	ldp	q18, q21, [x10, #-128]
+	ldp	q22, q23, [x10, #-96]
+	ldp	q24, q25, [x11, #-128]
+	ldp	q26, q27, [x11, #-96]
+	fmul	v18.2d, v18.2d, v24.2d
+	fmul	v21.2d, v21.2d, v25.2d
+	fmul	v22.2d, v22.2d, v26.2d
+	fadd	v7.2d, v16.2d, v19.2d
+	stp	q5, q7, [x12, #-160]
+	ldp	q16, q17, [x9, #-128]
+	ldp	q19, q20, [x9, #-96]
+	fadd	v16.2d, v16.2d, v18.2d
+	fadd	v18.2d, v17.2d, v21.2d
+	stp	q16, q18, [x12, #-128]
+	fadd	v17.2d, v19.2d, v22.2d
+	ldp	q22, q25, [x10, #-64]
+	ldp	q28, q29, [x11, #-64]
+	fmul	v23.2d, v23.2d, v27.2d
+	ldp	q26, q27, [x10, #-32]
+	fmul	v22.2d, v22.2d, v28.2d
+	fmul	v25.2d, v25.2d, v29.2d
+	ldp	q28, q29, [x11, #-32]
+	fmul	v26.2d, v26.2d, v28.2d
+	fmul	v27.2d, v27.2d, v29.2d
+	fadd	v19.2d, v20.2d, v23.2d
+	stp	q17, q19, [x12, #-96]
+	ldp	q20, q21, [x9, #-64]
+	ldp	q23, q24, [x9, #-32]
+	fadd	v20.2d, v20.2d, v22.2d
+	fadd	v22.2d, v21.2d, v25.2d
+	stp	q20, q22, [x12, #-64]
+	fadd	v21.2d, v23.2d, v26.2d
+	fadd	v23.2d, v24.2d, v27.2d
+	stp	q21, q23, [x12, #-32]
+	ldp	q24, q25, [x10]
+	ldp	q28, q29, [x11]
+	ldp	q26, q27, [x10, #32]
+	fmul	v24.2d, v24.2d, v28.2d
+	fmul	v25.2d, v25.2d, v29.2d
+	ldp	q28, q29, [x11, #32]
+	fmul	v26.2d, v26.2d, v28.2d
+	fmul	v27.2d, v27.2d, v29.2d
+	ldp	q28, q29, [x9]
+	fadd	v24.2d, v28.2d, v24.2d
+	fadd	v25.2d, v29.2d, v25.2d
+	stp	q24, q25, [x12]
+	ldp	q28, q29, [x9, #32]
+	fadd	v26.2d, v28.2d, v26.2d
+	fadd	v27.2d, v29.2d, v27.2d
+	stp	q26, q27, [x12, #32]
+	ldp	q24, q25, [x10, #64]
+	ldp	q28, q29, [x11, #64]
+	ldp	q26, q27, [x10, #96]
+	fmul	v24.2d, v24.2d, v28.2d
+	fmul	v25.2d, v25.2d, v29.2d
+	ldp	q28, q29, [x11, #96]
+	fmul	v26.2d, v26.2d, v28.2d
+	fmul	v27.2d, v27.2d, v29.2d
+	ldp	q28, q29, [x9, #64]
+	fadd	v24.2d, v28.2d, v24.2d
+	fadd	v25.2d, v29.2d, v25.2d
+	stp	q24, q25, [x12, #64]
+	ldp	q28, q29, [x9, #96]
+	fadd	v26.2d, v28.2d, v26.2d
+	fadd	v27.2d, v29.2d, v27.2d
+	stp	q26, q27, [x12, #96]
+	ldp	q24, q25, [x10, #128]
+	ldp	q28, q29, [x11, #128]
+	ldp	q26, q27, [x10, #160]
+	fmul	v24.2d, v24.2d, v28.2d
+	fmul	v25.2d, v25.2d, v29.2d
+	ldp	q28, q29, [x11, #160]
+	fmul	v26.2d, v26.2d, v28.2d
+	fmul	v27.2d, v27.2d, v29.2d
+	ldp	q28, q29, [x9, #128]
+	fadd	v24.2d, v28.2d, v24.2d
+	fadd	v25.2d, v29.2d, v25.2d
+	stp	q24, q25, [x12, #128]
+	ldp	q28, q29, [x9, #160]
+	fadd	v26.2d, v28.2d, v26.2d
+	fadd	v27.2d, v29.2d, v27.2d
+	stp	q26, q27, [x12, #160]
+	ldp	q24, q25, [x10, #192]
+	ldp	q26, q27, [x11, #192]
+	fmul	v24.2d, v24.2d, v26.2d
+	ldp	q26, q28, [x10, #224]
+	fmul	v25.2d, v25.2d, v27.2d
+	ldp	q27, q0, [x11, #224]
+	fmul	v2.2d, v26.2d, v27.2d
+	fmul	v0.2d, v28.2d, v0.2d
+	ldp	q1, q3, [x9, #192]
+	ldp	q4, q5, [x9, #224]
+	fadd	v1.2d, v1.2d, v24.2d
+	fadd	v3.2d, v3.2d, v25.2d
+	stp	q1, q3, [x12, #192]
+	fadd	v2.2d, v4.2d, v2.2d
+	fadd	v0.2d, v5.2d, v0.2d
+	stp	q2, q0, [x12, #224]
+	add	x8, x8, #64             // =64
+	add	x12, x12, #512          // =512
+	add	x11, x11, #512          // =512
+	add	x10, x10, #512          // =512
+	add	x9, x9, #512            // =512
+	adds	x13, x13, #8            // =8
+	b.ne	.LBB0_29
+    mov x1, #222                // OSACA END
+    .byte 213,3,32,31           // OSACA END
+// %bb.30:                              //   in Loop: Header=BB0_22 Depth=2
+	cbz	x28, .LBB0_33
+.LBB0_31:                               //   in Loop: Header=BB0_22 Depth=2
+	lsl	x11, x8, #3
+	ldp	x9, x8, [sp, #64]       // 8-byte Folded Reload
+	ldp	x12, x10, [sp, #48]     // 8-byte Folded Reload
+	add	x8, x8, x11
+	add	x9, x9, x11
+	add	x10, x10, x11
+	add	x11, x12, x11
+	ldr	x12, [sp, #80]          // 8-byte Folded Reload
+	.p2align	6
+.LBB0_32:                               //   Parent Loop BB0_20 Depth=1
+                                        //     Parent Loop BB0_22 Depth=2
+                                        // =>    This Inner Loop Header: Depth=3
+	ldp	q4, q5, [x9, #-32]
+	ldp	q6, q7, [x9], #64
+	ldp	q16, q17, [x11, #-32]
+	ldp	q18, q19, [x11], #64
+	fmul	v4.2d, v4.2d, v16.2d
+	fmul	v5.2d, v5.2d, v17.2d
+	fmul	v6.2d, v6.2d, v18.2d
+	fmul	v7.2d, v7.2d, v19.2d
+	ldp	q0, q1, [x8, #-32]
+	ldp	q2, q3, [x8], #64
+	fadd	v0.2d, v0.2d, v4.2d
+	fadd	v1.2d, v1.2d, v5.2d
+	stp	q0, q1, [x10, #-32]
+	fadd	v2.2d, v2.2d, v6.2d
+	fadd	v3.2d, v3.2d, v7.2d
+	stp	q2, q3, [x10]
+	add	x10, x10, #64           // =64
+	adds	x12, x12, #1            // =1
+	b.ne	.LBB0_32
+.LBB0_33:                               //   in Loop: Header=BB0_22 Depth=2
+	mov	x12, x26
+	cmp	x26, x24
+	b.eq	.LBB0_36
+.LBB0_34:                               //   in Loop: Header=BB0_22 Depth=2
+	ldr	x8, [sp, #88]           // 8-byte Folded Reload
+	lsl	x11, x12, #3
+	sub	x12, x24, x12
+	add	x8, x8, x11
+	add	x9, x22, x11
+	add	x10, x23, x11
+	add	x11, x20, x11
+	.p2align	6
+.LBB0_35:                               //   Parent Loop BB0_20 Depth=1
+                                        //     Parent Loop BB0_22 Depth=2
+                                        // =>    This Inner Loop Header: Depth=3
+	ldr	d0, [x8], #8
+	ldr	d1, [x9], #8
+	ldr	d2, [x10], #8
+	fmul	d1, d1, d2
+	fadd	d0, d0, d1
+	str	d0, [x11], #8
+	subs	x12, x12, #1            // =1
+	b.ne	.LBB0_35
+.LBB0_36:                               //   in Loop: Header=BB0_22 Depth=2
+	add	w21, w21, #1            // =1
+	cmp	w21, w25
+	b.ne	.LBB0_22
+.LBB0_37:                               //   in Loop: Header=BB0_20 Depth=1
+	add	x0, sp, #104            // =104
+	add	x1, sp, #96             // =96
+	bl	timing
+	ldr	d0, [sp, #104]
+	ldur	d1, [x29, #-88]
+	fsub	d1, d0, d1
+	lsl	w25, w25, #1
+	fcmp	d1, d8
+	b.mi	.LBB0_20
+.LBB0_38:
+	scvtf	d4, w19
+	lsr	w1, w25, #1
+	adrp	x8, .LCPI0_1
+	scvtf	d6, w1
+	fadd	d2, d4, d4
+	ldr	d5, [x8, :lo12:.LCPI0_1]
+	adrp	x8, .LCPI0_2
+	fmov	d0, #8.00000000
+	fmul	d2, d2, d6
+	ldr	d3, [x8, :lo12:.LCPI0_2]
+	adrp	x8, .LCPI0_3
+	adrp	x0, .L.str
+	fmul	d2, d2, d3
+	ldr	d3, [x8, :lo12:.LCPI0_3]
+	adrp	x8, .LCPI0_4
+	add	x0, x0, :lo12:.L.str
+	fmul	d3, d6, d3
+	fmul	d0, d4, d0
+	fmul	d3, d3, d4
+	fmul	d4, d4, d6
+	fdiv	d3, d3, d1
+	fdiv	d4, d4, d1
+	fdiv	d4, d4, d5
+	fdiv	d0, d0, d5
+	fdiv	d2, d2, d1
+	ldr	d7, [x8, :lo12:.LCPI0_4]
+	fmul	d3, d3, d7
+	fdiv	d4, d4, d5
+	fmul	d3, d3, d7
+	mov	w2, w19
+	bl	printf
+	mov	x0, x20
+	bl	free
+	ldr	x0, [sp, #88]           // 8-byte Folded Reload
+	bl	free
+	mov	x0, x22
+	bl	free
+	mov	x0, x23
+	bl	free
+	ldp	x29, x30, [sp, #208]    // 16-byte Folded Reload
+	ldp	x20, x19, [sp, #192]    // 16-byte Folded Reload
+	ldp	x22, x21, [sp, #176]    // 16-byte Folded Reload
+	ldp	x24, x23, [sp, #160]    // 16-byte Folded Reload
+	ldp	x26, x25, [sp, #144]    // 16-byte Folded Reload
+	ldp	x28, x27, [sp, #128]    // 16-byte Folded Reload
+	ldr	d8, [sp, #112]          // 8-byte Folded Reload
+	add	sp, sp, #224            // =224
+	ret
+.Lfunc_end0:
+	.size	triad, .Lfunc_end0-triad
+	.cfi_endproc
+                                        // -- End function
+	.globl	main                    // -- Begin function main
+	.p2align	6
+	.type	main,@function
+main:                                   // @main
+	.cfi_startproc
+// %bb.0:
+	stp	x29, x30, [sp, #-16]!   // 16-byte Folded Spill
+	mov	x29, sp
+	.cfi_def_cfa w29, 16
+	.cfi_offset w30, -8
+	.cfi_offset w29, -16
+	adrp	x0, .Lstr
+	add	x0, x0, :lo12:.Lstr
+	bl	puts
+	adrp	x0, .Lstr.3
+	add	x0, x0, :lo12:.Lstr.3
+	bl	puts
+	mov	w0, #190
+	bl	triad
+	mov	w0, #247
+	bl	triad
+	mov	w0, #321
+	bl	triad
+	mov	w0, #417
+	bl	triad
+	mov	w0, #542
+	bl	triad
+	mov	w0, #705
+	bl	triad
+	mov	w0, #917
+	bl	triad
+	mov	w0, #1192
+	bl	triad
+	mov	w0, #1550
+	bl	triad
+	mov	w0, #2015
+	bl	triad
+	mov	w0, #2619
+	bl	triad
+	mov	w0, #3405
+	bl	triad
+	mov	w0, #4427
+	bl	triad
+	mov	w0, #5756
+	bl	triad
+	mov	w0, #7482
+	bl	triad
+	mov	w0, #9727
+	bl	triad
+	mov	w0, wzr
+	ldp	x29, x30, [sp], #16     // 16-byte Folded Reload
+	ret
+.Lfunc_end1:
+	.size	main, .Lfunc_end1-main
+	.cfi_endproc
+	.type	.L.str,@object          // @.str
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str:
+	.asciz	"%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
+	.size	.L.str, 53
+	.type	.Lstr,@object           // @str
+	.section	.rodata.str1.16,"aMS",@progbits,1
+	.p2align	4
+.Lstr:
+	.asciz	"TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
+	.size	.Lstr, 51
+	.type	.Lstr.3,@object         // @str.3
+	.p2align	4
+.Lstr.3:
+	.asciz	"Size (KByte) |   runtime  |  MFlop/s  |  MB/s   |  MLUP/s | repeat | size"
+	.size	.Lstr.3, 74
+	.ident	"Arm C/C++/Fortran Compiler version 19.0 (build number 69) (based on LLVM 7.0.2)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
--- a/tests/test_files/triad-x86-iaca.s
+++ b/tests/test_files/triad-x86-iaca.s
@@ -0,0 +1,353 @@
+	.file	"triad.c"
+	.section	.rodata.str1.8,"aMS",@progbits,1
+	.align 8
+.LC9:
+	.string	"%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
+	.text
+	.p2align 4,,15
+	.globl	triad
+	.type	triad, @function
+triad:
+.LFB24:
+	.cfi_startproc
+	pushq	%r13
+	.cfi_def_cfa_offset 16
+	.cfi_offset 13, -16
+	movslq	%edi, %rax
+	movl	$64, %edi
+	leaq	16(%rsp), %r13
+	.cfi_def_cfa 13, 0
+	andq	$-32, %rsp
+	pushq	-8(%r13)
+	pushq	%rbp
+	.cfi_escape 0x10,0x6,0x2,0x76,0
+	movq	%rsp, %rbp
+	pushq	%r15
+	.cfi_escape 0x10,0xf,0x2,0x76,0x78
+	leaq	0(,%rax,8), %r15
+	pushq	%r14
+	movq	%r15, %rsi
+	pushq	%r13
+	.cfi_escape 0xf,0x3,0x76,0x68,0x6
+	.cfi_escape 0x10,0xe,0x2,0x76,0x70
+	pushq	%r12
+	pushq	%rbx
+	.cfi_escape 0x10,0xc,0x2,0x76,0x60
+	.cfi_escape 0x10,0x3,0x2,0x76,0x58
+	movq	%rax, %rbx
+	subq	$72, %rsp
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r14
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r12
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r13
+	call	aligned_alloc
+	movq	%rax, %r15
+	leal	-1(%rbx), %eax
+	movl	%eax, -96(%rbp)
+	testl	%ebx, %ebx
+	jle	.L2
+	cmpl	$2, %eax
+	jbe	.L14
+	movl	%ebx, %esi
+	vmovapd	.LC0(%rip), %ymm0
+	xorl	%eax, %eax
+	xorl	%ecx, %ecx
+	shrl	$2, %esi
+	.p2align 4,,10
+	.p2align 3
+.L4:
+	addl	$1, %ecx
+	vmovapd	%ymm0, (%r15,%rax)
+	vmovapd	%ymm0, 0(%r13,%rax)
+	vmovapd	%ymm0, (%r12,%rax)
+	vmovapd	%ymm0, (%r14,%rax)
+	addq	$32, %rax
+	cmpl	%ecx, %esi
+	ja	.L4
+	movl	%ebx, %eax
+	andl	$-4, %eax
+	cmpl	%eax, %ebx
+	je	.L26
+	vzeroupper
+.L3:
+	vmovsd	.LC1(%rip), %xmm0
+	movslq	%eax, %rcx
+	vmovsd	%xmm0, (%r15,%rcx,8)
+	vmovsd	%xmm0, 0(%r13,%rcx,8)
+	vmovsd	%xmm0, (%r12,%rcx,8)
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	leal	1(%rax), %ecx
+	cmpl	%ecx, %ebx
+	jle	.L2
+	movslq	%ecx, %rcx
+	addl	$2, %eax
+	vmovsd	%xmm0, (%r15,%rcx,8)
+	vmovsd	%xmm0, 0(%r13,%rcx,8)
+	vmovsd	%xmm0, (%r12,%rcx,8)
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	cmpl	%eax, %ebx
+	jle	.L2
+	cltq
+	vmovsd	%xmm0, (%r15,%rax,8)
+	vmovsd	%xmm0, 0(%r13,%rax,8)
+	vmovsd	%xmm0, (%r12,%rax,8)
+	vmovsd	%xmm0, (%r14,%rax,8)
+.L2:
+	movl	%ebx, %eax
+	movl	$1, -84(%rbp)
+	movl	%ebx, %r10d
+	andl	$-4, %eax
+	shrl	$2, %r10d
+	movl	%eax, -100(%rbp)
+	.p2align 4,,10
+	.p2align 3
+.L13:
+	leaq	-56(%rbp), %rsi
+	leaq	-72(%rbp), %rdi
+	movl	%r10d, -88(%rbp)
+	call	timing
+	movl	-88(%rbp), %r10d
+	xorl	%r11d, %r11d
+	.p2align 4,,10
+	.p2align 3
+.L12:
+	vmovsd	(%r14), %xmm0
+	vxorpd	%xmm7, %xmm7, %xmm7
+	vucomisd	%xmm7, %xmm0
+	jbe	.L6
+	movq	%r14, %rdi
+	movl	%r11d, -92(%rbp)
+	movl	%r10d, -88(%rbp)
+	vzeroupper
+	call	dummy
+	movl	-92(%rbp), %r11d
+	movl	-88(%rbp), %r10d
+.L6:
+	testl	%ebx, %ebx
+	jle	.L8
+	cmpl	$2, -96(%rbp)
+	jbe	.L15
+	xorl	%eax, %eax
+	xorl	%ecx, %ecx
+	.p2align 4,,10
+	.p2align 3
+        movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+.L10:
+	vmovapd	(%r15,%rax), %ymm0
+	vmovapd	(%r12,%rax), %ymm3
+	addl	$1, %ecx
+	vfmadd132pd	0(%r13,%rax), %ymm3, %ymm0
+	vmovapd	%ymm0, (%r14,%rax)
+	addq	$32, %rax
+	cmpl	%ecx, %r10d
+	ja	.L10
+        movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+	movl	-100(%rbp), %eax
+	cmpl	%ebx, %eax
+	je	.L8
+.L9:
+	movslq	%eax, %rcx
+	vmovsd	0(%r13,%rcx,8), %xmm0
+	vmovsd	(%r12,%rcx,8), %xmm5
+	vfmadd132sd	(%r15,%rcx,8), %xmm5, %xmm0
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	leal	1(%rax), %ecx
+	cmpl	%ebx, %ecx
+	jge	.L8
+	movslq	%ecx, %rcx
+	addl	$2, %eax
+	vmovsd	0(%r13,%rcx,8), %xmm0
+	vmovsd	(%r12,%rcx,8), %xmm6
+	vfmadd132sd	(%r15,%rcx,8), %xmm6, %xmm0
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	cmpl	%eax, %ebx
+	jle	.L8
+	cltq
+	vmovsd	(%r15,%rax,8), %xmm0
+	vmovsd	(%r12,%rax,8), %xmm4
+	vfmadd132sd	0(%r13,%rax,8), %xmm4, %xmm0
+	vmovsd	%xmm0, (%r14,%rax,8)
+.L8:
+	addl	$1, %r11d
+	cmpl	-84(%rbp), %r11d
+	jne	.L12
+	leaq	-56(%rbp), %rsi
+	leaq	-64(%rbp), %rdi
+	movl	%r11d, -84(%rbp)
+	movl	%r10d, -88(%rbp)
+	vzeroupper
+	call	timing
+	vmovsd	-64(%rbp), %xmm1
+	vsubsd	-72(%rbp), %xmm1, %xmm1
+	vmovsd	.LC3(%rip), %xmm2
+	movl	-84(%rbp), %r11d
+	movl	-88(%rbp), %r10d
+	vucomisd	%xmm1, %xmm2
+	leal	(%r11,%r11), %eax
+	movl	%eax, -84(%rbp)
+	ja	.L13
+	movl	%eax, %esi
+	vxorpd	%xmm6, %xmm6, %xmm6
+	vxorpd	%xmm0, %xmm0, %xmm0
+	movl	%ebx, %edx
+	sarl	%esi
+	vcvtsi2sd	%ebx, %xmm0, %xmm0
+	movl	$.LC9, %edi
+	movl	$5, %eax
+	vcvtsi2sd	%esi, %xmm6, %xmm6
+	vmulsd	.LC5(%rip), %xmm6, %xmm2
+	vmovsd	.LC4(%rip), %xmm5
+	vmovsd	.LC6(%rip), %xmm7
+	vmulsd	%xmm0, %xmm6, %xmm4
+	vmulsd	%xmm0, %xmm2, %xmm2
+	vdivsd	%xmm1, %xmm4, %xmm4
+	vdivsd	%xmm1, %xmm2, %xmm2
+	vdivsd	%xmm5, %xmm4, %xmm4
+	vmulsd	%xmm7, %xmm2, %xmm3
+	vaddsd	%xmm0, %xmm0, %xmm2
+	vmulsd	.LC8(%rip), %xmm0, %xmm0
+	vmulsd	%xmm6, %xmm2, %xmm2
+	vmulsd	.LC7(%rip), %xmm2, %xmm2
+	vmulsd	%xmm7, %xmm3, %xmm3
+	vdivsd	%xmm5, %xmm0, %xmm0
+	vdivsd	%xmm5, %xmm4, %xmm4
+	vdivsd	%xmm1, %xmm2, %xmm2
+	call	printf
+	movq	%r14, %rdi
+	call	free
+	movq	%r12, %rdi
+	call	free
+	movq	%r13, %rdi
+	call	free
+	addq	$72, %rsp
+	movq	%r15, %rdi
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	.cfi_remember_state
+	.cfi_def_cfa 13, 0
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	leaq	-16(%r13), %rsp
+	.cfi_def_cfa 7, 16
+	popq	%r13
+	.cfi_def_cfa_offset 8
+	jmp	free
+	.p2align 4,,10
+	.p2align 3
+.L15:
+	.cfi_restore_state
+	xorl	%eax, %eax
+	jmp	.L9
+.L26:
+	vzeroupper
+	jmp	.L2
+.L14:
+	xorl	%eax, %eax
+	jmp	.L3
+	.cfi_endproc
+.LFE24:
+	.size	triad, .-triad
+	.section	.rodata.str1.8
+	.align 8
+.LC10:
+	.string	"TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
+	.align 8
+.LC11:
+	.string	"Size (KByte) |   runtime  |  MFlop/s  |  MB/s   |  MLUP/s | repeat | size"
+	.section	.text.startup,"ax",@progbits
+	.p2align 4,,15
+	.globl	main
+	.type	main, @function
+main:
+.LFB25:
+	.cfi_startproc
+	pushq	%rbx
+	.cfi_def_cfa_offset 16
+	.cfi_offset 3, -16
+	movl	$.LC10, %edi
+	movl	$20, %ebx
+	call	puts
+	movl	$.LC11, %edi
+	call	puts
+	.p2align 4,,10
+	.p2align 3
+.L28:
+	vxorpd	%xmm1, %xmm1, %xmm1
+	movq	.LC12(%rip), %rax
+	vcvtsi2sd	%ebx, %xmm1, %xmm1
+	addl	$1, %ebx
+	vmovq	%rax, %xmm0
+	call	pow
+	vcvttsd2si	%xmm0, %edi
+	call	triad
+	cmpl	$36, %ebx
+	jne	.L28
+	xorl	%eax, %eax
+	popq	%rbx
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+.LFE25:
+	.size	main, .-main
+	.section	.rodata.cst32,"aM",@progbits,32
+	.align 32
+.LC0:
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.section	.rodata.cst8,"aM",@progbits,8
+	.align 8
+.LC1:
+	.long	1907715710
+	.long	1048610426
+	.align 8
+.LC3:
+	.long	2576980378
+	.long	1070176665
+	.align 8
+.LC4:
+	.long	0
+	.long	1083129856
+	.align 8
+.LC5:
+	.long	0
+	.long	1077936128
+	.align 8
+.LC6:
+	.long	0
+	.long	1062207488
+	.align 8
+.LC7:
+	.long	2696277389
+	.long	1051772663
+	.align 8
+.LC8:
+	.long	0
+	.long	1075838976
+	.align 8
+.LC12:
+	.long	3435973837
+	.long	1073007820
+	.ident	"GCC: (GNU) 7.2.0"
+	.section	.note.GNU-stack,"",@progbits
--- a/tests/test_frontend.py
+++ b/tests/test_frontend.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Unit tests for OSACA Frontend
+"""
+
+import os
+import unittest
+
+from osaca.frontend import Frontend
+from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.semantics.hw_model import MachineModel
+from osaca.semantics.kernel_dg import KernelDG
+from osaca.semantics.semantics_appender import SemanticsAppender
+
+
+class TestFrontend(unittest.TestCase):
+    # osaca/data/ under the parent of the directory this test file lives in
+    MODULE_DATA_DIR = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'osaca/data/'
+    )
+
+    @classmethod
+    def setUpClass(cls):
+        """Parse both test kernels and run the semantics passes over them once."""
+        cls.parser_x86 = ParserX86ATT()
+        cls.parser_AArch64 = ParserAArch64v81()
+        with open(cls._find_file('kernel-x86.s')) as x86_file:
+            x86_source = x86_file.read()
+        with open(cls._find_file('kernel-AArch64.s')) as aarch64_file:
+            aarch64_source = aarch64_file.read()
+        cls.kernel_x86 = cls.parser_x86.parse_file(x86_source)
+        cls.kernel_AArch64 = cls.parser_AArch64.parse_file(aarch64_source)
+
+        # machine models (csx, tx2) and the ISA semantics appenders built on them
+        csx_yaml = os.path.join(cls.MODULE_DATA_DIR, 'csx.yml')
+        tx2_yaml = os.path.join(cls.MODULE_DATA_DIR, 'tx2.yml')
+        x86_isa_yaml = os.path.join(cls.MODULE_DATA_DIR, 'isa/x86.yml')
+        arm_isa_yaml = os.path.join(cls.MODULE_DATA_DIR, 'isa/aarch64.yml')
+        cls.machine_model_csx = MachineModel(path_to_yaml=csx_yaml)
+        cls.machine_model_tx2 = MachineModel(path_to_yaml=tx2_yaml)
+        cls.semantics_csx = SemanticsAppender(cls.machine_model_csx, path_to_yaml=x86_isa_yaml)
+        cls.semantics_tx2 = SemanticsAppender(cls.machine_model_tx2, path_to_yaml=arm_isa_yaml)
+
+        # run assign_src_dst and assign_tp_lt on every parsed line, x86 kernel first
+        for semantics, kernel in (
+            (cls.semantics_csx, cls.kernel_x86),
+            (cls.semantics_tx2, cls.kernel_AArch64),
+        ):
+            for instruction_form in kernel:
+                semantics.assign_src_dst(instruction_form)
+                semantics.assign_tp_lt(instruction_form)
+
+    ###########
+    # Tests
+    ###########
+
+    def test_frontend_creation(self):
+        csx_yaml = os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
+        missing_yaml = os.path.join(self.MODULE_DATA_DIR, 'THE_MACHINE.yml')
+        # ValueError is expected without any argument and with both arguments at once
+        self.assertRaises(ValueError, Frontend)
+        self.assertRaises(ValueError, Frontend, arch='csx', path_to_yaml=csx_yaml)
+        # FileNotFoundError is expected for an unknown YAML path and an unknown arch
+        self.assertRaises(FileNotFoundError, Frontend, path_to_yaml=missing_yaml)
+        self.assertRaises(FileNotFoundError, Frontend, arch='THE_MACHINE')
+        # arch='zen1' must construct without raising
+        Frontend(arch='zen1')
+
+    def test_frontend_x86(self):
+        graph = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
+        frontend = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml'))
+        frontend.print_throughput_analysis(self.kernel_x86, show_cmnts=False)
+        frontend.print_latency_analysis(graph.get_critical_path())
+
+    def test_frontend_AArch64(self):
+        graph = KernelDG(self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2)
+        frontend = Frontend(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml'))
+        frontend.print_full_analysis(self.kernel_AArch64, graph, verbose=True)
+
+    ##################
+    # Helper functions
+    ##################
+
+    @staticmethod
+    def _find_file(name):
+        """Return the path of ``name`` in the test_files directory; assert it exists."""
+        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
+        assert os.path.exists(path)
+        return path
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestFrontend)
+    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/test_kerncraftAPI.py
+++ b/tests/test_kerncraftAPI.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+Unit tests for OSACA Kerncraft API
+"""
+
+import os
+import unittest
+
+from collections import OrderedDict
+
+from osaca.api import KerncraftAPI
+from osaca.parser import ParserAArch64v81, ParserX86ATT
+
+
+class TestKerncraftAPI(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Create both parsers and load the marked triad assembly for each ISA."""
+        cls.parser_x86 = ParserX86ATT()
+        cls.parser_AArch64 = ParserAArch64v81()
+        with open(cls._find_file('triad-x86-iaca.s')) as x86_file:
+            cls.code_x86 = x86_file.read()
+        with open(cls._find_file('triad-arm-iaca.s')) as aarch64_file:
+            cls.code_AArch64 = aarch64_file.read()
+
+    ###########
+    # Tests
+    ###########
+
+    def test_kerncraft_API_x86(self):
+        api = KerncraftAPI('csx', self.code_x86)
+        api.create_output()
+
+        # no instruction may be reported as unmatched
+        self.assertEqual(api.get_unmatched_instruction_ratio(), 0.0)
+
+        # expected cycles per port for arch 'csx'; insertion order is part of the check
+        expected_ports = OrderedDict()
+        expected_ports['0'] = 1.25
+        expected_ports['0DV'] = 0.0
+        expected_ports['1'] = 1.25
+        expected_ports['2'] = 2.0
+        expected_ports['2D'] = 1.5
+        expected_ports['3'] = 2.0
+        expected_ports['3D'] = 1.5
+        expected_ports['4'] = 1.0
+        expected_ports['5'] = 0.75
+        expected_ports['6'] = 0.75
+        expected_ports['7'] = 0.0
+        self.assertEqual(api.get_port_occupation_cycles(), expected_ports)
+        self.assertEqual(api.get_total_throughput(), 2.0)
+        self.assertEqual(api.get_latency(), (1.0, 13.0))
+
+    def test_kerncraft_API_AArch64(self):
+        api = KerncraftAPI('tx2', self.code_AArch64)
+        api.create_output()
+
+        # no instruction may be reported as unmatched
+        self.assertEqual(api.get_unmatched_instruction_ratio(), 0.0)
+
+        # expected cycles per port for arch 'tx2'; insertion order is part of the check
+        expected_ports = OrderedDict()
+        expected_ports['0'] = 34.0
+        expected_ports['0DV'] = 0.0
+        expected_ports['1'] = 34.0
+        expected_ports['1DV'] = 0.0
+        expected_ports['2'] = 2.0
+        expected_ports['3'] = 64.0
+        expected_ports['4'] = 64.0
+        expected_ports['5'] = 32.0
+        self.assertEqual(api.get_port_occupation_cycles(), expected_ports)
+        self.assertEqual(api.get_total_throughput(), 64.0)
+        # TODO add missing latency values
+        # self.assertEqual(kapi.get_latency(kernel), 20.0)
+
+    ##################
+    # Helper functions
+    ##################
+
+    @staticmethod
+    def _find_file(name):
+        """Return the path of ``name`` in the test_files directory; assert it exists."""
+        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
+        assert os.path.exists(path)
+        return path
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestKerncraftAPI)
+    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/test_marker_utils.py
+++ b/tests/test_marker_utils.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+"""
+Unit tests for IACA/OSACA marker utilities
+"""
+import os
+import unittest
+
+from osaca.semantics import reduce_to_section
+from osaca.parser import ParserAArch64v81, ParserX86ATT
+
+
+class TestMarkerUtils(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.parser_AArch = ParserAArch64v81()
+        cls.parser_x86 = ParserX86ATT()
+        with open(cls._find_file('triad-arm-iaca.s')) as arm_file:
+            arm_source = arm_file.read()
+        with open(cls._find_file('triad-x86-iaca.s')) as x86_file:
+            x86_source = x86_file.read()
+        cls.parsed_AArch = cls.parser_AArch.parse_file(arm_source)
+        cls.parsed_x86 = cls.parser_x86.parse_file(x86_source)
+
+    #################
+    # Test
+    #################
+
+    def test_marker_detection_AArch64(self):  # marked kernel of triad-arm-iaca.s
+        kernel = reduce_to_section(self.parsed_AArch, 'AArch64')
+        self.assertEqual(len(kernel), 138)
+        self.assertEqual(kernel[0].line_number, 307)
+        self.assertEqual(kernel[-1].line_number, 444)
+
+    def test_marker_detection_x86(self):  # marked kernel of triad-x86-iaca.s
+        kernel = reduce_to_section(self.parsed_x86, 'x86')
+        self.assertEqual(len(kernel), 9)
+        self.assertEqual(kernel[0].line_number, 146)
+        self.assertEqual(kernel[-1].line_number, 154)
+
+    def test_marker_matching_AArch64(self):
+        # all ways of spreading the marker bytes 213,3,32,31 over one to four .byte directives
+        bytes_1_line = '.byte     213,3,32,31\n'
+        bytes_2_lines_1 = '.byte     213,3,32\n' + '.byte 31\n'
+        bytes_2_lines_2 = '.byte     213,3\n' + '.byte 32,31\n'
+        bytes_2_lines_3 = '.byte     213\n' + '.byte 3,32,31\n'
+        bytes_3_lines_1 = '.byte     213,3\n' + '.byte     32\n' + '.byte     31\n'
+        bytes_3_lines_2 = '.byte     213\n' + '.byte     3,32\n' + '.byte     31\n'
+        bytes_3_lines_3 = '.byte     213\n' + '.byte     3\n' + '.byte     32,31\n'
+        bytes_4_lines = '.byte     213\n' + '.byte     3\n' + '.byte     32\n' + '.byte     31\n'
+        mov_start_1 = 'mov      x1, #111\n'
+        mov_start_2 = 'mov      x1, 111  // should work as well\n'
+        mov_end_1 = 'mov      x1, #222 // preferred way\n'
+        mov_end_2 = 'mov      x1, 222\n'
+        prologue = (
+            'mov x12, xzr\n'
+            + '\tldp x9, x10, [sp, #16]      // 8-byte Folded Reload\n'
+            + '     .p2align    6\n'
+        )
+        kernel = (
+            '.LBB0_28:\n'
+            + 'fmul    v7.2d, v7.2d, v19.2d\n'
+            + 'stp q0, q1, [x10, #-32]\n'
+            + 'b.ne    .LBB0_28\n'
+        )
+        epilogue = '.LBB0_29:   //   Parent Loop BB0_20 Depth=1\n' + 'bl    dummy\n'
+        kernel_length = len(list(filter(None, kernel.split('\n'))))  # non-empty kernel lines
+
+        bytes_variations = [
+            bytes_1_line,
+            bytes_2_lines_1,
+            bytes_2_lines_2,
+            bytes_2_lines_3,
+            bytes_3_lines_1,
+            bytes_3_lines_2,
+            bytes_3_lines_3,
+            bytes_4_lines,
+        ]
+        mov_start_variations = [mov_start_1, mov_start_2]
+        mov_end_variations = [mov_end_1, mov_end_2]
+        # every start/end marker combination must reduce to exactly the kernel lines
+        for mov_start_var in mov_start_variations:
+            for bytes_var_1 in bytes_variations:
+                for mov_end_var in mov_end_variations:
+                    for bytes_var_2 in bytes_variations:
+                        sample_code = (
+                            prologue
+                            + mov_start_var
+                            + bytes_var_1
+                            + kernel
+                            + mov_end_var
+                            + bytes_var_2
+                            + epilogue
+                        )
+                        with self.subTest(
+                            mov_start=mov_start_var,
+                            bytes_start=bytes_var_1,
+                            mov_end=mov_end_var,
+                            bytes_end=bytes_var_2,
+                        ):
+                            sample_parsed = self.parser_AArch.parse_file(sample_code)
+                            sample_kernel = reduce_to_section(sample_parsed, 'AArch64')
+                            self.assertEqual(len(sample_kernel), kernel_length)
+                            kernel_start = len(  # number of non-empty lines before the kernel
+                                list(
+                                    filter(
+                                        None, (prologue + mov_start_var + bytes_var_1).split('\n')
+                                    )
+                                )
+                            )
+                            parsed_kernel = self.parser_AArch.parse_file(
+                                kernel, start_line=kernel_start
+                            )
+                            self.assertEqual(sample_kernel, parsed_kernel)
+
+    def test_marker_matching_x86(self):
+        # marker byte layouts (100,103,144); the prologue holds a bare 'movl $111' without bytes
+        bytes_1_line = '.byte     100,103,144\n'
+        bytes_2_lines_1 = '.byte     100,103\n' + '.byte 144\n'
+        bytes_2_lines_2 = '.byte     100\n' + '.byte 103,144\n'
+        bytes_3_lines = (
+            '.byte     100 # IACA MARKER UTILITY\n'
+            + '.byte     103 # IACA MARKER UTILITY\n'
+            + '.byte     144 # IACA MARKER UTILITY\n'
+        )
+        mov_start_1 = 'movl      $111, %ebx # IACA START\n'
+        mov_start_2 = 'mov      $111, %ebx # IACA START\n'
+        mov_end_1 = 'movl      $222, %ebx # IACA END\n'
+        mov_end_2 = 'mov      $222, %ebx # IACA END\n'
+        prologue = 'movl    -92(%rbp), %r11d\n' + 'movl      $111, %ebx\n'
+        kernel = (
+            'vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0\n'
+            + 'vmovsd  %xmm0, (%r14,%rcx,8)\n'
+            + 'cmpl    %ebx, %ecx\n'
+            + 'jge .L8\n'
+        )
+        epilogue = '.LE9:\t\t#12.2\n' + 'call    dummy\n'
+        kernel_length = len(list(filter(None, kernel.split('\n'))))  # non-empty kernel lines
+
+        bytes_variations = [bytes_1_line, bytes_2_lines_1, bytes_2_lines_2, bytes_3_lines]
+        mov_start_variations = [mov_start_1, mov_start_2]
+        mov_end_variations = [mov_end_1, mov_end_2]
+        # every start/end marker combination must reduce to exactly the kernel lines
+        for mov_start_var in mov_start_variations:
+            for bytes_var_1 in bytes_variations:
+                for mov_end_var in mov_end_variations:
+                    for bytes_var_2 in bytes_variations:
+                        sample_code = (
+                            prologue
+                            + mov_start_var
+                            + bytes_var_1
+                            + kernel
+                            + mov_end_var
+                            + bytes_var_2
+                            + epilogue
+                        )
+                        with self.subTest(
+                            mov_start=mov_start_var,
+                            bytes_start=bytes_var_1,
+                            mov_end=mov_end_var,
+                            bytes_end=bytes_var_2,
+                        ):
+                            sample_parsed = self.parser_x86.parse_file(sample_code)
+                            sample_kernel = reduce_to_section(sample_parsed, 'x86')
+                            self.assertEqual(len(sample_kernel), kernel_length)
+                            kernel_start = len(  # number of non-empty lines before the kernel
+                                list(
+                                    filter(
+                                        None, (prologue + mov_start_var + bytes_var_1).split('\n')
+                                    )
+                                )
+                            )
+                            parsed_kernel = self.parser_x86.parse_file(
+                                kernel, start_line=kernel_start
+                            )
+                            self.assertEqual(sample_kernel, parsed_kernel)
+
+    def test_marker_special_cases_AArch(self):  # edge cases of marker placement
+        bytes_line = '.byte     213,3,32,31\n'
+        mov_start = 'mov      x1, #111\n'
+        mov_end = 'mov      x1, #222\n'
+        prologue = 'dup v0.2d, x14\n' + '    neg x9, x9\n' + '    .p2align    6\n'
+        kernel = (
+            '.LBB0_28:\n'
+            + 'fmul    v7.2d, v7.2d, v19.2d\n'
+            + 'stp q0, q1, [x10, #-32]\n'
+            + 'b.ne    .LBB0_28\n'
+        )
+        epilogue = '.LBB0_29:   //   Parent Loop BB0_20 Depth=1\n' + 'bl    dummy\n'
+        kernel_length = len(list(filter(None, kernel.split('\n'))))
+
+        # marker directly at the beginning
+        code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
+        beginning_parsed = self.parser_AArch.parse_file(code_beginning)
+        test_kernel = reduce_to_section(beginning_parsed, 'AArch64')
+        self.assertEqual(len(test_kernel), kernel_length)
+        kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
+        parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
+        self.assertEqual(test_kernel, parsed_kernel)
+
+        # marker at the end (NOTE(review): epilogue still follows the end marker; confirm intent)
+        code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
+        end_parsed = self.parser_AArch.parse_file(code_end)
+        test_kernel = reduce_to_section(end_parsed, 'AArch64')
+        self.assertEqual(len(test_kernel), kernel_length)
+        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
+        parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
+        self.assertEqual(test_kernel, parsed_kernel)
+
+        # no kernel
+        code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
+        empty_parsed = self.parser_AArch.parse_file(code_empty)
+        test_kernel = reduce_to_section(empty_parsed, 'AArch64')
+        self.assertEqual(len(test_kernel), 0)
+        # directly adjacent start/end markers must reduce to an empty kernel
+        self.assertEqual(test_kernel, [])
+
+        # no start marker
+        code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
+        no_start_parsed = self.parser_AArch.parse_file(code_no_start)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_start_parsed, 'AArch64')
+
+        # no end marker
+        code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
+        no_end_parsed = self.parser_AArch.parse_file(code_no_end)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_end_parsed, 'AArch64')
+
+        # no marker at all
+        code_no_marker = prologue + kernel + epilogue
+        no_marker_parsed = self.parser_AArch.parse_file(code_no_marker)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_marker_parsed, 'AArch64')
+
+    def test_marker_special_cases_x86(self):  # edge cases of marker placement
+        bytes_line = '.byte     100\n.byte     103\n.byte     144\n'
+        mov_start = 'movl     $111, %ebx\n'
+        mov_end = 'movl     $222, %ebx\n'
+        prologue = 'movl    -88(%rbp), %r10d\n' + 'xorl    %r11d, %r11d\n' + '.p2align 4,,10\n'
+        kernel = (
+            '.L3: #L3\n'
+            + 'vmovsd  .LC1(%rip), %xmm0\n'
+            + 'vmovsd  %xmm0, (%r15,%rcx,8)\n'
+            + 'cmpl    %ecx, %ebx\n'
+            + 'jle .L3\n'
+        )
+        epilogue = 'leaq    -56(%rbp), %rsi\n' + 'movl    %r10d, -88(%rbp)\n' + 'call    timing\n'
+        kernel_length = len(list(filter(None, kernel.split('\n'))))
+
+        # marker directly at the beginning
+        code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
+        beginning_parsed = self.parser_x86.parse_file(code_beginning)
+        test_kernel = reduce_to_section(beginning_parsed, 'x86')
+        self.assertEqual(len(test_kernel), kernel_length)
+        kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
+        parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
+        self.assertEqual(test_kernel, parsed_kernel)
+
+        # marker at the end (NOTE(review): epilogue still follows the end marker; confirm intent)
+        code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
+        end_parsed = self.parser_x86.parse_file(code_end)
+        test_kernel = reduce_to_section(end_parsed, 'x86')
+        self.assertEqual(len(test_kernel), kernel_length)
+        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
+        parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
+        self.assertEqual(test_kernel, parsed_kernel)
+
+        # no kernel
+        code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
+        empty_parsed = self.parser_x86.parse_file(code_empty)
+        test_kernel = reduce_to_section(empty_parsed, 'x86')
+        self.assertEqual(len(test_kernel), 0)
+        # directly adjacent start/end markers must reduce to an empty kernel
+        self.assertEqual(test_kernel, [])
+
+        # no start marker
+        code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
+        no_start_parsed = self.parser_x86.parse_file(code_no_start)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_start_parsed, 'x86')
+
+        # no end marker
+        code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
+        no_end_parsed = self.parser_x86.parse_file(code_no_end)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_end_parsed, 'x86')
+
+        # no marker at all
+        code_no_marker = prologue + kernel + epilogue
+        no_marker_parsed = self.parser_x86.parse_file(code_no_marker)
+        with self.assertRaises(LookupError):
+            reduce_to_section(no_marker_parsed, 'x86')
+
+    ##################
+    # Helper functions
+    ##################
+
+    @staticmethod
+    def _find_file(name):
+        # Fixtures live in the 'test_files' directory next to this test module.
+        fixture = os.path.join(os.path.dirname(__file__), 'test_files', name)
+        assert os.path.exists(fixture)
+        return fixture
+
+
+if __name__ == '__main__':
+    loader, runner = unittest.TestLoader(), unittest.TextTestRunner(verbosity=2)
+    runner.run(loader.loadTestsFromTestCase(TestMarkerUtils))
--- a/tests/test_osaca.py
+++ b/tests/test_osaca.py
@@ -1,69 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from io import StringIO
-import os
-
-import unittest
-
-sys.path.insert(0, '..')
-from osaca import osaca
-
-
-class TestOsaca(unittest.TestCase):
-    maxDiff = None
-
-    def setUp(self):
-        self.curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
-
-    @unittest.skip("Binary analysis is error prone and currently not working with FSF's objdump")
-    def testIACABinary(self):
-        assembly = osaca.get_assembly_from_binary(self.curr_dir + '/testfiles/taxCalc-ivb-iaca')
-        osa = osaca.OSACA('IVB', assembly)
-        result = osa.generate_text_output()
-        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(self.curr_dir + '/test_osaca_iaca.out', encoding='utf-8') as f:
-            assertion = f.read()
-        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
-
-    # Test ASM file with IACA marker in two lines
-    def testIACAasm1(self):
-        with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca.S') as f:
-            osa = osaca.OSACA('IVB', f.read())
-        result = osa.generate_text_output()
-        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
-            assertion = f.read()
-        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
-
-    # Test ASM file with IACA marker in four lines
-    def testIACAasm2(self):
-        with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca2.S') as f:
-            osa = osaca.OSACA('IVB', f.read())
-        result = osa.generate_text_output()
-        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
-            assertion = f.read()
-        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
-
-    #@unittest.skip("Skip until required instructions are supported.")
-    def test_asm_API(self):
-        with open(self.curr_dir + '/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s') as f:
-            osa = osaca.OSACA('SKX', f.read())
-
-        text_output = osa.create_output()
-        print(text_output)
-        # Derived from IACA (and manually considering OSACAs equal distribution to ports)
-        self.assertEqual(dict(osa.get_port_occupation_cycles()),
-                         {'0': 4.0,
-                          '0DV': 0.0,
-                          '1': 3.5,
-                          '2': 3.5,
-                          '3': 3.5,
-                          '4': 1.0,
-                          '5': 4.5,
-                          '6': 3.5,
-                          '7': 0.0})
-        # TODO consider frontend bottleneck -> 6.25 cy
-        self.assertEqual(osa.get_total_throughput(),
-                         4.5)
--- a/tests/test_osaca_iaca.out
+++ b/tests/test_osaca_iaca.out
@@ -1,26 +0,0 @@
-Port Binding in Cycles Per Iteration:
-------------------------------------------------
-|  Port  |   0  |   1  |  2  |  3  |  4  |   5  |
-------------------------------------------------
-| Cycles | 3.67 | 5.67 | 1.0 | 1.0 | 2.0 | 3.67 |
-------------------------------------------------
-
-
-          Ports Pressure in cycles          
-|  0   |  1   |  2   |  3   |  4   |  5   |
-------------------------------------------
-| 0.50 | 0.50 |      |      |      |      | lea	1(%rax,%rax),%edx
-|      | 1.00 |      |      |      | 1.00 | vcvtsi2ss	%edx,%xmm2,%xmm2
-| 1.00 |      |      |      |      |      | vmulss	%xmm2,%xmm0,%xmm3
-| 0.50 | 0.50 |      |      |      |      | lea	2(%rax,%rax),%ecx
-|      | 1.00 |      |      |      |      | vaddss	%xmm3,%xmm1,%xmm4
-|      |      |      |      |      | 1.00 | vxorps	%xmm1,%xmm1,%xmm1
-|      | 1.00 |      |      |      | 1.00 | vcvtsi2ss	%ecx,%xmm1,%xmm1
-| 1.00 |      |      |      |      |      | vmulss	%xmm1,%xmm0,%xmm5
-|      |      | 0.50 | 0.50 | 1.00 |      | vmovss	%xmm4,4(%rsp,%rax,8)
-|      | 1.00 |      |      |      |      | vaddss	%xmm5,%xmm4,%xmm1
-|      |      | 0.50 | 0.50 | 1.00 |      | vmovss	%xmm1,8(%rsp,%rax,8)
-| 0.33 | 0.33 |      |      |      | 0.33 | inc	%rax
-| 0.33 | 0.33 |      |      |      | 0.33 | cmp	$499,%rax
-|      |      |      |      |      |      | X jb	main_98
-Total number of estimated throughput: 5.67
--- a/tests/test_osaca_iaca_asm.out
+++ b/tests/test_osaca_iaca_asm.out
@@ -1,26 +0,0 @@
-Port Binding in Cycles Per Iteration:
-------------------------------------------------
-|  Port  |   0  |   1  |  2  |  3  |  4  |   5  |
-------------------------------------------------
-| Cycles | 3.67 | 5.67 | 1.0 | 1.0 | 2.0 | 3.67 |
-------------------------------------------------
-
-
-          Ports Pressure in cycles          
-|  0   |  1   |  2   |  3   |  4   |  5   |
-------------------------------------------
-| 0.50 | 0.50 |      |      |      |      | lea       1(%rax,%rax), %edx
-|      | 1.00 |      |      |      | 1.00 | vcvtsi2ss %edx, %xmm2, %xmm2
-| 1.00 |      |      |      |      |      | vmulss    %xmm2, %xmm0, %xmm3
-| 0.50 | 0.50 |      |      |      |      | lea       2(%rax,%rax), %ecx
-|      | 1.00 |      |      |      |      | vaddss    %xmm3, %xmm1, %xmm4
-|      |      |      |      |      | 1.00 | vxorps    %xmm1, %xmm1, %xmm1
-|      | 1.00 |      |      |      | 1.00 | vcvtsi2ss %ecx, %xmm1, %xmm1
-| 1.00 |      |      |      |      |      | vmulss    %xmm1, %xmm0, %xmm5
-|      |      | 0.50 | 0.50 | 1.00 |      | vmovss    %xmm4, 4(%rsp,%rax,8)
-|      | 1.00 |      |      |      |      | vaddss    %xmm5, %xmm4, %xmm1
-|      |      | 0.50 | 0.50 | 1.00 |      | vmovss    %xmm1, 8(%rsp,%rax,8)
-| 0.33 | 0.33 |      |      |      | 0.33 | incq      %rax
-| 0.33 | 0.33 |      |      |      | 0.33 | cmpq      $499, %rax
-|      |      |      |      |      |      | jb        ..B1.4
-Total number of estimated throughput: 5.67
--- a/tests/test_parser_AArch64v81.py
+++ b/tests/test_parser_AArch64v81.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+"""
+Unit tests for ARMv8 AArch64 assembly parser
+"""
+
+import os
+import unittest
+
+from pyparsing import ParseException
+
+from osaca.parser import AttrDict, ParserAArch64v81
+
+
+class TestParserAArch64v81(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.parser = ParserAArch64v81()
+        with open(cls._find_file('triad-arm-iaca.s')) as triad_file:
+            cls.triad_code = triad_file.read()
+
+    ##################
+    # Test
+    ##################
+
+    def test_comment_parser(self):
+        cases = [
+            ('// some comments', 'some comments'),
+            ('\t\t//AA BB CC \t end \t', 'AA BB CC end'),
+            ('\t//// comment //// comment', '// comment //// comment'),
+        ]
+        # (raw line, expected comment text) pairs, checked in order
+        for raw, expected in cases:
+            self.assertEqual(self._get_comment(self.parser, raw), expected)
+
+    def test_label_parser(self):
+        # plain labels: the parsed name is the text in front of the colon
+        for name in ('main', '..B1.10', '.2.3_2_pack.3'):
+            self.assertEqual(self._get_label(self.parser, name + ':').name, name)
+        # label followed by a comment: name and comment are both checked
+        commented = '.L1:\t\t\t//label1'
+        self.assertEqual(self._get_label(self.parser, commented).name, '.L1')
+        self.assertEqual(' '.join(self._get_label(self.parser, commented).comment), 'label1')
+        with self.assertRaises(ParseException):
+            self._get_label(self.parser, '\t.cfi_startproc')
+
+    def test_directive_parser(self):
+        # Each assertion parses its input afresh and checks exactly one property of the result.
+        # The first two inputs are read via attribute access, the last one via item access.
+        text_line = '\t.text'
+        align_line = '\t.align\t16,0x90'
+        byte_line = '        .byte 100,103,144       //IACA START'
+
+        # directive without parameters
+        parsed = self._get_directive(self.parser, text_line)
+        self.assertEqual(parsed.name, 'text')
+        parsed = self._get_directive(self.parser, text_line)
+        self.assertEqual(len(parsed.parameters), 0)
+
+        # directive with two comma-separated parameters
+        parsed = self._get_directive(self.parser, align_line)
+        self.assertEqual(parsed.name, 'align')
+        parsed = self._get_directive(self.parser, align_line)
+        self.assertEqual(len(parsed.parameters), 2)
+        parsed = self._get_directive(self.parser, align_line)
+        self.assertEqual(parsed.parameters[1], '0x90')
+
+        # directive with leading blanks, three parameters and a trailing comment
+        parsed = self._get_directive(self.parser, byte_line)
+        self.assertEqual(parsed['name'], 'byte')
+        parsed = self._get_directive(self.parser, byte_line)
+        self.assertEqual(parsed['parameters'][2], '144')
+        parsed = self._get_directive(self.parser, byte_line)
+        self.assertEqual(' '.join(parsed['comment']), 'IACA START')
+
+    def test_parse_instruction(self):  # field-level checks on seven parsed instructions
+        instr1 = '\t\tvcvt.F32.S32 w1, w2\t\t\t//12.27'  # dotted mnemonic, tabs, comment
+        instr2 = 'b.lo        ..B1.4 \t'  # branch to a label, trailing blanks, no comment
+        instr3 = '        mov x2,#0x222          //NOT IACA END'  # hex immediate
+        instr4 = 'str x28, [sp, x1, lsl #4] //12.9'  # index register, lsl #4 -> scale 16
+        instr5 = 'ldr x0, [x0, #:got_lo12:q2c]'  # relocation inside the memory offset
+        instr6 = 'adrp    x0, :got:visited'  # relocation on a plain identifier operand
+        instr7 = 'fadd    v17.2d, v16.2d, v1.2d'  # vector registers with lanes and shape
+
+        parsed_1 = self.parser.parse_instruction(instr1)
+        parsed_2 = self.parser.parse_instruction(instr2)
+        parsed_3 = self.parser.parse_instruction(instr3)
+        parsed_4 = self.parser.parse_instruction(instr4)
+        parsed_5 = self.parser.parse_instruction(instr5)
+        parsed_6 = self.parser.parse_instruction(instr6)
+        parsed_7 = self.parser.parse_instruction(instr7)
+
+        self.assertEqual(parsed_1.instruction, 'vcvt.F32.S32')
+        self.assertEqual(parsed_1.operands[0].register.name, '1')
+        self.assertEqual(parsed_1.operands[0].register.prefix, 'w')
+        self.assertEqual(parsed_1.operands[1].register.name, '2')
+        self.assertEqual(parsed_1.operands[1].register.prefix, 'w')
+        self.assertEqual(parsed_1.comment, '12.27')
+
+        self.assertEqual(parsed_2.instruction, 'b.lo')
+        self.assertEqual(parsed_2.operands[0].identifier.name, '..B1.4')
+        self.assertEqual(len(parsed_2.operands), 1)
+        self.assertIsNone(parsed_2.comment)
+
+        self.assertEqual(parsed_3.instruction, 'mov')
+        self.assertEqual(parsed_3.operands[0].register.name, '2')
+        self.assertEqual(parsed_3.operands[0].register.prefix, 'x')
+        self.assertEqual(parsed_3.operands[1].immediate.value, '0x222')
+        self.assertEqual(parsed_3.comment, 'NOT IACA END')
+
+        self.assertEqual(parsed_4.instruction, 'str')
+        self.assertIsNone(parsed_4.operands[1].memory.offset)
+        self.assertEqual(parsed_4.operands[1].memory.base.name, 'sp')
+        self.assertEqual(parsed_4.operands[1].memory.base.prefix, 'x')
+        self.assertEqual(parsed_4.operands[1].memory.index.name, '1')
+        self.assertEqual(parsed_4.operands[1].memory.index.prefix, 'x')
+        self.assertEqual(parsed_4.operands[1].memory.scale, 16)
+        self.assertEqual(parsed_4.operands[0].register.name, '28')
+        self.assertEqual(parsed_4.operands[0].register.prefix, 'x')
+        self.assertEqual(parsed_4.comment, '12.9')
+
+        self.assertEqual(parsed_5.instruction, 'ldr')
+        self.assertEqual(parsed_5.operands[0].register.name, '0')
+        self.assertEqual(parsed_5.operands[0].register.prefix, 'x')
+        self.assertEqual(parsed_5.operands[1].memory.offset.identifier.name, 'q2c')
+        self.assertEqual(parsed_5.operands[1].memory.offset.identifier.relocation, ':got_lo12:')
+        self.assertEqual(parsed_5.operands[1].memory.base.name, '0')
+        self.assertEqual(parsed_5.operands[1].memory.base.prefix, 'x')
+        self.assertIsNone(parsed_5.operands[1].memory.index)
+        self.assertEqual(parsed_5.operands[1].memory.scale, 1)
+
+        self.assertEqual(parsed_6.instruction, 'adrp')
+        self.assertEqual(parsed_6.operands[0].register.name, '0')
+        self.assertEqual(parsed_6.operands[0].register.prefix, 'x')
+        self.assertEqual(parsed_6.operands[1].identifier.relocation, ':got:')
+        self.assertEqual(parsed_6.operands[1].identifier.name, 'visited')
+
+        self.assertEqual(parsed_7.instruction, 'fadd')
+        self.assertEqual(parsed_7.operands[0].register.name, '17')
+        self.assertEqual(parsed_7.operands[0].register.prefix, 'v')
+        self.assertEqual(parsed_7.operands[0].register.lanes, '2')
+        self.assertEqual(parsed_7.operands[0].register.shape, 'd')
+        self.assertEqual(self.parser.get_full_reg_name(parsed_7.operands[2].register), 'v1.2d')
+
+    def test_parse_line(self):  # one line of each kind, compared against its complete dict
+        line_comment = '// -- Begin  main'
+        line_label = '.LBB0_1:              // =>This Inner Loop Header: Depth=1'
+        line_directive = '\t.cfi_def_cfa w29, -16'
+        line_instruction = '\tldr s0, [x11, w10, sxtw #2]\t\t// = <<2'  # shift #2 -> scale 4
+        line_prefetch = 'prfm    pldl1keep, [x26, #2048] //HPL'
+        line_preindexed = 'stp x29, x30, [sp, #-16]!'  # expected to set 'pre_indexed'
+        line_postindexed = 'ldp q2, q3, [x11], #64'  # expected to set 'post_indexed'
+
+        instruction_form_1 = {
+            'instruction': None,
+            'operands': None,
+            'directive': None,
+            'comment': '-- Begin main',  # the double blank of the input is expected collapsed
+            'label': None,
+            'line': '// -- Begin  main',
+            'line_number': 1,
+        }
+
+        instruction_form_2 = {
+            'instruction': None,
+            'operands': None,
+            'directive': None,
+            'comment': '=>This Inner Loop Header: Depth=1',
+            'label': '.LBB0_1',
+            'line': '.LBB0_1:              // =>This Inner Loop Header: Depth=1',
+            'line_number': 2,
+        }
+        instruction_form_3 = {
+            'instruction': None,
+            'operands': None,
+            'directive': {'name': 'cfi_def_cfa', 'parameters': ['w29', '-16']},
+            'comment': None,
+            'label': None,
+            'line': '.cfi_def_cfa w29, -16',  # the leading tab of the input is expected dropped
+            'line_number': 3,
+        }
+        instruction_form_4 = {
+            'instruction': 'ldr',
+            'operands': [
+                {'register': {'prefix': 's', 'name': '0'}},
+                {
+                    'memory': {
+                        'offset': None,
+                        'base': {'prefix': 'x', 'name': '11'},
+                        'index': {
+                            'prefix': 'w',
+                            'name': '10',
+                            'shift_op': 'sxtw',
+                            'shift': {'value': '2'},
+                        },
+                        'scale': 4,
+                    }
+                },
+            ],
+            'directive': None,
+            'comment': '= <<2',
+            'label': None,
+            'line': 'ldr s0, [x11, w10, sxtw #2]\t\t// = <<2',
+            'line_number': 4,
+        }
+        instruction_form_5 = {
+            'instruction': 'prfm',
+            'operands': [
+                {'prfop': {'type': ['PLD'], 'target': ['L1'], 'policy': ['KEEP']}},
+                {
+                    'memory': {
+                        'offset': {'value': '2048'},
+                        'base': {'prefix': 'x', 'name': '26'},
+                        'index': None,
+                        'scale': 1,
+                    }
+                },
+            ],
+            'directive': None,
+            'comment': 'HPL',
+            'label': None,
+            'line': 'prfm    pldl1keep, [x26, #2048] //HPL',
+            'line_number': 5,
+        }
+        instruction_form_6 = {
+            'instruction': 'stp',
+            'operands': [
+                {'register': {'prefix': 'x', 'name': '29'}},
+                {'register': {'prefix': 'x', 'name': '30'}},
+                {
+                    'memory': {
+                        'offset': {'value': '-16'},
+                        'base': {'name': 'sp', 'prefix': 'x'},
+                        'index': None,
+                        'scale': 1,
+                        'pre_indexed': True,
+                    }
+                },
+            ],
+            'directive': None,
+            'comment': None,
+            'label': None,
+            'line': 'stp x29, x30, [sp, #-16]!',
+            'line_number': 6,
+        }
+        instruction_form_7 = {
+            'instruction': 'ldp',
+            'operands': [
+                {'register': {'prefix': 'q', 'name': '2'}},
+                {'register': {'prefix': 'q', 'name': '3'}},
+                {
+                    'memory': {
+                        'offset': None,
+                        'base': {'prefix': 'x', 'name': '11'},
+                        'index': None,
+                        'scale': 1,
+                        'post_indexed': {'value': '64'},
+                    }
+                },
+            ],
+            'directive': None,
+            'comment': None,
+            'label': None,
+            'line': 'ldp q2, q3, [x11], #64',
+            'line_number': 7,
+        }
+        parsed_1 = self.parser.parse_line(line_comment, 1)
+        parsed_2 = self.parser.parse_line(line_label, 2)
+        parsed_3 = self.parser.parse_line(line_directive, 3)
+        parsed_4 = self.parser.parse_line(line_instruction, 4)
+        parsed_5 = self.parser.parse_line(line_prefetch, 5)
+        parsed_6 = self.parser.parse_line(line_preindexed, 6)
+        parsed_7 = self.parser.parse_line(line_postindexed, 7)
+
+        self.assertEqual(parsed_1, instruction_form_1)
+        self.assertEqual(parsed_2, instruction_form_2)
+        self.assertEqual(parsed_3, instruction_form_3)
+        self.assertEqual(parsed_4, instruction_form_4)
+        self.assertEqual(parsed_5, instruction_form_5)
+        self.assertEqual(parsed_6, instruction_form_6)
+        self.assertEqual(parsed_7, instruction_form_7)
+
+    def test_parse_file(self):
+        """Parse the whole triad kernel and check its first line number and its length."""
+        instruction_forms = self.parser.parse_file(self.triad_code)
+        first_form = instruction_forms[0]
+        self.assertEqual(first_form.line_number, 1)
+        self.assertEqual(len(instruction_forms), 645)
+
+    def test_normalize_imd(self):
+        """Equivalent immediates normalize to one value; an identifier comes back unchanged."""
+        normalize = self.parser.normalize_imd
+
+        # Decimal and hexadecimal spellings of the same integers.
+        dec_79, hex_79 = {'value': '79'}, {'value': '0x4f'}
+        dec_8, hex_8 = {'value': '8'}, {'value': '0x8'}
+        reference = normalize(dec_79)
+        self.assertEqual(reference, normalize(hex_79))
+        self.assertEqual(normalize(dec_8), normalize(hex_8))
+
+        # Two mantissa/exponent spellings per floating-point kind, all expected to equal 79.
+        for kind in ('float', 'double'):
+            for mantissa, e_sign, exponent in (('0.79', '+', '2'), ('790.0', '-', '1')):
+                fp_imd = {kind: {'mantissa': mantissa, 'e_sign': e_sign, 'exponent': exponent}}
+                self.assertEqual(normalize(fp_imd), reference)
+
+        # An identifier operand is expected to pass through untouched.
+        label_ref = {'identifier': {'name': '..B1.4'}}
+        self.assertEqual(normalize(label_ref), label_ref)
+
+    def test_multiple_regs(self):
+        """A register range and a register list in braces each parse into one operand."""
+        expected_range = AttrDict(
+            {
+                'register': {
+                    'range': [{'prefix': 'r', 'name': '5'}, {'prefix': 'r', 'name': '7'}],
+                    'index': None,
+                }
+            }
+        )
+        expected_list = AttrDict(
+            {
+                'register': {
+                    'list': [{'prefix': 'r', 'name': name} for name in ('5', '7', '9')],
+                    'index': None,
+                }
+            }
+        )
+
+        parsed_range = self.parser.parse_line('PUSH {r5-r7}')
+        parsed_list = self.parser.parse_line('POP {r5, r7, r9}')
+
+        self.assertEqual(parsed_range.operands[0], expected_range)
+        self.assertEqual(parsed_list.operands[0], expected_list)
+
+    def test_reg_dependency(self):
+        """Check ``is_reg_dependend_of`` for every pairing of the registers defined below.
+
+        Registers within ``reg_1``, ``reg_2`` or ``reg_v`` are expected to depend on every
+        member of their own group; each register in ``reg_others`` only on itself.
+        """
+        reg_1_1 = AttrDict({'prefix': 'b', 'name': '1'})
+        reg_1_2 = AttrDict({'prefix': 'h', 'name': '1'})
+        reg_1_3 = AttrDict({'prefix': 's', 'name': '1'})
+        reg_1_4 = AttrDict({'prefix': 'd', 'name': '1'})
+        # Needs its own name: assigning to reg_1_4 again would drop the 'd' register above.
+        reg_1_5 = AttrDict({'prefix': 'q', 'name': '1'})
+        reg_2_1 = AttrDict({'prefix': 'w', 'name': '2'})
+        reg_2_2 = AttrDict({'prefix': 'x', 'name': '2'})
+        reg_v1_1 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '16', 'shape': 'b'})
+        reg_v1_2 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '8', 'shape': 'h'})
+        reg_v1_3 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '4', 'shape': 's'})
+        reg_v1_4 = AttrDict({'prefix': 'v', 'name': '11', 'lanes': '2', 'shape': 'd'})
+
+        reg_b5 = AttrDict({'prefix': 'b', 'name': '5'})
+        reg_q15 = AttrDict({'prefix': 'q', 'name': '15'})
+        reg_v10 = AttrDict({'prefix': 'v', 'name': '10', 'lanes': '2', 'shape': 's'})
+        reg_v20 = AttrDict({'prefix': 'v', 'name': '20', 'lanes': '2', 'shape': 'd'})
+
+        reg_1 = [reg_1_1, reg_1_2, reg_1_3, reg_1_4, reg_1_5]
+        reg_2 = [reg_2_1, reg_2_2]
+        reg_v = [reg_v1_1, reg_v1_2, reg_v1_3, reg_v1_4]
+        reg_others = [reg_b5, reg_q15, reg_v10, reg_v20]
+        regs = reg_1 + reg_2 + reg_v + reg_others
+
+        # test each register against each other:
+        # members of one group must depend on exactly the members of that group
+        for group in (reg_1, reg_2, reg_v):
+            for ri in group:
+                for rj in regs:
+                    assert_value = rj in group
+                    with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
+                        self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
+        # the remaining registers must depend on nothing but themselves
+        for ri in reg_others:
+            for rj in regs:
+                assert_value = rj == ri
+                with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
+                    self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
+
+    ##################
+    # Helper functions
+    ##################
+    def _get_comment(self, parser, comment):
+        """Parse ``comment`` with ``parser.comment`` and return its tokens joined by spaces."""
+        parse_result = parser.comment.parseString(comment, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        comment_tokens = AttrDict.convert_dict(processed).comment
+        return ' '.join(comment_tokens)
+
+    def _get_label(self, parser, label):
+        """Parse ``label`` with ``parser.label`` and return the resulting ``label`` entry."""
+        parse_result = parser.label.parseString(label, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        return AttrDict.convert_dict(processed).label
+
+    def _get_directive(self, parser, directive):
+        """Parse ``directive`` with ``parser.directive`` and return the ``directive`` entry."""
+        parse_result = parser.directive.parseString(directive, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        return AttrDict.convert_dict(processed).directive
+
+    @staticmethod
+    def _find_file(name):
+        """Return the path of ``name`` in the ``test_files`` directory beside this module.
+
+        An ``assert`` checks that the path exists before it is returned.
+        """
+        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
+        assert os.path.exists(path)
+        return path
+
+
+if __name__ == '__main__':
+    # Executed as a script: run just this test case with verbosity level 2.
+    aarch64_tests = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64v81)
+    runner = unittest.TextTestRunner(verbosity=2)
+    runner.run(aarch64_tests)
--- a/tests/test_parser_x86att.py
+++ b/tests/test_parser_x86att.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+Unit tests for x86 AT&T assembly parser
+"""
+
+import os
+import unittest
+
+from pyparsing import ParseException
+
+from osaca.parser import AttrDict, ParserX86ATT
+
+
+class TestParserX86ATT(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Create the shared AT&T parser and read the triad kernel once for all tests."""
+        cls.parser = ParserX86ATT()
+        with open(cls._find_file('triad-x86-iaca.s')) as f:
+            cls.triad_code = f.read()
+
+    ##################
+    # Test
+    ##################
+
+    def test_comment_parser(self):
+        """Comment text comes back without the first '#' and with whitespace collapsed."""
+        expectations = (
+            ('# some comments', 'some comments'),
+            ('\t\t#AA BB CC \t end \t', 'AA BB CC end'),
+            ('\t## comment ## comment', '# comment ## comment'),
+        )
+        for raw_comment, expected_text in expectations:
+            self.assertEqual(self._get_comment(self.parser, raw_comment), expected_text)
+
+    def test_label_parser(self):
+        """Labels yield their name (and optional comment); a directive line is rejected."""
+        for label_line, expected_name in (
+            ('main:', 'main'),
+            ('..B1.10:', '..B1.10'),
+            ('.2.3_2_pack.3:', '.2.3_2_pack.3'),
+            ('.L1:\t\t\t#label1', '.L1'),
+        ):
+            self.assertEqual(self._get_label(self.parser, label_line).name, expected_name)
+
+        commented_label = self._get_label(self.parser, '.L1:\t\t\t#label1')
+        self.assertEqual(' '.join(commented_label.comment), 'label1')
+
+        # A directive must not be accepted by the label grammar.
+        with self.assertRaises(ParseException):
+            self._get_label(self.parser, '\t.cfi_startproc')
+
+    def test_directive_parser(self):
+        """Directives expose their name, their parameters and an optional trailing comment."""
+        # Directive without parameters.
+        text_directive = self._get_directive(self.parser, '\t.text')
+        self.assertEqual(text_directive.name, 'text')
+        self.assertEqual(len(text_directive.parameters), 0)
+
+        # Directive with two comma-separated parameters.
+        align_directive = self._get_directive(self.parser, '\t.align\t16,0x90')
+        self.assertEqual(align_directive.name, 'align')
+        self.assertEqual(len(align_directive.parameters), 2)
+        self.assertEqual(align_directive.parameters[1], '0x90')
+
+        # Directive followed by a comment; fields are read by key here.
+        byte_directive = self._get_directive(
+            self.parser, '        .byte 100,103,144       #IACA START'
+        )
+        self.assertEqual(byte_directive['name'], 'byte')
+        self.assertEqual(byte_directive['parameters'][2], '144')
+        self.assertEqual(' '.join(byte_directive['comment']), 'IACA START')
+
+    def test_parse_instruction(self):
+        """Parse seven AT&T instruction lines and check mnemonic, operands and comment."""
+        parse = self.parser.parse_instruction
+
+        # All lines are parsed before the first field is inspected.
+        convert = parse('\t\tvcvtsi2ss %edx, %xmm2, %xmm2\t\t\t#12.27')
+        jump = parse('jb        ..B1.4 \t')
+        move_imd = parse('        movl $222,%ebx          #IACA END')
+        store = parse('vmovss    %xmm4, -4(%rsp,%rax,8) #12.9')
+        move_var = parse('mov %ebx,var(,1)')
+        lea_index = parse('lea (,%rax,8),%rbx')
+        insert = parse('vinsertf128 $0x1, %xmm0, %ymm1, %ymm1')
+
+        # register operands plus trailing comment
+        self.assertEqual(convert.instruction, 'vcvtsi2ss')
+        self.assertEqual(convert.operands[0].register.name, 'edx')
+        self.assertEqual(convert.operands[1].register.name, 'xmm2')
+        self.assertEqual(convert.comment, '12.27')
+
+        # single identifier operand, no comment
+        self.assertEqual(jump.instruction, 'jb')
+        self.assertEqual(jump.operands[0].identifier.name, '..B1.4')
+        self.assertEqual(len(jump.operands), 1)
+        self.assertIsNone(jump.comment)
+
+        # immediate source operand
+        self.assertEqual(move_imd.instruction, 'movl')
+        self.assertEqual(move_imd.operands[0].immediate.value, '222')
+        self.assertEqual(move_imd.operands[1].register.name, 'ebx')
+        self.assertEqual(move_imd.comment, 'IACA END')
+
+        # memory operand with offset, base, index and scale
+        self.assertEqual(store.instruction, 'vmovss')
+        store_target = store.operands[1].memory
+        self.assertEqual(store_target.offset.value, '-4')
+        self.assertEqual(store_target.base.name, 'rsp')
+        self.assertEqual(store_target.index.name, 'rax')
+        self.assertEqual(store_target.scale, 8)
+        self.assertEqual(store.operands[0].register.name, 'xmm4')
+        self.assertEqual(store.comment, '12.9')
+
+        # memory operand addressed by an identifier, without base and index
+        self.assertEqual(move_var.instruction, 'mov')
+        var_target = move_var.operands[1].memory
+        self.assertEqual(var_target.offset.identifier.name, 'var')
+        self.assertIsNone(var_target.base)
+        self.assertIsNone(var_target.index)
+        self.assertEqual(var_target.scale, 1)
+        self.assertEqual(move_var.operands[0].register.name, 'ebx')
+
+        # memory operand consisting of index and scale only
+        self.assertEqual(lea_index.instruction, 'lea')
+        lea_source = lea_index.operands[0].memory
+        self.assertIsNone(lea_source.offset)
+        self.assertIsNone(lea_source.base)
+        self.assertEqual(lea_source.index.name, 'rax')
+        self.assertEqual(lea_source.scale, 8)
+        self.assertEqual(lea_index.operands[1].register.name, 'rbx')
+
+        # four operands with a leading hexadecimal immediate
+        self.assertEqual(insert.operands[0].immediate.value, '0x1')
+        self.assertEqual(insert.operands[1].register.name, 'xmm0')
+        self.assertEqual(insert.operands[2].register.name, 'ymm1')
+        self.assertEqual(insert.operands[3].register.name, 'ymm1')
+
+    def test_parse_line(self):
+        """``parse_line`` returns a complete instruction form for each kind of source line."""
+
+        def expected_form(line, line_number, **fields):
+            # All-None form for ``line``; ``fields`` overrides the entries that are set.
+            form = {
+                'instruction': None,
+                'operands': None,
+                'directive': None,
+                'comment': None,
+                'label': None,
+                'line': line,
+                'line_number': line_number,
+            }
+            form.update(fields)
+            return form
+
+        expected_comment = expected_form('# -- Begin  main', 1, comment='-- Begin main')
+        expected_label = expected_form(
+            '..B1.7:                         # Preds ..B1.6',
+            2,
+            comment='Preds ..B1.6',
+            label='..B1.7',
+        )
+        # The leading tabs of the two inputs below are absent from the expected 'line'.
+        expected_directive = expected_form(
+            '.quad   .2.3_2__kmpc_loc_pack.2 #qed',
+            3,
+            directive={'name': 'quad', 'parameters': ['.2.3_2__kmpc_loc_pack.2']},
+            comment='qed',
+        )
+        expected_instruction = expected_form(
+            'lea       2(%rax,%rax), %ecx #12.9',
+            4,
+            instruction='lea',
+            operands=[
+                {
+                    'memory': {
+                        'offset': {'value': '2'},
+                        'base': {'name': 'rax'},
+                        'index': {'name': 'rax'},
+                        'scale': 1,
+                    }
+                },
+                {'register': {'name': 'ecx'}},
+            ],
+            comment='12.9',
+        )
+
+        parsed_comment = self.parser.parse_line('# -- Begin  main', 1)
+        parsed_label = self.parser.parse_line(
+            '..B1.7:                         # Preds ..B1.6', 2
+        )
+        parsed_directive = self.parser.parse_line('\t\t.quad   .2.3_2__kmpc_loc_pack.2 #qed', 3)
+        parsed_instruction = self.parser.parse_line('\t\tlea       2(%rax,%rax), %ecx #12.9', 4)
+
+        self.assertEqual(parsed_comment, expected_comment)
+        self.assertEqual(parsed_label, expected_label)
+        self.assertEqual(parsed_directive, expected_directive)
+        self.assertEqual(parsed_instruction, expected_instruction)
+
+    def test_parse_file(self):
+        """Parse the whole triad kernel and check its first line number and its length."""
+        instruction_forms = self.parser.parse_file(self.triad_code)
+        first_form = instruction_forms[0]
+        self.assertEqual(first_form.line_number, 1)
+        self.assertEqual(len(instruction_forms), 353)
+
+    def test_parse_register(self):
+        """'%'-prefixed names parse to a register dict; a name without '%' gives ``None``."""
+        for reg_name in ('rax', 'r9', 'xmm1', 'rip'):
+            self.assertEqual(
+                self.parser.parse_register('%' + reg_name), {'register': {'name': reg_name}}
+            )
+        self.assertIsNone(self.parser.parse_register('rax'))
+
+    def test_normalize_imd(self):
+        """Decimal and hexadecimal spellings of one integer normalize to the same value."""
+        normalize = self.parser.normalize_imd
+        for decimal_str, hex_str in (('79', '0x4f'), ('8', '0x8')):
+            self.assertEqual(normalize({'value': decimal_str}), normalize({'value': hex_str}))
+
+    def test_reg_dependency(self):
+        """Check ``is_reg_dependend_of`` for every pairing of the registers defined below."""
+        reg_a = [AttrDict({'name': name}) for name in ('rax', 'eax', 'ax', 'al')]
+        reg_r = [AttrDict({'name': name}) for name in ('r11', 'r11b', 'r11d', 'r11w')]
+        reg_vec_1 = [AttrDict({'name': name}) for name in ('xmm1', 'ymm1', 'zmm1')]
+        reg_others = [AttrDict({'name': name}) for name in ('rbx', 'r15', 'xmm2', 'ymm3')]
+        regs = reg_a + reg_r + reg_vec_1 + reg_others
+
+        # Pair every register under test with the registers it is expected to depend on:
+        # members of one group depend on that whole group ...
+        expectations = [(ri, group) for group in (reg_a, reg_r, reg_vec_1) for ri in group]
+        # ... while each of the remaining registers depends on itself only.
+        expectations += [(ri, [ri]) for ri in reg_others]
+
+        # test each register against each other
+        for ri, dependent_regs in expectations:
+            for rj in regs:
+                assert_value = rj in dependent_regs
+                with self.subTest(reg_a=ri, reg_b=rj, assert_val=assert_value):
+                    self.assertEqual(self.parser.is_reg_dependend_of(ri, rj), assert_value)
+
+    ##################
+    # Helper functions
+    ##################
+    def _get_comment(self, parser, comment):
+        """Parse ``comment`` with ``parser.comment`` and return its tokens joined by spaces."""
+        parse_result = parser.comment.parseString(comment, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        comment_tokens = AttrDict.convert_dict(processed).comment
+        return ' '.join(comment_tokens)
+
+    def _get_label(self, parser, label):
+        """Parse ``label`` with ``parser.label`` and return the resulting ``label`` entry."""
+        parse_result = parser.label.parseString(label, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        return AttrDict.convert_dict(processed).label
+
+    def _get_directive(self, parser, directive):
+        """Parse ``directive`` with ``parser.directive`` and return the ``directive`` entry."""
+        parse_result = parser.directive.parseString(directive, parseAll=True)
+        processed = parser.process_operand(parse_result.asDict())
+        return AttrDict.convert_dict(processed).directive
+
+    @staticmethod
+    def _find_file(name):
+        """Return the path of ``name`` in the ``test_files`` directory beside this module.
+
+        An ``assert`` checks that the path exists before it is returned.
+        """
+        path = os.path.join(os.path.dirname(__file__), 'test_files', name)
+        assert os.path.exists(path)
+        return path
+
+
+if __name__ == '__main__':
+    # Executed as a script: run just this test case with verbosity level 2.
+    x86_tests = unittest.TestLoader().loadTestsFromTestCase(TestParserX86ATT)
+    runner = unittest.TextTestRunner(verbosity=2)
+    runner.run(x86_tests)
--- a/tests/test_semantics.py
+++ b/tests/test_semantics.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""
+Unit tests for Semantic Analysis
+"""
+
+import os
+import unittest
+from subprocess import call
+
+import networkx as nx
+
+from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
+from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
+                             SemanticsAppender)
+
+
+class TestSemanticTools(unittest.TestCase):
+    MODULE_DATA_DIR = os.path.join(
+        os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
+    )
+    USER_DATA_DIR = os.path.join(os.path.expanduser('~'), '.osaca/')
+
+    @classmethod
+    def setUpClass(cls):
+        """Prepare parsers, parsed kernels, machine models and semantics once per class.
+
+        Both kernels are annotated with source/destination operands and with
+        throughput/latency data so the individual tests can inspect them directly.
+        """
+        # copy db files in user directory
+        # NOTE(review): this depends on an external ``cp`` binary and its exit status is
+        # not checked; the copy is skipped entirely when ~/.osaca/data already exists.
+        if not os.path.isdir(os.path.join(cls.USER_DATA_DIR, 'data')):
+            os.makedirs(os.path.join(cls.USER_DATA_DIR, 'data'))
+            call(['cp', '-r', cls.MODULE_DATA_DIR, cls.USER_DATA_DIR])
+        # set up parser and kernels
+        cls.parser_x86 = ParserX86ATT()
+        cls.parser_AArch64 = ParserAArch64v81()
+        with open(cls._find_file('kernel-x86.s')) as f:
+            cls.code_x86 = f.read()
+        with open(cls._find_file('kernel-AArch64.s')) as f:
+            cls.code_AArch64 = f.read()
+        cls.kernel_x86 = cls.parser_x86.parse_file(cls.code_x86)
+        cls.kernel_AArch64 = cls.parser_AArch64.parse_file(cls.code_AArch64)
+
+        # set up machine models
+        cls.machine_model_csx = MachineModel(
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'csx.yml')
+        )
+        cls.machine_model_tx2 = MachineModel(
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'tx2.yml')
+        )
+        cls.semantics_csx = SemanticsAppender(
+            cls.machine_model_csx, path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/x86.yml')
+        )
+        cls.semantics_tx2 = SemanticsAppender(
+            cls.machine_model_tx2,
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/aarch64.yml'),
+        )
+        cls.machine_model_zen = MachineModel(arch='zen1')
+
+        # annotate every instruction form of both kernels in place
+        for instruction_form in cls.kernel_x86:
+            cls.semantics_csx.assign_src_dst(instruction_form)
+            cls.semantics_csx.assign_tp_lt(instruction_form)
+        for instruction_form in cls.kernel_AArch64:
+            cls.semantics_tx2.assign_src_dst(instruction_form)
+            cls.semantics_tx2.assign_tp_lt(instruction_form)
+
+    ###########
+    # Tests
+    ###########
+
+    def test_creation_by_name(self):
+        """Building a machine model and its semantics from an arch name raises no ValueError."""
+        try:
+            model_by_name = MachineModel(arch='CSX')
+            SemanticsAppender(model_by_name)
+        except ValueError:
+            # Report a ValueError as a test failure instead of an error.
+            self.fail()
+
+    def test_src_dst_assignment_x86(self):
+        """Every x86 form with operands has 'source', 'destination' and 'src_dst' entries."""
+        for instruction_form in self.kernel_x86:
+            with self.subTest(instruction_form=instruction_form):
+                operands = instruction_form['operands']
+                # Forms without operands (operands is None) are not checked.
+                if operands is not None:
+                    for key in ('source', 'destination', 'src_dst'):
+                        # assertIn names the missing key and the container on failure
+                        self.assertIn(key, operands)
+
+    def test_src_dst_assignment_AArch64(self):
+        """Every AArch64 form with operands has 'source', 'destination', 'src_dst' entries."""
+        for instruction_form in self.kernel_AArch64:
+            with self.subTest(instruction_form=instruction_form):
+                operands = instruction_form['operands']
+                # Forms without operands (operands is None) are not checked.
+                if operands is not None:
+                    for key in ('source', 'destination', 'src_dst'):
+                        # assertIn names the missing key and the container on failure
+                        self.assertIn(key, operands)
+
+    def test_tp_lt_assignment_x86(self):
+        """Every x86 form has throughput, latency and one port-pressure entry per port."""
+        self.assertIn('ports', self.machine_model_csx)
+        port_num = len(self.machine_model_csx['ports'])
+        for instruction_form in self.kernel_x86:
+            with self.subTest(instruction_form=instruction_form):
+                self.assertIn('throughput', instruction_form)
+                self.assertIn('latency', instruction_form)
+                self.assertIsInstance(instruction_form['port_pressure'], list)
+                self.assertEqual(len(instruction_form['port_pressure']), port_num)
+
+    def test_tp_lt_assignment_AArch64(self):
+        """Every AArch64 form has throughput, latency and one port-pressure entry per port."""
+        self.assertIn('ports', self.machine_model_tx2)
+        port_num = len(self.machine_model_tx2['ports'])
+        for instruction_form in self.kernel_AArch64:
+            with self.subTest(instruction_form=instruction_form):
+                self.assertIn('throughput', instruction_form)
+                self.assertIn('latency', instruction_form)
+                self.assertIsInstance(instruction_form['port_pressure'], list)
+                self.assertEqual(len(instruction_form['port_pressure']), port_num)
+
+    def test_kernelDG_x86(self):
+        """Check the dependents of lines 4-9 in the x86 kernel dependency graph."""
+        # Expected edges, labelled with the line numbers asserted below:
+        #
+        #  4
+        #   \___>7__>8
+        #   /
+        #  5
+        #     6_______>10
+        #
+        dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
+        self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
+        expected_dependents = ((4, [7]), (5, [7]), (6, [10]), (7, [8]), (8, []), (9, []))
+        for line_number, dependents in expected_dependents:
+            # Comparing the full list checks the count and the value in one query.
+            self.assertEqual(
+                list(dg.get_dependent_instruction_forms(line_number=line_number)), dependents
+            )
+        # Without an instruction form or a line number the lookup must be rejected.
+        with self.assertRaises(ValueError):
+            dg.get_dependent_instruction_forms()
+
+    def test_kernelDG_AArch64(self):
+        """Check the dependents of lines 4-21 in the AArch64 kernel dependency graph."""
+        dg = KernelDG(self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2)
+        self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
+        dependents_of = dg.get_dependent_instruction_forms
+
+        # ``set(...)`` checks compare all dependents, ``next(...)`` checks only the first one.
+        self.assertEqual(set(dependents_of(line_number=4)), {8, 9})
+        self.assertEqual(set(dependents_of(line_number=5)), {10, 11})
+        self.assertEqual(set(dependents_of(line_number=6)), {7, 8, 9})
+        self.assertEqual(set(dependents_of(line_number=7)), {10, 11})
+        self.assertEqual(next(dependents_of(line_number=8)), 14)
+        self.assertEqual(next(dependents_of(line_number=9)), 15)
+        self.assertEqual(next(dependents_of(line_number=10)), 17)
+        self.assertEqual(next(dependents_of(line_number=11)), 18)
+        self.assertEqual(set(dependents_of(line_number=12)), {14, 15})
+        self.assertEqual(set(dependents_of(line_number=13)), {17, 18})
+        self.assertEqual(next(dependents_of(line_number=14)), 16)
+        self.assertEqual(next(dependents_of(line_number=15)), 16)
+        self.assertEqual(len(list(dependents_of(line_number=16))), 0)
+        self.assertEqual(next(dependents_of(line_number=17)), 19)
+        self.assertEqual(next(dependents_of(line_number=18)), 19)
+        self.assertEqual(len(list(dependents_of(line_number=19))), 0)
+        self.assertEqual(len(list(dependents_of(line_number=20))), 0)
+        self.assertEqual(len(list(dependents_of(line_number=21))), 0)
+
+        # Without an instruction form or a line number the lookup must be rejected.
+        with self.assertRaises(ValueError):
+            dependents_of()
+
+    def test_hidden_load(self):
+        """Count the instruction forms flagged HIDDEN_LD in three slices of the x86 kernel."""
+        machine_model_hld = MachineModel(
+            path_to_yaml=self._find_file('hidden_load_machine_model.yml')
+        )
+        self.assertTrue(machine_model_hld.has_hidden_loads())
+        semantics_hld = SemanticsAppender(machine_model_hld)
+        # The file is parsed anew for every slice; add_semantics() annotates the forms in
+        # place, as the flags are read back from these lists below.
+        kernel_hld = self.parser_x86.parse_file(self.code_x86)
+        kernel_hld_2 = self.parser_x86.parse_file(self.code_x86)[-3:]
+        kernel_hld_3 = self.parser_x86.parse_file(self.code_x86)[5:8]
+        kernels = (kernel_hld, kernel_hld_2, kernel_hld_3)
+        for kernel in kernels:
+            semantics_hld.add_semantics(kernel)
+
+        hidden_load_counts = [
+            sum(1 for form in kernel if INSTR_FLAGS.HIDDEN_LD in form['flags'])
+            for kernel in kernels
+        ]
+        # whole kernel, last three forms, forms at list indices 5-7
+        self.assertEqual(hidden_load_counts, [1, 0, 1])
+
+    def test_cyclic_dag(self):
+        """Both path analyses raise NotImplementedError once the graph contains a cycle."""
+        dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
+        # Insert an artificial cycle 100 -> 101 -> 102 -> 100.
+        for source, target, latency in ((100, 101, 1.0), (101, 102, 2.0), (102, 100, 3.0)):
+            dg.dg.add_edge(source, target, latency=latency)
+        for analysis in (dg.get_critical_path, dg.get_loopcarried_dependencies):
+            with self.assertRaises(NotImplementedError):
+                analysis()
+
+    def test_loop_carried_dependency_x86(self):
+        """The x86 kernel has two loop-carried dependencies, keyed by the IDs 9 and 6.
+
+        For each of them the root and the single dependency are both expected to be the
+        instruction form stored on the graph node with that ID.
+        """
+        dg = KernelDG(self.kernel_x86, self.parser_x86, self.machine_model_csx)
+        lc_deps = dg.get_loopcarried_dependencies()
+        self.assertEqual(len(lc_deps), 2)
+        for lcd_id in (9, 6):
+            node_form = dg.dg.nodes(data=True)[lcd_id]['instruction_form']
+            dependency = lc_deps[lcd_id]
+            self.assertEqual(dependency['root'], node_form)
+            self.assertEqual(len(dependency['dependencies']), 1)
+            self.assertEqual(dependency['dependencies'][0], node_form)
+
+    def test_is_read_is_written_x86(self):
+        """Check ``is_read``/``is_written`` for rcx and ymm1 on hand-written x86 lines."""
+        # independent from HW model
+        dag = KernelDG(self.kernel_x86, self.parser_x86, None)
+        reg_rcx = AttrDict({'name': 'rcx'})
+        reg_ymm1 = AttrDict({'name': 'ymm1'})
+
+        def analyzed(line):
+            # Parse one line and attach its source/destination operand information.
+            instruction_form = self.parser_x86.parse_line(line)
+            self.semantics_csx.assign_src_dst(instruction_form)
+            return instruction_form
+
+        # rcx: used as index, not used at all, used as destination
+        reads_rcx = analyzed('vmovsd  %xmm0, (%r15,%rcx,8)')
+        ignores_rcx = analyzed('movl  %xmm0, (%r15,%rax,8)')
+        writes_rcx = analyzed('movi $0x05ACA, %rcx')
+
+        # ymm1: the first two lines mention xmm1/ymm1 as source and ymm1 as destination,
+        # the third has ymm1 as its first operand only
+        reads_writes_ymm1_a = analyzed('vinsertf128 $0x1, %xmm1, %ymm0, %ymm1')
+        reads_writes_ymm1_b = analyzed('vinsertf128 $0x1, %xmm0, %ymm1, %ymm1')
+        reads_ymm1 = analyzed('vmovapd %ymm1, %ymm0')
+
+        self.assertTrue(dag.is_read(reg_rcx, reads_rcx))
+        self.assertFalse(dag.is_read(reg_rcx, ignores_rcx))
+        self.assertFalse(dag.is_read(reg_rcx, writes_rcx))
+        self.assertTrue(dag.is_written(reg_rcx, writes_rcx))
+        self.assertFalse(dag.is_written(reg_rcx, reads_rcx))
+
+        self.assertTrue(dag.is_read(reg_ymm1, reads_writes_ymm1_a))
+        self.assertTrue(dag.is_read(reg_ymm1, reads_writes_ymm1_b))
+        self.assertTrue(dag.is_read(reg_ymm1, reads_ymm1))
+        self.assertTrue(dag.is_written(reg_ymm1, reads_writes_ymm1_a))
+        self.assertTrue(dag.is_written(reg_ymm1, reads_writes_ymm1_b))
+        self.assertFalse(dag.is_written(reg_ymm1, reads_ymm1))
+
+    def test_is_read_is_written_AArch64(self):
+        # independent form HW model
+        dag = KernelDG(self.kernel_AArch64, self.parser_AArch64, None)
+        reg_x1 = AttrDict({'prefix': 'x', 'name': '1'})
+        reg_w1 = AttrDict({'prefix': 'w', 'name': '1'})
+        reg_d1 = AttrDict({'prefix': 'd', 'name': '1'})
+        reg_q1 = AttrDict({'prefix': 'q', 'name': '1'})
+        reg_v1 = AttrDict({'prefix': 'v', 'name': '1', 'lanes': '2', 'shape': 'd'})
+        regs = [reg_d1, reg_q1, reg_v1]
+        regs_gp = [reg_w1, reg_x1]
+
+        instr_form_r_1 = self.parser_AArch64.parse_line('stp q1, q3, [x12, #192]')
+        self.semantics_tx2.assign_src_dst(instr_form_r_1)
+        instr_form_r_2 = self.parser_AArch64.parse_line('fadd v2.2d, v1.2d, v0.2d')
+        self.semantics_tx2.assign_src_dst(instr_form_r_2)
+        instr_form_w_1 = self.parser_AArch64.parse_line('ldr d1, [x1, #:got_lo12:q2c]')
+        self.semantics_tx2.assign_src_dst(instr_form_w_1)
+        instr_form_non_w_1 = self.parser_AArch64.parse_line('ldr x1, [x1, #:got_lo12:q2c]')
+        self.semantics_tx2.assign_src_dst(instr_form_non_w_1)
+        instr_form_rw_1 = self.parser_AArch64.parse_line('fmul v1.2d, v1.2d, v0.2d')
+        self.semantics_tx2.assign_src_dst(instr_form_rw_1)
+        instr_form_rw_2 = self.parser_AArch64.parse_line('ldp q2, q4, [x1, #64]!')
+        self.semantics_tx2.assign_src_dst(instr_form_rw_2)
+        instr_form_rw_3 = self.parser_AArch64.parse_line('str x4, [x1], #64')
+        self.semantics_tx2.assign_src_dst(instr_form_rw_3)
+        instr_form_non_rw_1 = self.parser_AArch64.parse_line('adds x1, x11')
+        self.semantics_tx2.assign_src_dst(instr_form_non_rw_1)
+
+        for reg in regs:
+            with self.subTest(reg=reg):
+                self.assertTrue(dag.is_read(reg, instr_form_r_1))
+                self.assertTrue(dag.is_read(reg, instr_form_r_2))
+                self.assertTrue(dag.is_read(reg, instr_form_rw_1))
+                self.assertFalse(dag.is_read(reg, instr_form_rw_2))
+                self.assertFalse(dag.is_read(reg, instr_form_rw_3))
+                self.assertFalse(dag.is_read(reg, instr_form_w_1))
+                self.assertTrue(dag.is_written(reg, instr_form_w_1))
+                self.assertTrue(dag.is_written(reg, instr_form_rw_1))
+                self.assertFalse(dag.is_written(reg, instr_form_non_w_1))
+                self.assertFalse(dag.is_written(reg, instr_form_rw_2))
+                self.assertFalse(dag.is_written(reg, instr_form_rw_3))
+                self.assertFalse(dag.is_written(reg, instr_form_non_rw_1))
+                self.assertFalse(dag.is_written(reg, instr_form_non_rw_1))
+        for reg in regs_gp:
+            with self.subTest(reg=reg):
+                self.assertFalse(dag.is_read(reg, instr_form_r_1))
+                self.assertFalse(dag.is_read(reg, instr_form_r_2))
+                self.assertFalse(dag.is_read(reg, instr_form_rw_1))
+                self.assertTrue(dag.is_read(reg, instr_form_rw_2))
+                self.assertTrue(dag.is_read(reg, instr_form_rw_3))
+                self.assertTrue(dag.is_read(reg, instr_form_w_1))
+                self.assertFalse(dag.is_written(reg, instr_form_w_1))
+                self.assertFalse(dag.is_written(reg, instr_form_rw_1))
+                self.assertTrue(dag.is_written(reg, instr_form_non_w_1))
+                self.assertTrue(dag.is_written(reg, instr_form_rw_2))
+                self.assertTrue(dag.is_written(reg, instr_form_rw_3))
+                self.assertTrue(dag.is_written(reg, instr_form_non_rw_1))
+                self.assertTrue(dag.is_written(reg, instr_form_non_rw_1))
+
+    def test_invalid_MachineModel(self):
+        with self.assertRaises(ValueError):
+            MachineModel()
+        with self.assertRaises(ValueError):
+            MachineModel(arch='CSX', path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml'))
+        with self.assertRaises(FileNotFoundError):
+            MachineModel(arch='THE_MACHINE')
+        with self.assertRaises(FileNotFoundError):
+            MachineModel(path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'THE_MACHINE.yml'))
+
+    def test_MachineModel_getter(self):
+        sample_operands = [
+            {
+                'memory': {
+                    'offset': None,
+                    'base': {'name': 'r12'},
+                    'index': {'name': 'rcx'},
+                    'scale': 8,
+                }
+            }
+        ]
+        self.assertIsNone(self.machine_model_csx.get_instruction('GETRESULT', sample_operands))
+        self.assertIsNone(self.machine_model_tx2.get_instruction('GETRESULT', sample_operands))
+
+        self.assertEqual(self.machine_model_csx.get_arch(), 'csx')
+        self.assertEqual(self.machine_model_tx2.get_arch(), 'tx2')
+
+        self.assertEqual(self.machine_model_csx.get_ISA(), 'x86')
+        self.assertEqual(self.machine_model_tx2.get_ISA(), 'aarch64')
+
+        ports_csx = ['0', '0DV', '1', '2', '2D', '3', '3D', '4', '5', '6', '7']
+        data_ports_csx = ['2D', '3D']
+        self.assertEqual(self.machine_model_csx.get_ports(), ports_csx)
+        self.assertEqual(self.machine_model_csx.get_data_ports(), data_ports_csx)
+
+        self.assertFalse(self.machine_model_tx2.has_hidden_loads())
+
+        self.assertEqual(MachineModel.get_isa_for_arch('CSX'), 'x86')
+        self.assertEqual(MachineModel.get_isa_for_arch('tX2'), 'aarch64')
+        with self.assertRaises(ValueError):
+            self.assertIsNone(MachineModel.get_isa_for_arch('THE_MACHINE'))
+
+    ##################
+    # Helper functions
+    ##################
+
+    @staticmethod
+    def _find_file(name):
+        testdir = os.path.dirname(__file__)
+        name = os.path.join(testdir, 'test_files', name)
+        assert os.path.exists(name)
+        return name
+
+
+if __name__ == '__main__':
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestSemanticTools)
+    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s
+++ b/tests/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s
@@ -1,653 +0,0 @@
-	.section	__TEXT,__text,regular,pure_instructions
-	.macosx_version_min 10, 14
-	.globl	_main                   ## -- Begin function main
-	.p2align	4, 0x90
-_main:                                  ## @main
-	.cfi_startproc
-## %bb.0:
-	pushq	%rbp
-	.cfi_def_cfa_offset 16
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-	.cfi_def_cfa_register %rbp
-	pushq	%r15
-	pushq	%r14
-	pushq	%r13
-	pushq	%r12
-	pushq	%rbx
-	subq	$408, %rsp              ## imm = 0x198
-	.cfi_offset %rbx, -56
-	.cfi_offset %r12, -48
-	.cfi_offset %r13, -40
-	.cfi_offset %r14, -32
-	.cfi_offset %r15, -24
-	movq	%rsi, %rbx
-	movq	16(%rsi), %rdi
-	callq	_atoi
-	movl	%eax, %r14d
-	movq	24(%rbx), %rdi
-	callq	_atoi
-                                        ## kill: def $eax killed $eax def $rax
-	movq	%r14, -96(%rbp)         ## 8-byte Spill
-	movl	%r14d, %ecx
-	imull	%r14d, %ecx
-	movl	%ecx, -88(%rbp)         ## 4-byte Spill
-	movq	%rax, -72(%rbp)         ## 8-byte Spill
-	imull	%eax, %ecx
-	movslq	%ecx, %r13
-	shlq	$3, %r13
-	leaq	-56(%rbp), %rdi
-	movl	$32, %esi
-	movq	%r13, %rdx
-	callq	_posix_memalign
-	testl	%eax, %eax
-	je	LBB0_2
-## %bb.1:
-	movq	$0, -56(%rbp)
-	xorl	%ebx, %ebx
-	jmp	LBB0_3
-LBB0_2:
-	movq	-56(%rbp), %rbx
-LBB0_3:
-	leaq	-56(%rbp), %rdi
-	movl	$32, %esi
-	movq	%r13, %rdx
-	callq	_posix_memalign
-	testl	%eax, %eax
-	je	LBB0_5
-## %bb.4:
-	movq	$0, -56(%rbp)
-	xorl	%eax, %eax
-	jmp	LBB0_6
-LBB0_5:
-	movq	-56(%rbp), %rax
-LBB0_6:
-	movq	%rax, -80(%rbp)         ## 8-byte Spill
-	movq	-96(%rbp), %r9          ## 8-byte Reload
-	movabsq	$4602641980904887326, %rax ## imm = 0x3FDFDE7EEC22D41E
-	movq	%rax, -56(%rbp)
-	cmpl	$3, -72(%rbp)           ## 4-byte Folded Reload
-	jl	LBB0_15
-## %bb.7:
-	movabsq	$4294967296, %r12       ## imm = 0x100000000
-	leal	-1(%r9), %ecx
-	movslq	%r9d, %rax
-	movslq	-88(%rbp), %rdx         ## 4-byte Folded Reload
-	movq	%rdx, -160(%rbp)        ## 8-byte Spill
-	movq	-72(%rbp), %rsi         ## 8-byte Reload
-	leal	-1(%rsi), %edx
-	leaq	8(%rbx,%rax,8), %rsi
-	movq	%rsi, -152(%rbp)        ## 8-byte Spill
-	movq	-80(%rbp), %rsi         ## 8-byte Reload
-	leaq	8(%rsi,%rax,8), %rsi
-	movq	%rsi, -144(%rbp)        ## 8-byte Spill
-	leaq	(,%rax,8), %rsi
-	movq	%rsi, -104(%rbp)        ## 8-byte Spill
-	leaq	2(%rax), %rsi
-	movq	%rsi, -136(%rbp)        ## 8-byte Spill
-	shlq	$32, %rax
-	movq	%rax, -184(%rbp)        ## 8-byte Spill
-	addq	$-1, %rcx
-	movl	%r9d, %eax
-	movq	%rax, -176(%rbp)        ## 8-byte Spill
-	movl	$1, %eax
-	movabsq	$4601149042440805838, %rdi ## imm = 0x3FDA90AD19501DCE
-	movq	%rdx, -208(%rbp)        ## 8-byte Spill
-	.p2align	4, 0x90
-LBB0_8:                                 ## =>This Loop Header: Depth=1
-                                        ##     Child Loop BB0_10 Depth 2
-                                        ##       Child Loop BB0_11 Depth 3
-	cmpl	$2, %r9d
-	jle	LBB0_14
-## %bb.9:                               ##   in Loop: Header=BB0_8 Depth=1
-	movl	%eax, %r14d
-	imull	-88(%rbp), %r14d        ## 4-byte Folded Reload
-	leaq	1(%rax), %r8
-	movq	-160(%rbp), %rdx        ## 8-byte Reload
-	movq	%rdx, %rsi
-	movq	%r8, -168(%rbp)         ## 8-byte Spill
-	imulq	%r8, %rsi
-	movq	-152(%rbp), %r10        ## 8-byte Reload
-	leaq	(%r10,%rsi,8), %r8
-	leaq	-1(%rax), %rsi
-	imulq	%rdx, %rsi
-	leaq	(%r10,%rsi,8), %r10
-	movq	%rax, %rsi
-	imulq	%rdx, %rsi
-	movq	-144(%rbp), %rdx        ## 8-byte Reload
-	leaq	(%rdx,%rsi,8), %r11
-	addl	-136(%rbp), %esi        ## 4-byte Folded Reload
-	shlq	$32, %rsi
-	movl	%r9d, %r15d
-	imull	%eax, %r15d
-	leal	2(%r15), %r13d
-	imull	%r9d, %r13d
-	addl	$1, %r13d
-	addq	$1, %r14
-	addl	$1, %r15d
-	imull	%r9d, %r15d
-	movl	$1, %eax
-	.p2align	4, 0x90
-LBB0_10:                                ##   Parent Loop BB0_8 Depth=1
-                                        ## =>  This Loop Header: Depth=2
-                                        ##       Child Loop BB0_11 Depth 3
-	movq	%rax, -112(%rbp)        ## 8-byte Spill
-	leaq	1(%rax), %rax
-	movq	%rax, -192(%rbp)        ## 8-byte Spill
-	movq	%rsi, -120(%rbp)        ## 8-byte Spill
-	xorl	%edx, %edx
-	.p2align	4, 0x90
-LBB0_11:                                ##   Parent Loop BB0_8 Depth=1
-                                        ##     Parent Loop BB0_10 Depth=2
-                                        ## =>    This Inner Loop Header: Depth=3
-	movq	%rdi, (%r11,%rdx,8)
-	leal	(%r15,%rdx), %r9d
-	movslq	%r9d, %rax
-	movq	%rdi, (%rbx,%rax,8)
-	movq	%rsi, %rax
-	sarq	$29, %rax
-	movq	%rdi, (%rbx,%rax)
-	leal	(%r14,%rdx), %eax
-	cltq
-	movq	%rdi, (%rbx,%rax,8)
-	leal	(%r13,%rdx), %eax
-	cltq
-	movq	%rdi, (%rbx,%rax,8)
-	movq	%rdi, (%r10,%rdx,8)
-	movq	%rdi, (%r8,%rdx,8)
-	addq	$1, %rdx
-	addq	%r12, %rsi
-	cmpq	%rdx, %rcx
-	jne	LBB0_11
-## %bb.12:                              ##   in Loop: Header=BB0_10 Depth=2
-	movq	-104(%rbp), %rax        ## 8-byte Reload
-	addq	%rax, %r8
-	addq	%rax, %r10
-	addq	%rax, %r11
-	movq	-120(%rbp), %rsi        ## 8-byte Reload
-	addq	-184(%rbp), %rsi        ## 8-byte Folded Reload
-	movq	-176(%rbp), %rax        ## 8-byte Reload
-	addq	%rax, %r13
-	addq	%rax, %r14
-	addq	%rax, %r15
-	cmpq	%rdx, -112(%rbp)        ## 8-byte Folded Reload
-	movq	-192(%rbp), %rax        ## 8-byte Reload
-	jne	LBB0_10
-## %bb.13:                              ##   in Loop: Header=BB0_8 Depth=1
-	movq	-168(%rbp), %rsi        ## 8-byte Reload
-	movq	%rsi, %rax
-	movq	-96(%rbp), %r9          ## 8-byte Reload
-	movq	-208(%rbp), %rdx        ## 8-byte Reload
-	cmpq	%rdx, %rsi
-	jne	LBB0_8
-	jmp	LBB0_15
-	.p2align	4, 0x90
-LBB0_14:                                ##   in Loop: Header=BB0_8 Depth=1
-	addq	$1, %rax
-	movq	%rax, %rsi
-	cmpq	%rdx, %rsi
-	jne	LBB0_8
-LBB0_15:
-	movq	_var_false@GOTPCREL(%rip), %rax
-	cmpl	$0, (%rax)
-	je	LBB0_17
-## %bb.16:
-	movq	%rbx, %rdi
-	callq	_dummy
-	movq	-80(%rbp), %rdi         ## 8-byte Reload
-	callq	_dummy
-	leaq	-56(%rbp), %rdi
-	callq	_dummy
-	movq	-96(%rbp), %r9          ## 8-byte Reload
-LBB0_17:
-	cmpl	$3, -72(%rbp)           ## 4-byte Folded Reload
-	jl	LBB0_59
-## %bb.18:
-	movabsq	$4294967296, %r14       ## imm = 0x100000000
-	leal	-1(%r9), %ecx
-	movslq	%r9d, %rsi
-	movslq	-88(%rbp), %rax         ## 4-byte Folded Reload
-	movq	%rax, -312(%rbp)        ## 8-byte Spill
-	movq	-72(%rbp), %rax         ## 8-byte Reload
-	addl	$-1, %eax
-	movq	%rax, -72(%rbp)         ## 8-byte Spill
-	leaq	-1(%rcx), %rax
-	leaq	-2(%rcx), %rdi
-	movq	%rdi, -424(%rbp)        ## 8-byte Spill
-	leaq	1(%rsi), %rdi
-	movq	%rdi, -224(%rbp)        ## 8-byte Spill
-	leaq	(%rsi,%rcx), %rdi
-	movq	%rdi, -304(%rbp)        ## 8-byte Spill
-	movl	%r9d, %edi
-	movq	%rdi, -256(%rbp)        ## 8-byte Spill
-	movq	%rcx, -264(%rbp)        ## 8-byte Spill
-	leaq	(%rbx,%rcx,8), %rcx
-	addq	$-8, %rcx
-	movq	%rcx, -352(%rbp)        ## 8-byte Spill
-	leal	6(%r9), %ecx
-	andl	$7, %ecx
-	movq	%rax, -448(%rbp)        ## 8-byte Spill
-	movq	%rcx, -344(%rbp)        ## 8-byte Spill
-	subq	%rcx, %rax
-	movq	%rsi, %rcx
-	shlq	$32, %rcx
-	movq	%rcx, -440(%rbp)        ## 8-byte Spill
-	leaq	1(%rax), %rcx
-	movq	%rcx, -328(%rbp)        ## 8-byte Spill
-	movq	%rax, -336(%rbp)        ## 8-byte Spill
-	leal	1(%rax), %eax
-	movl	%eax, -212(%rbp)        ## 4-byte Spill
-	leaq	2(%rsi), %rax
-	movq	%rax, -296(%rbp)        ## 8-byte Spill
-	movq	-80(%rbp), %rax         ## 8-byte Reload
-	leaq	8(%rax,%rsi,8), %rax
-	movq	%rax, -288(%rbp)        ## 8-byte Spill
-	leaq	(,%rsi,8), %rax
-	movq	%rax, -432(%rbp)        ## 8-byte Spill
-	movq	%rsi, -200(%rbp)        ## 8-byte Spill
-	leaq	(%rbx,%rsi,8), %rax
-	addq	$8, %rax
-	movq	%rax, -280(%rbp)        ## 8-byte Spill
-	movl	$1, %eax
-	.p2align	4, 0x90
-LBB0_19:                                ## =>This Loop Header: Depth=1
-                                        ##     Child Loop BB0_52 Depth 2
-                                        ##       Child Loop BB0_37 Depth 3
-                                        ##       Child Loop BB0_55 Depth 3
-	cmpl	$2, %r9d
-	jle	LBB0_58
-## %bb.20:                              ##   in Loop: Header=BB0_19 Depth=1
-	movq	%rax, %rcx
-	movq	%rax, %r12
-	movq	-312(%rbp), %r15        ## 8-byte Reload
-	imulq	%r15, %r12
-	leaq	1(%rax), %rax
-	movl	%r9d, %edi
-	imull	%ecx, %edi
-	leal	1(%rdi), %r8d
-	imull	%r9d, %r8d
-	addl	$2, %edi
-	imull	%r9d, %edi
-	movq	%rax, -320(%rbp)        ## 8-byte Spill
-	movq	%rax, %r13
-	imulq	%r15, %r13
-	movq	-224(%rbp), %rdx        ## 8-byte Reload
-	leaq	(%rdx,%r13), %rax
-	movq	%rax, -408(%rbp)        ## 8-byte Spill
-	movq	-304(%rbp), %rsi        ## 8-byte Reload
-	leaq	(%rsi,%r13), %rax
-	movq	%rax, -400(%rbp)        ## 8-byte Spill
-	addq	$-1, %rcx
-	imulq	%r15, %rcx
-	leaq	(%rdx,%rcx), %rax
-	movq	%rax, -392(%rbp)        ## 8-byte Spill
-	leaq	(%rsi,%rcx), %rax
-	movq	%rax, -384(%rbp)        ## 8-byte Spill
-	movq	-296(%rbp), %rax        ## 8-byte Reload
-	leal	(%rax,%r12), %eax
-	shlq	$32, %rax
-	movq	%rax, -104(%rbp)        ## 8-byte Spill
-	movq	-280(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%r13,8), %r10
-	leaq	(%rax,%rcx,8), %r11
-	movl	%r12d, %edx
-	addq	$1, %rdx
-	movq	-200(%rbp), %rax        ## 8-byte Reload
-	addq	%rax, %r13
-	movq	%r13, -144(%rbp)        ## 8-byte Spill
-	addq	%rax, %rcx
-	movq	%rcx, -152(%rbp)        ## 8-byte Spill
-	leal	2(%r8), %eax
-	movq	%rax, -240(%rbp)        ## 8-byte Spill
-	leal	1(%r12), %eax
-	movq	%rax, -416(%rbp)        ## 8-byte Spill
-	movq	%rdi, %rax
-	movq	%rdi, -112(%rbp)        ## 8-byte Spill
-	leal	1(%rdi), %r15d
-	movq	-224(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%r12), %rcx
-	leaq	(%rsi,%r12), %rax
-	movq	%rax, -368(%rbp)        ## 8-byte Spill
-	movq	-288(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%r12,8), %rsi
-	leaq	-8(%rax,%r12,8), %rax
-	movq	%rax, -136(%rbp)        ## 8-byte Spill
-	movq	%r12, -120(%rbp)        ## 8-byte Spill
-	leaq	1(%r12), %rax
-	movq	%rax, -360(%rbp)        ## 8-byte Spill
-	leal	-1(%r8), %eax
-	movl	%eax, -124(%rbp)        ## 4-byte Spill
-	movq	%rcx, -376(%rbp)        ## 8-byte Spill
-	movq	%rcx, -272(%rbp)        ## 8-byte Spill
-	movq	%r8, -248(%rbp)         ## 8-byte Spill
-	movq	%r8, %rdi
-	movq	%r15, -232(%rbp)        ## 8-byte Spill
-	movq	%r15, %r8
-	xorl	%r12d, %r12d
-	movl	$1, %eax
-	jmp	LBB0_52
-	.p2align	4, 0x90
-LBB0_21:                                ##   in Loop: Header=BB0_52 Depth=2
-	movl	%r9d, %edx
-	imull	%r12d, %edx
-	movq	-248(%rbp), %rax        ## 8-byte Reload
-	leal	(%rax,%rdx), %ecx
-	movq	-424(%rbp), %rax        ## 8-byte Reload
-	leal	(%rcx,%rax), %esi
-	cmpl	%ecx, %esi
-	jl	LBB0_53
-## %bb.22:                              ##   in Loop: Header=BB0_52 Depth=2
-	movq	%rax, %rcx
-	shrq	$32, %rcx
-	jne	LBB0_53
-## %bb.23:                              ##   in Loop: Header=BB0_52 Depth=2
-	movq	-240(%rbp), %rsi        ## 8-byte Reload
-	leal	(%rsi,%rdx), %esi
-	leal	(%rsi,%rax), %edi
-	cmpl	%esi, %edi
-	jl	LBB0_53
-## %bb.24:                              ##   in Loop: Header=BB0_52 Depth=2
-	testq	%rcx, %rcx
-	jne	LBB0_53
-## %bb.25:                              ##   in Loop: Header=BB0_52 Depth=2
-	movq	-416(%rbp), %rsi        ## 8-byte Reload
-	leal	(%rsi,%rdx), %esi
-	leal	(%rsi,%rax), %edi
-	cmpl	%esi, %edi
-	jl	LBB0_53
-## %bb.26:                              ##   in Loop: Header=BB0_52 Depth=2
-	testq	%rcx, %rcx
-	jne	LBB0_53
-## %bb.27:                              ##   in Loop: Header=BB0_52 Depth=2
-	addl	-232(%rbp), %edx        ## 4-byte Folded Reload
-	leal	(%rdx,%rax), %esi
-	cmpl	%edx, %esi
-	jl	LBB0_53
-## %bb.28:                              ##   in Loop: Header=BB0_52 Depth=2
-	testq	%rcx, %rcx
-	jne	LBB0_53
-## %bb.29:                              ##   in Loop: Header=BB0_52 Depth=2
-	movq	-192(%rbp), %rdx        ## 8-byte Reload
-	movq	%rdx, %rsi
-	imulq	-200(%rbp), %rsi        ## 8-byte Folded Reload
-	movq	-376(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi), %rdi
-	movq	-368(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi), %r13
-	movq	-408(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi), %r11
-	movq	-400(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi), %rcx
-	movq	-392(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi), %r10
-	addq	-384(%rbp), %rsi        ## 8-byte Folded Reload
-                                        ## kill: def $edx killed $edx killed $rdx def $rdx
-	imull	-256(%rbp), %edx        ## 4-byte Folded Reload
-	movq	-232(%rbp), %rax        ## 8-byte Reload
-	leal	(%rax,%rdx), %r12d
-	movq	-360(%rbp), %rax        ## 8-byte Reload
-	leal	(%rax,%rdx), %r9d
-	movq	-240(%rbp), %rax        ## 8-byte Reload
-	leal	(%rax,%rdx), %eax
-	movl	%eax, -60(%rbp)         ## 4-byte Spill
-	addl	-248(%rbp), %edx        ## 4-byte Folded Reload
-	movq	-80(%rbp), %rax         ## 8-byte Reload
-	leaq	(%rax,%rdi,8), %rdi
-	leaq	(%rbx,%rcx,8), %rcx
-	cmpq	%rcx, %rdi
-	leaq	(%rax,%r13,8), %rcx
-	leaq	(%rbx,%r11,8), %r11
-	setb	-45(%rbp)               ## 1-byte Folded Spill
-	cmpq	%rcx, %r11
-	leaq	(%rbx,%r10,8), %r10
-	leaq	(%rbx,%rsi,8), %r11
-	movslq	%r12d, %rsi
-	setb	-44(%rbp)               ## 1-byte Folded Spill
-	cmpq	%r11, %rdi
-	setb	%r12b
-	cmpq	%rcx, %r10
-	leaq	(%rbx,%rsi,8), %r10
-	movq	-352(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%rsi,8), %rsi
-	movslq	%r9d, %r9
-	setb	-43(%rbp)               ## 1-byte Folded Spill
-	cmpq	%rsi, %rdi
-	setb	%r11b
-	cmpq	%rcx, %r10
-	leaq	(%rbx,%r9,8), %r10
-	leaq	(%rax,%r9,8), %rsi
-	movslq	-60(%rbp), %r9          ## 4-byte Folded Reload
-	setb	-60(%rbp)               ## 1-byte Folded Spill
-	cmpq	%rsi, %rdi
-	setb	%r13b
-	cmpq	%rcx, %r10
-	leaq	(%rbx,%r9,8), %r10
-	leaq	(%rax,%r9,8), %rsi
-	movslq	%edx, %rdx
-	setb	-42(%rbp)               ## 1-byte Folded Spill
-	cmpq	%rsi, %rdi
-	setb	%r9b
-	cmpq	%rcx, %r10
-	leaq	(%rax,%rdx,8), %rsi
-	setb	-41(%rbp)               ## 1-byte Folded Spill
-	cmpq	%rsi, %rdi
-	leaq	(%rbx,%rdx,8), %rdx
-	setb	%r10b
-	cmpq	%rcx, %rdx
-	setb	%dl
-	leaq	-55(%rbp), %rax
-	cmpq	%rdi, %rax
-	seta	%dil
-	leaq	-56(%rbp), %rax
-	cmpq	%rcx, %rax
-	setb	%al
-	movb	-44(%rbp), %cl          ## 1-byte Reload
-	testb	%cl, -45(%rbp)          ## 1-byte Folded Reload
-	jne	LBB0_53
-## %bb.30:                              ##   in Loop: Header=BB0_52 Depth=2
-	andb	-43(%rbp), %r12b        ## 1-byte Folded Reload
-	jne	LBB0_53
-## %bb.31:                              ##   in Loop: Header=BB0_52 Depth=2
-	andb	-60(%rbp), %r11b        ## 1-byte Folded Reload
-	jne	LBB0_53
-## %bb.32:                              ##   in Loop: Header=BB0_52 Depth=2
-	andb	-42(%rbp), %r13b        ## 1-byte Folded Reload
-	jne	LBB0_53
-## %bb.33:                              ##   in Loop: Header=BB0_52 Depth=2
-	andb	-41(%rbp), %r9b         ## 1-byte Folded Reload
-	jne	LBB0_53
-## %bb.34:                              ##   in Loop: Header=BB0_52 Depth=2
-	movl	$1, %r9d
-	andb	%dl, %r10b
-	jne	LBB0_54
-## %bb.35:                              ##   in Loop: Header=BB0_52 Depth=2
-	andb	%al, %dil
-	jne	LBB0_54
-## %bb.36:                              ##   in Loop: Header=BB0_52 Depth=2
-	vbroadcastsd	-56(%rbp), %zmm0
-	movq	-104(%rbp), %rdx        ## 8-byte Reload
-	xorl	%esi, %esi
-	movq	-336(%rbp), %r9         ## 8-byte Reload
-	movabsq	$34359738368, %rdi      ## imm = 0x800000000
-	movq	%rdi, %r10
-	movq	-184(%rbp), %r11        ## 8-byte Reload
-	movq	-176(%rbp), %r15        ## 8-byte Reload
-	movq	-168(%rbp), %r12        ## 8-byte Reload
-	movq	-88(%rbp), %rdi         ## 8-byte Reload
-	movq	-160(%rbp), %rax        ## 8-byte Reload
-	.p2align	4, 0x90
-        movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-LBB0_37:                                ##   Parent Loop BB0_19 Depth=1
-                                        ##     Parent Loop BB0_52 Depth=2
-                                        ## =>    This Inner Loop Header: Depth=3
-	leal	(%rax,%rsi), %ecx
-	movslq	%ecx, %rcx
-	vmovupd	(%rbx,%rcx,8), %zmm1
-	movq	%rdx, %rcx
-	sarq	$29, %rcx
-	vaddpd	(%rbx,%rcx), %zmm1, %zmm1
-	leal	(%r12,%rsi), %ecx
-	movslq	%ecx, %rcx
-	vaddpd	(%rbx,%rcx,8), %zmm1, %zmm1
-	leal	(%r8,%rsi), %ecx
-	movslq	%ecx, %rcx
-	vaddpd	(%rbx,%rcx,8), %zmm1, %zmm1
-	vaddpd	(%r15,%rsi,8), %zmm1, %zmm1
-	vaddpd	(%r11,%rsi,8), %zmm1, %zmm1
-	vmulpd	%zmm0, %zmm1, %zmm1
-	vmovupd	%zmm1, (%rdi,%rsi,8)
-	addq	$8, %rsi
-	addq	%r10, %rdx
-	cmpq	%rsi, %r9
-	jne	LBB0_37
-        movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-## %bb.38:                              ##   in Loop: Header=BB0_52 Depth=2
-	movq	-328(%rbp), %r9         ## 8-byte Reload
-	movl	-212(%rbp), %eax        ## 4-byte Reload
-	movl	%eax, %r15d
-	cmpl	$0, -344(%rbp)          ## 4-byte Folded Reload
-	jne	LBB0_54
-	jmp	LBB0_56
-	.p2align	4, 0x90
-LBB0_52:                                ##   Parent Loop BB0_19 Depth=1
-                                        ## =>  This Loop Header: Depth=2
-                                        ##       Child Loop BB0_37 Depth 3
-                                        ##       Child Loop BB0_55 Depth 3
-	movq	%rdx, -168(%rbp)        ## 8-byte Spill
-	addq	$1, %rax
-	movl	$1, %r15d
-	cmpq	$8, -448(%rbp)          ## 8-byte Folded Reload
-	movq	%r10, -184(%rbp)        ## 8-byte Spill
-	movq	%r11, -176(%rbp)        ## 8-byte Spill
-	movq	%rsi, -88(%rbp)         ## 8-byte Spill
-	movq	%rdi, -160(%rbp)        ## 8-byte Spill
-	movq	%r12, -192(%rbp)        ## 8-byte Spill
-	movq	%rax, -208(%rbp)        ## 8-byte Spill
-	jae	LBB0_21
-LBB0_53:                                ##   in Loop: Header=BB0_52 Depth=2
-	movl	$1, %r9d
-LBB0_54:                                ##   in Loop: Header=BB0_52 Depth=2
-	movq	-136(%rbp), %rax        ## 8-byte Reload
-	leaq	(%rax,%r9,8), %rdx
-	movq	-144(%rbp), %rax        ## 8-byte Reload
-	leaq	(%r9,%rax), %rcx
-	leaq	(%rbx,%rcx,8), %r11
-	movq	-152(%rbp), %rax        ## 8-byte Reload
-	leaq	(%r9,%rax), %rcx
-	leaq	(%rbx,%rcx,8), %r10
-	movq	-272(%rbp), %rax        ## 8-byte Reload
-	leal	(%r9,%rax), %r12d
-	shlq	$32, %r12
-	movq	-264(%rbp), %r13        ## 8-byte Reload
-	subq	%r9, %r13
-	movq	-112(%rbp), %rax        ## 8-byte Reload
-	leal	(%r15,%rax), %esi
-	movq	-120(%rbp), %rax        ## 8-byte Reload
-	leal	(%r15,%rax), %edi
-	addl	-124(%rbp), %r15d       ## 4-byte Folded Reload
-	xorl	%ecx, %ecx
-	.p2align	4, 0x90
-LBB0_55:                                ##   Parent Loop BB0_19 Depth=1
-                                        ##     Parent Loop BB0_52 Depth=2
-                                        ## =>    This Inner Loop Header: Depth=3
-	leal	(%r15,%rcx), %eax
-	cltq
-	vmovsd	(%rbx,%rax,8), %xmm0    ## xmm0 = mem[0],zero
-	movq	%r12, %rax
-	sarq	$29, %rax
-	vaddsd	(%rbx,%rax), %xmm0, %xmm0
-	leal	(%rdi,%rcx), %eax
-	cltq
-	vaddsd	(%rbx,%rax,8), %xmm0, %xmm0
-	leal	(%rsi,%rcx), %eax
-	cltq
-	vaddsd	(%rbx,%rax,8), %xmm0, %xmm0
-	vaddsd	(%r10,%rcx,8), %xmm0, %xmm0
-	vaddsd	(%r11,%rcx,8), %xmm0, %xmm0
-	vmulsd	-56(%rbp), %xmm0, %xmm0
-	vmovsd	%xmm0, (%rdx,%rcx,8)
-	addq	$1, %rcx
-	addq	%r14, %r12
-	cmpq	%rcx, %r13
-	jne	LBB0_55
-LBB0_56:                                ##   in Loop: Header=BB0_52 Depth=2
-	movq	-192(%rbp), %r12        ## 8-byte Reload
-	addq	$1, %r12
-	movq	-104(%rbp), %rax        ## 8-byte Reload
-	addq	-440(%rbp), %rax        ## 8-byte Folded Reload
-	movq	%rax, -104(%rbp)        ## 8-byte Spill
-	movq	-432(%rbp), %rcx        ## 8-byte Reload
-	movq	-88(%rbp), %rsi         ## 8-byte Reload
-	addq	%rcx, %rsi
-	movq	-184(%rbp), %r10        ## 8-byte Reload
-	addq	%rcx, %r10
-	movq	-176(%rbp), %r11        ## 8-byte Reload
-	addq	%rcx, %r11
-	movq	-256(%rbp), %rax        ## 8-byte Reload
-	addq	%rax, %r8
-	movq	-168(%rbp), %rdx        ## 8-byte Reload
-	addq	%rax, %rdx
-	movq	-160(%rbp), %rdi        ## 8-byte Reload
-	addq	%rax, %rdi
-	addq	%rcx, -136(%rbp)        ## 8-byte Folded Spill
-	movq	-200(%rbp), %rax        ## 8-byte Reload
-	addq	%rax, -144(%rbp)        ## 8-byte Folded Spill
-	addq	%rax, -152(%rbp)        ## 8-byte Folded Spill
-	addq	%rax, -272(%rbp)        ## 8-byte Folded Spill
-	movq	-96(%rbp), %r9          ## 8-byte Reload
-	movq	-112(%rbp), %rax        ## 8-byte Reload
-	addl	%r9d, %eax
-	movq	%rax, -112(%rbp)        ## 8-byte Spill
-	movq	-120(%rbp), %rax        ## 8-byte Reload
-	addl	%r9d, %eax
-	movq	%rax, -120(%rbp)        ## 8-byte Spill
-	addl	%r9d, -124(%rbp)        ## 4-byte Folded Spill
-	movq	-208(%rbp), %rax        ## 8-byte Reload
-	cmpq	-264(%rbp), %rax        ## 8-byte Folded Reload
-	jne	LBB0_52
-## %bb.57:                              ##   in Loop: Header=BB0_19 Depth=1
-	movq	-320(%rbp), %rcx        ## 8-byte Reload
-	movq	%rcx, %rax
-	cmpq	-72(%rbp), %rcx         ## 8-byte Folded Reload
-	jne	LBB0_19
-	jmp	LBB0_59
-	.p2align	4, 0x90
-LBB0_58:                                ##   in Loop: Header=BB0_19 Depth=1
-	movq	%rax, %rcx
-	addq	$1, %rcx
-	movq	%rcx, %rax
-	cmpq	-72(%rbp), %rcx         ## 8-byte Folded Reload
-	jne	LBB0_19
-LBB0_59:
-	movq	_var_false@GOTPCREL(%rip), %rax
-	cmpl	$0, (%rax)
-	je	LBB0_61
-## %bb.60:
-	movq	%rbx, %rdi
-	vzeroupper
-	callq	_dummy
-	movq	-80(%rbp), %rdi         ## 8-byte Reload
-	callq	_dummy
-	leaq	-56(%rbp), %rdi
-	callq	_dummy
-LBB0_61:
-	xorl	%eax, %eax
-	addq	$408, %rsp              ## imm = 0x198
-	popq	%rbx
-	popq	%r12
-	popq	%r13
-	popq	%r14
-	popq	%r15
-	popq	%rbp
-	vzeroupper
-	retq
-	.cfi_endproc
-                                        ## -- End function
-
-.subsections_via_symbols
--- a/tests/testfiles/taxCalc-ivb-iaca
+++ b/tests/testfiles/taxCalc-ivb-iaca
--- a/tests/testfiles/taxCalc-ivb-iaca.S
+++ b/tests/testfiles/taxCalc-ivb-iaca.S
@@ -1,196 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
-# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
-	.file "taxCalc.c"
-	.text
-..TXTST0:
-# -- Begin  main
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl main
-# --- main(void)
-main:
-..B1.1:                         # Preds ..B1.0
-	.cfi_startproc
-..___tag_value_main.1:
-..L2:
-                                                          #4.15
-        pushq     %rbp                                          #4.15
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbp                                    #4.15
-	.cfi_def_cfa 6, 16
-	.cfi_offset 6, -16
-        andq      $-128, %rsp                                   #4.15
-        subq      $4096, %rsp                                   #4.15
-        movl      $104446, %esi                                 #4.15
-        movl      $3, %edi                                      #4.15
-        call      __intel_new_feature_proc_init                 #4.15
-                                # LOE rbx r12 r13 r14 r15
-..B1.10:                        # Preds ..B1.1
-        vstmxcsr  (%rsp)                                        #4.15
-        movl      $.2.3_2_kmpc_loc_struct_pack.3, %edi          #4.15
-        xorl      %esi, %esi                                    #4.15
-        orl       $32832, (%rsp)                                #4.15
-        xorl      %eax, %eax                                    #4.15
-        vldmxcsr  (%rsp)                                        #4.15
-..___tag_value_main.6:
-        call      __kmpc_begin                                  #4.15
-..___tag_value_main.7:
-                                # LOE rbx r12 r13 r14 r15
-..B1.2:                         # Preds ..B1.10
-        movl      $il0_peep_printf_format_0, %edi               #5.5
-        call      puts                                          #5.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.3:                         # Preds ..B1.2
-        vmovss    .L_2il0floatpacket.0(%rip), %xmm0             #8.15
-        xorl      %eax, %eax                                    #11.5
-        vxorps    %xmm1, %xmm1, %xmm1                           #9.5
-        vmovss    %xmm1, (%rsp)                                 #9.5
-
-        movl $111,%ebx
-       .byte 100,103,144
-
-..B1.4:                         # Preds ..B1.4 ..B1.3
-        lea       1(%rax,%rax), %edx                            #12.9
-        vcvtsi2ss %edx, %xmm2, %xmm2                            #12.27
-        vmulss    %xmm2, %xmm0, %xmm3                           #12.29
-        lea       2(%rax,%rax), %ecx                            #12.9
-        vaddss    %xmm3, %xmm1, %xmm4                           #12.29
-        vxorps    %xmm1, %xmm1, %xmm1                           #12.27
-        vcvtsi2ss %ecx, %xmm1, %xmm1                            #12.27
-        vmulss    %xmm1, %xmm0, %xmm5                           #12.29
-        vmovss    %xmm4, 4(%rsp,%rax,8)                         #12.9
-        vaddss    %xmm5, %xmm4, %xmm1                           #12.29
-        vmovss    %xmm1, 8(%rsp,%rax,8)                         #12.9
-        incq      %rax                                          #11.5
-        cmpq      $499, %rax                                    #11.5
-        jb        ..B1.4        # Prob 99%                      #11.5
-                                # LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
-        movl $222,%ebx
-        .byte 100,103,144
-..B1.5:                         # Preds ..B1.4
-        vmovss    3992(%rsp), %xmm0                             #12.18
-        movl      $il0_peep_printf_format_1, %edi               #15.5
-        vaddss    .L_2il0floatpacket.1(%rip), %xmm0, %xmm1      #12.29
-        vmovss    %xmm1, 3996(%rsp)                             #12.9
-        call      puts                                          #15.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.6:                         # Preds ..B1.5
-        movl      $.2.3_2_kmpc_loc_struct_pack.14, %edi         #16.12
-        xorl      %eax, %eax                                    #16.12
-..___tag_value_main.8:
-        call      __kmpc_end                                    #16.12
-..___tag_value_main.9:
-                                # LOE rbx r12 r13 r14 r15
-..B1.7:                         # Preds ..B1.6
-        xorl      %eax, %eax                                    #16.12
-        movq      %rbp, %rsp                                    #16.12
-        popq      %rbp                                          #16.12
-	.cfi_def_cfa 7, 8
-	.cfi_restore 6
-        ret                                                     #16.12
-        .align    16,0x90
-	.cfi_endproc
-                                # LOE
-# mark_end;
-	.type	main,@function
-	.size	main,.-main
-	.data
-	.align 4
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.3:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.2
-	.align 4
-.2.3_2__kmpc_loc_pack.2:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	59
-	.space 1, 0x00 	# pad
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.14:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.13
-	.align 4
-.2.3_2__kmpc_loc_pack.13:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	59
-	.section .rodata.str1.4, "aMS",@progbits,1
-	.align 4
-	.align 4
-il0_peep_printf_format_0:
-	.long	1128354639
-	.long	1702109249
-	.long	1931506803
-	.long	1953653108
-	.byte	0
-	.space 3, 0x00 	# pad
-	.align 4
-il0_peep_printf_format_1:
-	.long	1128354639
-	.long	1702109249
-	.long	1696625779
-	.word	25710
-	.byte	0
-	.data
-# -- End  main
-	.section .rodata, "a"
-	.align 4
-	.align 4
-.L_2il0floatpacket.0:
-	.long	0x3e428f5c
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,4
-	.align 4
-.L_2il0floatpacket.1:
-	.long	0x433dcf5c
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,4
-	.data
-	.section .note.GNU-stack, ""
-// -- Begin DWARF2 SEGMENT .eh_frame
-	.section .eh_frame,"a",@progbits
-.eh_frame_seg:
-	.align 8
-# End
--- a/tests/testfiles/taxCalc-ivb-iaca2.S
+++ b/tests/testfiles/taxCalc-ivb-iaca2.S
@@ -1,201 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
-# mark_description "-I../../iaca-lin64/include -fno-alias -O3 -fopenmp -xCORE-AVX-I -S -o ivb-asm.S";
-	.file "taxCalc.c"
-	.text
-..TXTST0:
-# -- Begin  main
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl main
-# --- main(void)
-main:
-..B1.1:                         # Preds ..B1.0
-	.cfi_startproc
-..___tag_value_main.1:
-..L2:
-                                                          #4.15
-        pushq     %rbp                                          #4.15
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbp                                    #4.15
-	.cfi_def_cfa 6, 16
-	.cfi_offset 6, -16
-        andq      $-128, %rsp                                   #4.15
-        subq      $4096, %rsp                                   #4.15
-        movl      $104446, %esi                                 #4.15
-        movl      $3, %edi                                      #4.15
-        call      __intel_new_feature_proc_init                 #4.15
-                                # LOE rbx r12 r13 r14 r15
-..B1.10:                        # Preds ..B1.1
-        vstmxcsr  (%rsp)                                        #4.15
-        movl      $.2.3_2_kmpc_loc_struct_pack.3, %edi          #4.15
-        xorl      %esi, %esi                                    #4.15
-        orl       $32832, (%rsp)                                #4.15
-        xorl      %eax, %eax                                    #4.15
-        vldmxcsr  (%rsp)                                        #4.15
-..___tag_value_main.6:
-        call      __kmpc_begin                                  #4.15
-..___tag_value_main.7:
-                                # LOE rbx r12 r13 r14 r15
-..B1.2:                         # Preds ..B1.10
-        movl      $il0_peep_printf_format_0, %edi               #5.5
-        call      puts                                          #5.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.3:                         # Preds ..B1.2
-        vmovss    .L_2il0floatpacket.0(%rip), %xmm0             #8.15
-        xorl      %eax, %eax                                    #11.5
-        vxorps    %xmm1, %xmm1, %xmm1                           #9.5
-        vmovss    %xmm1, (%rsp)                                 #9.5
-
-                                # LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
-        movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     100        
-        .byte     103        
-        .byte     144        
-..B1.4:                         # Preds ..B1.4 ..B1.3
-        lea       1(%rax,%rax), %edx                            #12.9
-        vcvtsi2ss %edx, %xmm2, %xmm2                            #12.27
-        vmulss    %xmm2, %xmm0, %xmm3                           #12.29
-        lea       2(%rax,%rax), %ecx                            #12.9
-        vaddss    %xmm3, %xmm1, %xmm4                           #12.29
-        vxorps    %xmm1, %xmm1, %xmm1                           #12.27
-        vcvtsi2ss %ecx, %xmm1, %xmm1                            #12.27
-        vmulss    %xmm1, %xmm0, %xmm5                           #12.29
-        vmovss    %xmm4, 4(%rsp,%rax,8)                         #12.9
-        vaddss    %xmm5, %xmm4, %xmm1                           #12.29
-        vmovss    %xmm1, 8(%rsp,%rax,8)                         #12.9
-        incq      %rax                                          #11.5
-        cmpq      $499, %rax                                    #11.5
-        jb        ..B1.4        # Prob 99%                      #11.5
-        movl      $222, %ebx 
-        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-        .byte     103        
-        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
-                                # LOE rax rbx r12 r13 r14 r15 xmm0 xmm1
-
-..B1.5:                         # Preds ..B1.4
-        vmovss    3992(%rsp), %xmm0                             #12.18
-        movl      $il0_peep_printf_format_1, %edi               #15.5
-        vaddss    .L_2il0floatpacket.1(%rip), %xmm0, %xmm1      #12.29
-        vmovss    %xmm1, 3996(%rsp)                             #12.9
-        call      puts                                          #15.5
-                                # LOE rbx r12 r13 r14 r15
-..B1.6:                         # Preds ..B1.5
-        movl      $.2.3_2_kmpc_loc_struct_pack.14, %edi         #16.12
-        xorl      %eax, %eax                                    #16.12
-..___tag_value_main.8:
-        call      __kmpc_end                                    #16.12
-..___tag_value_main.9:
-                                # LOE rbx r12 r13 r14 r15
-..B1.7:                         # Preds ..B1.6
-        xorl      %eax, %eax                                    #16.12
-        movq      %rbp, %rsp                                    #16.12
-        popq      %rbp                                          #16.12
-	.cfi_def_cfa 7, 8
-	.cfi_restore 6
-        ret                                                     #16.12
-        .align    16,0x90
-	.cfi_endproc
-                                # LOE
-# mark_end;
-	.type	main,@function
-	.size	main,.-main
-	.data
-	.align 4
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.3:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.2
-	.align 4
-.2.3_2__kmpc_loc_pack.2:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	52
-	.byte	59
-	.byte	59
-	.space 1, 0x00 	# pad
-	.align 4
-.2.3_2_kmpc_loc_struct_pack.14:
-	.long	0
-	.long	2
-	.long	0
-	.long	0
-	.quad	.2.3_2__kmpc_loc_pack.13
-	.align 4
-.2.3_2__kmpc_loc_pack.13:
-	.byte	59
-	.byte	117
-	.byte	110
-	.byte	107
-	.byte	110
-	.byte	111
-	.byte	119
-	.byte	110
-	.byte	59
-	.byte	109
-	.byte	97
-	.byte	105
-	.byte	110
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	49
-	.byte	54
-	.byte	59
-	.byte	59
-	.section .rodata.str1.4, "aMS",@progbits,1
-	.align 4
-	.align 4
-il0_peep_printf_format_0:
-	.long	1128354639
-	.long	1702109249
-	.long	1931506803
-	.long	1953653108
-	.byte	0
-	.space 3, 0x00 	# pad
-	.align 4
-il0_peep_printf_format_1:
-	.long	1128354639
-	.long	1702109249
-	.long	1696625779
-	.word	25710
-	.byte	0
-	.data
-# -- End  main
-	.section .rodata, "a"
-	.align 4
-	.align 4
-.L_2il0floatpacket.0:
-	.long	0x3e428f5c
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,4
-	.align 4
-.L_2il0floatpacket.1:
-	.long	0x433dcf5c
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,4
-	.data
-	.section .note.GNU-stack, ""
-// -- Begin DWARF2 SEGMENT .eh_frame
-	.section .eh_frame,"a",@progbits
-.eh_frame_seg:
-	.align 8
-# End
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,5 @@
 [tox]
-envlist = py35
+envlist = py35,py36
 [testenv]
 commands=
    python tests/all_tests.py
-#    osaca --arch ivb --iaca examples/taxCalc-ivb-iaca
-#    osaca --arch ivb --iaca examples/taxCalc-ivb-iaca.S
-#    osaca --arch ivb examples/taxCalc-ivb