Compare commits

...

85 Commits

Author SHA1 Message Date
Julian Hammer
4ff8fdc4ab version bump 2020-11-11 15:14:27 +01:00
JanLJL
c204096d74 fixed typo 2020-11-11 14:11:00 +01:00
JanLJL
dea217c12c fixed test after changing TP value of instruction 2020-11-11 14:04:07 +01:00
JanLJL
92c162daa2 new instructions 2020-11-11 13:54:23 +01:00
JanLJL
87ea8f0f0a new instructions 2020-11-11 12:27:49 +01:00
Julian Hammer
cb04efc384 fixed typo 2020-11-10 13:33:24 +01:00
JanLJL
14c0ea6180 bugfixes 2020-11-09 23:29:42 +01:00
Julian Hammer
314ff4cf9d improved performance of arch_semantics and reg dependency matching 2020-11-09 19:27:47 +01:00
Julian Hammer
f64253b2b9 added dict for instruction lookup 2020-11-09 17:00:46 +01:00
Julian Hammer
979d08358e singelton for isa parsers 2020-11-09 12:36:14 +01:00
Julian Hammer
a2dd6f752d added comment 2020-11-09 12:35:13 +01:00
Julian Hammer
2fb36406a7 performance improvement of throughput summation 2020-11-09 12:01:00 +01:00
Julian Hammer
94086033a8 added __main__.py 2020-11-09 08:27:31 +01:00
JanLJL
75edfc808a version bump 2020-11-06 20:40:13 +01:00
JanLJL
c8c077a834 enhanced length warning 2020-11-06 15:49:13 +01:00
JanLJL
26ee005adc added missing test file 2020-11-06 15:07:57 +01:00
JanLJL
207c53aaad minor bugfix in HW model and added user warnings for more insight 2020-11-06 15:06:36 +01:00
JanLJL
fafd7bc526 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2020-11-06 12:57:46 +01:00
JanLJL
b986d7eba0 added --lines option 2020-11-06 12:57:41 +01:00
Julian Hammer
6b0adb5d68 improved cache handing (always hashing original file) 2020-11-06 12:27:34 +01:00
JanLJL
f9f382a948 bugfixes 2020-11-06 12:03:54 +01:00
Julian Hammer
c6b58c63ab Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-11-03 16:28:28 +01:00
Julian Hammer
78530bfdb0 fail-safed _build_cache.py 2020-11-03 16:28:07 +01:00
JanLJL
5aa0899961 added bdist 2020-11-03 16:10:46 +01:00
JanLJL
7f0abd7d10 version bump 2020-11-02 15:48:19 +01:00
JanLJL
9ba9bab107 try different ISA as fallback when parsing without --arch flag, use SKX as x86 default and enhanced ISA detection heuristic 2020-11-02 15:33:50 +01:00
Julian Hammer
983e66938c version bump 2020-10-29 13:15:23 +01:00
JanLJL
1c889fa785 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2020-10-29 13:00:09 +01:00
JanLJL
022598d94f autodetect ISA and default uarch for ISA 2020-10-29 13:00:02 +01:00
Julian
1f5c9d1c61 using travis-ci.com badge 2020-10-29 12:45:39 +01:00
JanLJL
30e0ad038d ignore pickles in data/ and support py3.9 2020-10-29 11:06:20 +01:00
Julian Hammer
decec86e56 fixed py3.5 compatability 2020-10-29 10:59:00 +01:00
JanLJL
9af689b28c fixed bug in tests and removed unused imports 2020-10-28 19:29:48 +01:00
Julian Hammer
3aea3f2b49 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2020-10-28 17:16:43 +01:00
Julian Hammer
a6cb09cf1f added cache files to package and building during setup 2020-10-28 17:16:03 +01:00
Julian Hammer
9d2ea8603f new caching structure with support for distribution 2020-10-28 16:29:55 +01:00
JanLJL
a7918db145 enhanced hanlding for immediates with shifting 2020-10-21 12:14:21 +02:00
Julian Hammer
b5b1a1f2b2 version bump 2020-10-20 14:36:43 +02:00
Julian
dd59af16b2 Merge pull request #51 from RRZE-HPC/A64FX
A64FX support and several Arm bugfixes and enhancements including better TP scheduling
2020-10-16 10:44:47 +02:00
JanLJL
d9325724e2 removed duplicate cmp entry 2020-10-16 10:11:51 +02:00
JanLJL
7e7269c2bc refactored operand checking in post-processing 2020-10-16 10:05:08 +02:00
JanLJL
c64a24ae1b no \t replacement before any other point than user output 2020-10-16 09:44:18 +02:00
JanLJL
e8b78e4cc6 Merge branch 'master' into A64FX 2020-10-15 22:44:12 +02:00
JanLJL
cd5a706f56 adjusted tests for AArch64 2020-10-15 17:56:08 +02:00
Jan
13426358d0 Merge pull request #50 from RRZE-HPC/fix/increment_handling
Fixing Increment Handling
2020-10-15 17:00:11 +02:00
Julian Hammer
c80088b628 Merge branch 'master' into fix/increment_handling 2020-10-15 16:36:29 +02:00
Julian Hammer
748474cd81 added more cmp versions 2020-10-15 16:23:14 +02:00
Julian Hammer
2fec0bf810 Merge branch 'master' into fix/increment_handling 2020-10-15 13:55:34 +02:00
Julian Hammer
711a41d18e extended and cleaned up marker tests 2020-10-15 13:54:18 +02:00
Julian Hammer
cf4a9cddcb Merge branch 'master' into fix/increment_handling 2020-10-15 13:17:02 +02:00
Julian Hammer
5a5a1e74f5 added CMP to aarch64 to exclude first op from destinations 2020-10-15 13:15:54 +02:00
Julian Hammer
4865e7ea72 fixed ignoring of last line without end marker 2020-10-15 11:59:51 +02:00
Julian Hammer
d03398ddf9 treating post- and pre-incremeted memory references no longer as src_dst
the incremented register is now considered src_dst instead
2020-10-13 19:25:29 +02:00
Julian Hammer
edb8df3205 considering split AVX loads on SNB and IVB 2020-10-13 11:25:13 +02:00
Julian Hammer
489050723c removed a nother set of no-maker tests 2020-10-13 09:03:13 +02:00
Julian Hammer
0cc0d35ce9 removed maker missing tests 2020-10-12 19:34:04 +02:00
Julian Hammer
7f65bdb022 version bump 2020-10-12 15:39:49 +02:00
Julian Hammer
04360cc897 fixed label identifiers by splitting 2020-10-12 15:39:32 +02:00
Julian Hammer
5e7a12f9bb paranthesis now suppored in identifier strings 2020-10-12 15:05:52 +02:00
Julian Hammer
1def12ee79 if not markes were found, use whole code 2020-10-12 15:04:55 +02:00
Julian Hammer
7269156854 added --out argument 2020-10-12 15:04:18 +02:00
Julian Hammer
d6529ced73 fixed push and added pop 2020-10-12 15:03:03 +02:00
Julian Hammer
eac728dc9f added tx2 support for ldp d1, d2, [x3] 2020-10-07 13:57:57 +02:00
JanLJL
451ba62959 added vector mov 2020-09-23 10:07:43 +02:00
JanLJL
57cf1bfe6f Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-09-17 22:28:56 +02:00
JanLJL
44b921aa73 added BS4 dependency 2020-09-17 22:27:37 +02:00
JanLJL
accb52ce53 Merge branch 'master' of github.com:RRZE-HPC/osaca 2020-09-17 22:15:20 +02:00
JanLJL
9e78f85475 added instructions 2020-09-17 22:14:14 +02:00
JanLJL
64da89ec3d enhancecd ARM identifier to support immediate offsets 2020-09-17 22:12:12 +02:00
JanLJL
adeae88665 instr update 2020-09-17 21:21:15 +02:00
JanLJL
1698ed1776 gather enhancement 2020-09-03 13:48:00 +02:00
JanLJL
2ef6051e64 added gather load instruction 2020-09-03 09:30:19 +02:00
Julian Hammer
bd61b94669 ignoring b.none branched in basic block detection 2020-08-03 19:23:33 +02:00
JanLJL
addcdeda85 added sve instructions 2020-08-03 08:55:37 +02:00
JanLJL
23d36a651b enhancements for SVE support 2020-08-03 08:54:59 +02:00
JanLJL
b052ab4151 bugfix in OoO scheduling 2020-07-28 14:52:30 +02:00
JanLJL
673da99fba minor enhancements for scheduling 2020-07-23 15:55:56 +02:00
JanLJL
6c72281d65 prepared for aarch64 8.2 support 2020-07-23 15:54:54 +02:00
JanLJL
5520362e65 adjustments and bugfixes 2020-07-13 18:53:19 +02:00
JanLJL
93060eee43 Merge branch 'master' into A64FX 2020-07-13 14:41:49 +02:00
JanLJL
0e77b7bc9a enhanced TP scheduling 2020-07-06 18:49:46 +02:00
JanLJL
ce8c3ff9ab bugfixes for A64FX 2020-07-06 18:48:54 +02:00
JanLJL
6294e2e9da initial commit for trying to support a64fx 2020-06-26 05:20:40 +02:00
JanLJL
5258d65c8e few more instructions 2020-06-24 17:41:30 +02:00
JanLJL
379fe80169 added initial support for Intel Ice Lake (ICL) 2020-06-22 22:15:14 +02:00
39 changed files with 72281 additions and 53390 deletions

.gitignore (vendored): 2 changed lines

@@ -1,5 +1,5 @@
# OSACA specific files and folders
osaca/taxCalc/
*.*.pickle
# Byte-compiled / optimized / DLL files
__pycache__/

.travis.yml

@@ -3,9 +3,9 @@ language: python
python:
- "3.5"
- "3.6"
# Python 3.7 not working yet
- "3.7"
- "3.8"
- "3.9"
before_install:
# - pip install tox-travis
- pip install codecov
@@ -26,7 +26,7 @@ deploy:
username: "__token__"
password:
secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
distributions: sdist
distributions: "sdist bdist_wheel"
skip_existing: true
cleanup: false
on:

MANIFEST.in

@@ -2,6 +2,8 @@ include README.rst
include LICENSE
include tox.ini
recursive-include osaca/data/ *.yml
recursive-include osaca/data/ *.pickle
include osaca/data/_build_cache.py
include examples/*
recursive-include tests *.py *.out
recursive-include tests/testfiles/ *

README.rst

@@ -10,8 +10,8 @@ Open Source Architecture Code Analyzer
For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection for critical path and loop-carried dependencies.
.. image:: https://travis-ci.org/RRZE-HPC/OSACA.svg?branch=master
:target: https://travis-ci.org/RRZE-HPC/OSACA
.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?branch=master
:target: https://travis-ci.com/github/RRZE-HPC/OSACA
:alt: Build Status
.. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
@@ -57,8 +57,12 @@ Additional requirements are:
- `Python3 <https://www.python.org/>`__
- `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
Optional requirements are:
- `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
- `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
- `BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ for scraping instruction form information for the x86 ISA (experimental)
Design
======
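To make the description above concrete, here is a minimal sketch of the analysis pipeline that the osaca.py changes further down wire together; the file name 'kernel.s' and the choice of SKX are illustrative assumptions, not part of this changeset.

# Minimal sketch, assuming 'kernel.s' exists and carries OSACA/IACA markers.
from osaca.parser import get_parser
from osaca.semantics import ArchSemantics, MachineModel, reduce_to_section

arch = 'SKX'                                      # assumed micro-architecture
isa = MachineModel.get_isa_for_arch(arch)         # 'x86'
parser = get_parser(isa)
with open('kernel.s') as f:
    parsed_code = parser.parse_file(f.read())
kernel = reduce_to_section(parsed_code, isa)      # reduce to the marked section
semantics = ArchSemantics(MachineModel(arch=arch))
semantics.add_semantics(kernel)                   # attach throughput/latency/port data
print(ArchSemantics.get_throughput_sum(kernel))   # per-port cycle sums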

osaca/__init__.py

@@ -1,6 +1,6 @@
"""Open Source Architecture Code Analyzer"""
name = 'osaca'
__version__ = '0.3.6'
__version__ = '0.3.12'
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version___

osaca/__main__.py (new file): 4 added lines

@@ -0,0 +1,4 @@
#!/usr/bin/env python3
from .osaca import main
main()

osaca/api.py

@@ -5,7 +5,7 @@ import sys
from io import StringIO
from osaca.frontend import Frontend
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.parser import ParserAArch64, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
ArchSemantics, reduce_to_section)
@@ -29,7 +29,7 @@ class KerncraftAPI(object):
self.semantics = ArchSemantics(self.machine_model)
isa = self.machine_model.get_ISA().lower()
if isa == 'aarch64':
self.parser = ParserAArch64v81()
self.parser = ParserAArch64()
elif isa == 'x86':
self.parser = ParserX86ATT()

osaca/data/_build_cache.py (new executable file): 31 added lines

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
from glob import glob
import os.path
import sys
sys.path[0:0] = ['../..']
failed = False
try:
from osaca.semantics.hw_model import MachineModel
except ModuleNotFoundError:
print("Unable to import MachineModel, probably some dependency is not yet installed. SKIPPING. "
"First run of OSACA may take a while to build caches, subsequent runs will be as fast as "
"ever.")
sys.exit()
print('Building cache: ', end='')
sys.stdout.flush()
# Iterating architectures
for f in glob(os.path.join(os.path.dirname(__file__), '*.yml')):
MachineModel(path_to_yaml=f)
print('.', end='')
sys.stdout.flush()
# Iterating ISAs
for f in glob(os.path.join(os.path.dirname(__file__), 'isa/*.yml')):
MachineModel(path_to_yaml=f)
print('+', end='')
sys.stdout.flush()
print()
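Each MachineModel(path_to_yaml=f) call above triggers the new pickle caching implemented in osaca/semantics/hw_model.py further down; a small sketch of the companion-file naming it uses, with a hypothetical model path:

# Companion cache path as built in hw_model.py (the .yml path is a hypothetical example).
import hashlib
from pathlib import Path

p = Path('osaca/data/tx2.yml')
hexhash = hashlib.sha256(p.read_bytes()).hexdigest()
companion_cachefile = p.with_name('.' + p.stem + '_' + hexhash).with_suffix('.pickle')
print(companion_cachefile)                        # osaca/data/.tx2_<sha256>.pickle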

osaca/data/a64fx.yml (new file): 1339 added lines. File diff suppressed because it is too large.

osaca/data/icl.yml (new file): 36318 added lines. File diff suppressed because it is too large.

osaca/data/isa/aarch64.yml

@@ -1,4 +1,4 @@
osaca_version: 0.3.4
osaca_version: 0.3.7
isa: "AArch64"
# Contains all operand-irregular instruction forms OSACA supports for AArch64.
# Operand-regular for a AArch64 instruction form with N operands in the shape of
@@ -37,48 +37,10 @@ instruction_forms:
offset: "*"
index: "*"
scale: "*"
pre-indexed: false
post-indexed: false
pre-indexed: "*"
post-indexed: "*"
source: true
destination: false
- name: ldp
operands:
- class: "register"
prefix: "*"
source: false
destination: true
- class: "register"
prefix: "*"
source: false
destination: true
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
pre-indexed: false
post-indexed: true
source: true
destination: true
- name: ldp
operands:
- class: "register"
prefix: "*"
source: false
destination: true
- class: "register"
prefix: "*"
source: false
destination: true
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
pre-indexed: true
post-indexed: false
source: true
destination: true
- name: [ldr, ldur]
operands:
- class: "register"
@@ -90,25 +52,10 @@ instruction_forms:
offset: "*"
index: "*"
scale: "*"
pre-indexed: false
post-indexed: true
pre-indexed: "*"
post-indexed: "*"
source: true
destination: true
- name: [ldr, ldur]
operands:
- class: "register"
prefix: "*"
source: false
destination: true
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
pre-indexed: true
post-indexed: false
source: true
destination: true
destination: false
- name: stp
operands:
- class: "register"
@@ -124,46 +71,8 @@ instruction_forms:
offset: "*"
index: "*"
scale: "*"
pre-indexed: false
post-indexed: false
source: false
destination: true
- name: stp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "register"
prefix: "*"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
pre-indexed: true
post-indexed: false
source: false
destination: true
- name: stp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "register"
prefix: "*"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
pre-indexed: false
post-indexed: true
pre-indexed: "*"
post-indexed: "*"
source: false
destination: true
- name: [str, stur]
@@ -181,3 +90,73 @@ instruction_forms:
post-indexed: "*"
source: false
destination: true
- name: cmp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "register"
prefix: "*"
source: true
destination: false
- name: cmp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "immediate"
imd: "int"
source: true
destination: false
- name: cmn
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "register"
prefix: "*"
source: true
destination: false
- name: cmn
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "immediate"
imd: "int"
source: true
destination: false
- name: fcmp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "register"
prefix: "*"
source: true
destination: false
- name: fcmp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "immediate"
imd: "double"
source: true
destination: false
- name: fcmp
operands:
- class: "register"
prefix: "*"
source: true
destination: false
- class: "immediate"
imd: "float"
source: true
destination: false

osaca/data/isa/x86.yml

@@ -3167,7 +3167,7 @@ instruction_forms:
destination: false
hidden_operands:
- class: "memory"
base: "gpr"
base: {name: 'rsp'}
offset: ~
index: ~
scale: 1
@@ -3177,11 +3177,29 @@ instruction_forms:
name: "rsp"
source: true
destination: true
- name: pop
operands:
- class: "register"
name: "gpr"
source: false
destination: true
hidden_operands:
- class: "memory"
base: {name: 'rsp'}
offset: ~
index: ~
scale: 1
source: true
destination: false
- class: "register"
name: "rsp"
source: true
destination: true
- name: pushfq
operands: []
hidden_operands:
- class: "memory"
base: "gpr"
base: {name: 'rsp'}
offset: ~
index: ~
scale: 1
@@ -3971,4 +3989,3 @@ instruction_forms:
name: "gpr"
source: true
destination: true

File diff suppressed because it is too large.

View File

@@ -219,9 +219,15 @@ def extract_model(tree, arch, skip_mem=True):
port_23 = True
if '4' in pp[1]:
port_4 = True
# Add (1, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
# Add (X, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
# X = 2 on SNB and IVB IFF used in combination with ymm register, otherwise X = 1
if arch.upper() in ['SNB', 'IVB'] and \
any([p['class'] == 'register' and p['name'] == 'ymm' for p in parameters]):
data_port_throughput = 2
else:
data_port_throughput = 1
if port_23 and not port_4:
port_pressure.append((1, ['2D', '3D']))
port_pressure.append((data_port_throughput, ['2D', '3D']))
# Add missing ports:
for ports in [pp[1] for pp in port_pressure]:
@@ -275,7 +281,7 @@ def main():
if model is not None:
print(
rhs_comment(
model.dump(), basename + " " + args.xml.split('/')[-1] + " " + args.arch
model.dump(), "uops.info import"
)
)
else:

File diff suppressed because it is too large.

View File

@@ -80,24 +80,114 @@ instruction_forms:
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: and
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: and
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: register
prefix: w
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: and
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: and
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: mul
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: register
prefix: x
throughput: 1.0
latency: 4.0 # 1*p1
port_pressure: [[1, '1']]
- name: mul
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: register
prefix: w
throughput: 1.0
latency: 4.0 # 1*p1
port_pressure: [[1, '1']]
- name: b.ne
operands:
- class: identifier
throughput: 0.0
throughput: 1.0
latency: 0.0
port_pressure: []
port_pressure: [[1, '2']]
- name: b.lt
operands:
- class: identifier
throughput: 1.0
latency: 0.0
port_pressure: [[1, '2']]
- name: b.hs
operands:
- class: identifier
throughput: 1.0
latency: 0.0
port_pressure: [[1, '2']]
- name: b.eq
operands:
- class: identifier
throughput: 1.0
latency: 0.0
port_pressure: [[1, '2']]
- name: b
operands:
- class: identifier
throughput: 1.0
latency: 0.0
port_pressure: [[1, '2']]
- name: b.gt
operands:
- class: identifier
throughput: 0.0
throughput: 1.0
latency: 0.0
port_pressure: []
port_pressure: [[1, '2']]
- name: bne
operands:
- class: identifier
throughput: 0.0
throughput: 1.0
latency: 0.0
port_pressure: []
port_pressure: [[1, '2']]
- name: cmp
operands:
- class: register
@@ -107,6 +197,15 @@ instruction_forms:
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: cmp
operands:
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: cmp
operands:
- class: register
@@ -126,6 +225,17 @@ instruction_forms:
throughput: 0.5
latency: 5.0 # 1*p01
port_pressure: [[1, '01']]
- name: dup
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
throughput: 0.5
latency: 5.0 # 1*p01
port_pressure: [[1, '01']]
- name: fadd
operands:
- class: register
@@ -323,6 +433,28 @@ instruction_forms:
throughput: 0.5
latency: 6.0 # 1*p01
port_pressure: [[1, '01']]
- name: lsl
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: lsl
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: ldp
operands:
- class: register
@@ -403,6 +535,22 @@ instruction_forms:
throughput: 1.0
latency: 4.0 # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 4.0 # 2*p34
port_pressure: [[2.0, '34']]
- name: ldp
operands:
- class: register
@@ -505,6 +653,15 @@ instruction_forms:
throughput: 0.5
latency: 4.0 # 1*p34
port_pressure: [[1.0, '34']]
- name: ldr
operands:
- class: register
prefix: w
- class: register
prefix: w
throughput: 0.0
latency: 0.0
port_pressure: []
- name: ldr
operands:
- class: register
@@ -532,15 +689,42 @@ instruction_forms:
throughput: 0.0
latency: 0.0
port_pressure: []
- name: mov
operands:
- class: register
prefix: w
- class: immediate
imd: int
throughput: 0.333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: mov
operands:
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: mov
operands:
- class: register
prefix: w
- class: register
prefix: w
throughput: 0.333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: mov
operands:
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.5
latency: 1.0 # 1*p01
port_pressure: [[1, '01']]
throughput: 0.333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: mov
operands:
- class: register
@@ -568,6 +752,43 @@ instruction_forms:
throughput: ~
latency: ~
port_pressure: []
- name: ret
operands: []
throughput: 0.5
latency: ~ # 1*p34
port_pressure: [[1, '34']]
- name: stp
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 0 # 2*p34+1*p5
port_pressure: [[2, '34'], [1, '5']]
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 0 # 2*p34+1*p5
port_pressure: [[2, '34'], [1, '5']]
- name: stp
operands:
- class: register
@@ -644,6 +865,20 @@ instruction_forms:
throughput: 1.0
latency: 4.0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
prefix: w
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5']]
- name: str
operands:
- class: register
@@ -728,6 +963,39 @@ instruction_forms:
throughput: 1.0
latency: 0 # 1*p34+1*p5
port_pressure: [[1.0, '34'], [1.0, '5'], [1, '012']]
- name: subs
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: subs
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: sub
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: immediate
imd: int
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: sub
operands:
- class: register
@@ -739,3 +1007,25 @@ instruction_forms:
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: sub
operands:
- class: register
prefix: w
- class: register
prefix: w
- class: register
prefix: w
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]
- name: sub
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: register
prefix: x
throughput: 0.33333333
latency: 1.0 # 1*p012
port_pressure: [[1, '012']]

osaca/frontend.py

@@ -76,7 +76,7 @@ class Frontend(object):
self._get_flag_symbols(instruction_form['flags'])
if instruction_form['instruction'] is not None
else ' ',
instruction_form['line'].strip(),
instruction_form['line'].strip().replace('\t', ' '),
)
line = line if show_lineno else col_sep + col_sep.join(line.split(col_sep)[1:])
if show_cmnts is False and self._is_comment(instruction_form):
@@ -138,13 +138,13 @@ class Frontend(object):
separator,
sum([instr_form['latency_lcd'] for instr_form in dep_dict[dep]['dependencies']]),
separator,
dep_dict[dep]['root']['line'],
dep_dict[dep]['root']['line'].strip(),
separator,
[node['line_number'] for node in dep_dict[dep]['dependencies']],
)
return s
def full_analysis(self, kernel, kernel_dg: KernelDG, ignore_unknown=False, verbose=False):
def full_analysis(self, kernel, kernel_dg: KernelDG, ignore_unknown=False, arch_warning=False, length_warning=False, verbose=False):
"""
Build the full analysis report including header, the symbol map, the combined TP/CP/LCD
view and the list based LCD view.
@@ -156,11 +156,16 @@ class Frontend(object):
:param ignore_unknown: flag for ignore warning if performance data is missing, defaults to
`False`
:type ignore_unknown: boolean, optional
:param print_arch_warning: flag for additional user warning to specify micro-arch
:type print_arch_warning: boolean, optional
:param print_length_warning: flag for additional user warning to specify kernel length with --lines
:type print_length_warning: boolean, optional
:param verbose: flag for verbosity level, defaults to False
:type verbose: boolean, optional
"""
return (
self._header_report()
+ self._user_warnings(arch_warning, length_warning)
+ self._symbol_map()
+ self.combined_view(
kernel,
@@ -246,7 +251,7 @@ class Frontend(object):
self._get_flag_symbols(instruction_form['flags'])
if instruction_form['instruction'] is not None
else ' ',
instruction_form['line'].strip(),
instruction_form['line'].strip().replace('\t', ' '),
)
s += '\n'
# check for unknown instructions and throw warning if called without --ignore-unknown
@@ -285,6 +290,27 @@ class Frontend(object):
).format(amount, '-' * len(str(amount)))
return s
def _user_warnings(self, arch_warning, length_warning):
"""Returns warning texts for giving the user more insight in what he is doing."""
arch_text = (
'WARNING: No micro-architecture was specified and a default uarch was used.\n'
' Specify the uarch with --arch. See --help for more information.\n'
)
length_text = (
'WARNING: You are analyzing a large amount of instruction forms. Analysis '
'across loops/block boundaries often do not make much sense.\n'
' Specify the kernel length with --length. See --help for more '
'information.\n'
' If this is intentional, you can safely ignore this message.\n'
)
warnings = ''
warnings += arch_text if arch_warning else ''
warnings += length_text if length_warning else ''
warnings += '\n'
return warnings
def _get_separator_list(self, separator, separator_2=' '):
"""Creates column view for seperators in the TP/combined view."""
separator_list = []

osaca/osaca.py

@@ -5,19 +5,33 @@ import io
import os
import re
import sys
import traceback
from osaca.db_interface import import_benchmark_output, sanity_check
from osaca.frontend import Frontend
from osaca.parser import BaseParser, ParserAArch64v81, ParserX86ATT
from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, ArchSemantics, KernelDG,
MachineModel, reduce_to_section)
MODULE_DATA_DIR = os.path.join(
os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
)
LOCAL_OSACA_DIR = os.path.join(os.path.expanduser('~') + '/.osaca/')
DATA_DIR = os.path.join(LOCAL_OSACA_DIR, 'data/')
SUPPORTED_ARCHS = ['SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'CSX', 'ZEN1', 'ZEN2', 'TX2', 'N1']
SUPPORTED_ARCHS = [
'SNB',
'IVB',
'HSW',
'BDW',
'SKX',
'CSX',
'ICL',
'ZEN1',
'ZEN2',
'TX2',
'N1',
'A64FX',
]
DEFAULT_ARCHS = {
'aarch64': 'A64FX',
'x86': 'SKX',
}
# Stolen from pip
@@ -71,7 +85,8 @@ def create_parser(parser=None):
parser.add_argument(
'--arch',
type=str,
help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ZEN1, ZEN2, TX2, N1).',
help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, '
'A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.',
)
parser.add_argument(
'--fixed',
@@ -79,6 +94,13 @@ def create_parser(parser=None):
help='Run the throughput analysis with fixed probabilities for all suitable ports per '
'instruction. Otherwise, OSACA will print the optimal port utilization for the kernel.',
)
parser.add_argument(
'--lines',
type=str,
help='Define lines that should be included in the analysis. This option overwrites any'
' range defined by markers in the assembly. Add either single lines or ranges defined by'
' "-" or ":", each entry separated by commas, e.g.: --lines 1,2,8-18,20:24',
)
parser.add_argument(
'--db-check',
dest='check_db',
@@ -128,6 +150,12 @@ def create_parser(parser=None):
parser.add_argument(
'--verbose', '-v', action='count', default=0, help='Increases verbosity level.'
)
parser.add_argument(
'--out', '-o',
default=sys.stdout,
type=argparse.FileType('w'),
help='Write analysis to this file (default to stdout).'
)
parser.add_argument(
'file', type=argparse.FileType('r'), help='Path to object (ASM or instruction file).'
)
@@ -144,7 +172,12 @@ def check_arguments(args, parser):
"""
supported_import_files = ['ibench', 'asmbench']
if 'arch' in args and (args.arch is None or args.arch.upper() not in SUPPORTED_ARCHS):
if args.arch is None and (args.check_db or 'import_data' in args):
parser.error(
'DB check and data import cannot work with a default microarchitecture. '
'Please see --help for all valid architecture codes.'
)
elif args.arch is not None and args.arch.upper() not in SUPPORTED_ARCHS:
parser.error(
'Microarchitecture not supported. Please see --help for all valid architecture codes.'
)
@@ -221,19 +254,41 @@ def inspect(args, output_file=sys.stdout):
:param output_file: Define the stream for output, defaults to :class:`sys.stdout`
:type output_file: stream, optional
"""
arch = args.arch
# Read file
code = args.file.read()
# Detect ISA if necessary
arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
print_arch_warning = False if args.arch else True
isa = MachineModel.get_isa_for_arch(arch)
verbose = args.verbose
ignore_unknown = args.ignore_unknown
# Read file
code = args.file.read()
# Parse file
parser = get_asm_parser(arch)
parsed_code = parser.parse_file(code)
try:
parsed_code = parser.parse_file(code)
except:
# probably the wrong parser based on heuristic
if args.arch is None:
# change ISA and try again
arch = DEFAULT_ARCHS['x86'] if BaseParser.detect_ISA(code) == 'aarch64' else DEFAULT_ARCHS['aarch64']
isa = MachineModel.get_isa_for_arch(arch)
parser = get_asm_parser(arch)
parsed_code = parser.parse_file(code)
else:
traceback.print_exc(file=sys.stderr)
sys.exit(1)
# Reduce to marked kernel and add semantics
kernel = reduce_to_section(parsed_code, isa)
# Reduce to marked kernel or chosen section and add semantics
if args.lines:
line_range = get_line_range(args.lines)
kernel = [line for line in parsed_code if line['line_number'] in line_range]
print_length_warning = False
else:
kernel = reduce_to_section(parsed_code, isa)
# Print warning if kernel has no markers and is larger than threshold (100)
print_length_warning = True if len(kernel) == len(parsed_code) and len(kernel) > 100 else False
machine_model = MachineModel(arch=arch)
semantics = ArchSemantics(machine_model)
semantics.add_semantics(kernel)
@@ -249,7 +304,12 @@ def inspect(args, output_file=sys.stdout):
frontend = Frontend(args.file.name, arch=arch)
print(
frontend.full_analysis(
kernel, kernel_graph, ignore_unknown=ignore_unknown, verbose=verbose
kernel,
kernel_graph,
ignore_unknown=ignore_unknown,
arch_warning=print_arch_warning,
length_warning=print_length_warning,
verbose=verbose
),
file=output_file,
)
@@ -292,7 +352,7 @@ def get_asm_parser(arch) -> BaseParser:
if isa == 'x86':
return ParserX86ATT()
elif isa == 'aarch64':
return ParserAArch64v81()
return ParserAArch64()
def get_unmatched_instruction_ratio(kernel):
@@ -306,13 +366,26 @@ def get_unmatched_instruction_ratio(kernel):
unmatched_counter += 1
return unmatched_counter / len(kernel)
def get_line_range(line_str):
line_str = line_str.replace(':', '-')
lines = line_str.split(',')
lines_int = []
for l in lines:
if '-' in l:
start = int(l.split('-')[0])
end = int(l.split('-')[1])
rnge = list(range(start, end+1))
lines_int += rnge
else:
lines_int.append(int(l))
return lines_int
def main():
"""Initialize and run command line interface."""
parser = create_parser()
args = parser.parse_args()
check_arguments(args, parser)
run(args)
run(args, output_file=args.out)
if __name__ == '__main__':
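A worked example for the new --lines handling; the numbers are arbitrary and only show how get_line_range() above expands ranges before inspect() filters parsed lines by their 'line_number':

# Derived from get_line_range() above; ':' and '-' ranges are inclusive.
from osaca.osaca import get_line_range

assert get_line_range('1,2,8-10,20:22') == [1, 2, 8, 9, 10, 20, 21, 22]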

osaca/parser/__init__.py

@@ -6,14 +6,14 @@ Only the parser below will be exported, so please add new parsers to __all__.
from .attr_dict import AttrDict
from .base_parser import BaseParser
from .parser_x86att import ParserX86ATT
from .parser_AArch64v81 import ParserAArch64v81
from .parser_AArch64 import ParserAArch64
__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64v81', 'get_parser']
__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64', 'get_parser']
def get_parser(isa):
if isa.lower() == 'x86':
return ParserX86ATT()
elif isa.lower() == 'aarch64':
return ParserAArch64v81()
return ParserAArch64()
else:
raise ValueError("Unknown ISA {!r}.".format(isa))

osaca/parser/base_parser.py

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
"""Parser superclass of specific parsers."""
import operator
import re
class BaseParser(object):
# Identifiers for operand types
@@ -8,14 +9,35 @@ class BaseParser(object):
DIRECTIVE_ID = 'directive'
IMMEDIATE_ID = 'immediate'
LABEL_ID = 'label'
IDENTIFIER_ID = 'identifier'
MEMORY_ID = 'memory'
REGISTER_ID = 'register'
SEGMENT_EXT_ID = 'segment_extension'
INSTRUCTION_ID = 'instruction'
OPERANDS_ID = 'operands'
_parser_constructed = False
def __init__(self):
self.construct_parser()
if not self._parser_constructed:
self.construct_parser()
self._parser_constructed = True
@staticmethod
def detect_ISA(file_content):
"""Detect the ISA of the assembly based on the used registers and return the ISA code."""
# Check for the amount of registers in the code to determine the ISA
# 1) Check for xmm, ymm, zmm, rax, rbx, rcx, and rdx registers in x86
heuristics_x86ATT = [r'%[xyz]mm[0-9]', r'%[er][abcd]x[0-9]']
# 2) check for v and z vector registers and x/w general-purpose registers
heuristics_aarch64 = [r'[vz][0-9][0-9]?\.[0-9][0-9]?[bhsd]', r'[wx][0-9]']
matches = {'x86': 0, 'aarch64': 0}
for h in heuristics_x86ATT:
matches['x86'] += len(re.findall(h, file_content))
for h in heuristics_aarch64:
matches['aarch64'] += len(re.findall(h, file_content))
return max(matches.items(), key=operator.itemgetter(1))[0]
def parse_file(self, file_content, start_line=0):
"""

osaca/parser/parser_AArch64.py

@@ -6,7 +6,15 @@ import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
class ParserAArch64v81(BaseParser):
class ParserAArch64(BaseParser):
_instance = None
# Singelton pattern, as this is created very many times
def __new__(cls):
if cls._instance is None:
cls._instance = super(ParserAArch64, cls).__new__(cls)
return cls._instance
def __init__(self):
super().__init__()
self.isa = 'aarch64'
@@ -19,22 +27,23 @@ class ParserAArch64v81(BaseParser):
pp.ZeroOrMore(pp.Word(pp.printables))
).setResultsName(self.COMMENT_ID)
# Define ARM assembly identifier
decimal_number = pp.Combine(
pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
).setResultsName('value')
hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
relocation = pp.Combine(pp.Literal(':') + pp.Word(pp.alphanums + '_') + pp.Literal(':'))
first = pp.Word(pp.alphas + '_.', exact=1)
rest = pp.Word(pp.alphanums + '_.')
identifier = pp.Group(
pp.Optional(relocation).setResultsName('relocation')
+ pp.Combine(first + pp.Optional(rest)).setResultsName('name')
).setResultsName('identifier')
+ pp.Optional(pp.Suppress(pp.Literal('+')) + (hex_number | decimal_number).setResultsName('offset'))
).setResultsName(self.IDENTIFIER_ID)
# Label
self.label = pp.Group(
identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
).setResultsName(self.LABEL_ID)
# Directive
decimal_number = pp.Combine(
pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
).setResultsName('value')
hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
directive_option = pp.Combine(
pp.Word(pp.alphas + '#@.%', exact=1)
+ pp.Optional(pp.Word(pp.printables + ' ', excludeChars=','))
@@ -46,7 +55,7 @@ class ParserAArch64v81(BaseParser):
self.directive = pp.Group(
pp.Literal('.')
+ pp.Word(pp.alphanums + '_').setResultsName('name')
+ commaSeparatedList.setResultsName('parameters')
+ (pp.OneOrMore(directive_parameter) ^ commaSeparatedList).setResultsName('parameters')
+ pp.Optional(self.comment)
).setResultsName(self.DIRECTIVE_ID)
# LLVM-MCA markers
@@ -91,31 +100,49 @@ class ParserAArch64v81(BaseParser):
^ pp.CaselessLiteral('ror')
^ pp.CaselessLiteral('sxtw')
^ pp.CaselessLiteral('uxtw')
^ pp.CaselessLiteral('mul vl')
)
arith_immediate = pp.Group(
immediate.setResultsName('base_immediate')
+ pp.Suppress(pp.Literal(','))
+ shift_op.setResultsName('shift_op')
+ immediate.setResultsName('shift')
+ pp.Optional(immediate).setResultsName('shift')
).setResultsName(self.IMMEDIATE_ID)
# Register:
# scalar: [XWBHSDQ][0-9]{1,2} | vector: V[0-9]{1,2}\.[12468]{1,2}[BHSD]()?
# define SP and ZR register aliases as regex, due to pyparsing does not support
# scalar: [XWBHSDQ][0-9]{1,2} | vector: [VZ][0-9]{1,2}(\.[12468]{1,2}[BHSD])?
# | predicate: P[0-9]{1,2}(/[ZM])?
# ignore vector len control ZCR_EL[123] for now
# define SP, ZR register aliases as regex, due to pyparsing does not support
# proper lookahead
alias_r31_sp = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))')
alias_r31_zr = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))')
scalar = pp.Word(pp.alphas, exact=1).setResultsName('prefix') + pp.Word(
scalar = pp.Word('xwbhsdqXWBHSDQ', exact=1).setResultsName('prefix') + pp.Word(
pp.nums
).setResultsName('name')
index = pp.Literal('[') + pp.Word(pp.nums).setResultsName('index') + pp.Literal(']')
vector = (
pp.CaselessLiteral('v').setResultsName('prefix')
pp.oneOf('v z', caseless=True).setResultsName('prefix')
+ pp.Word(pp.nums).setResultsName('name')
+ pp.Literal('.')
+ pp.Optional(pp.Word('12468')).setResultsName('lanes')
+ pp.Word(pp.alphas, exact=1).setResultsName('shape')
+ pp.Optional(index)
)
predicate = (
pp.CaselessLiteral('p').setResultsName('prefix')
+ pp.Word(pp.nums).setResultsName('name')
+ pp.Optional(
(
pp.Suppress(pp.Literal('/'))
+ pp.oneOf('z m', caseless=True).setResultsName('predication')
)
| (
pp.Literal('.')
+ pp.Optional(pp.Word('12468')).setResultsName('lanes')
+ pp.Word(pp.alphas, exact=1).setResultsName('shape')
)
)
)
self.list_element = vector ^ scalar
register_list = (
pp.Literal('{')
@@ -129,7 +156,8 @@ class ParserAArch64v81(BaseParser):
+ pp.Optional(index)
)
register = pp.Group(
(alias_r31_sp | alias_r31_zr | vector | scalar | register_list)
(alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
#(alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
+ pp.Optional(
pp.Suppress(pp.Literal(','))
+ shift_op.setResultsName('shift_op')
@@ -144,7 +172,7 @@ class ParserAArch64v81(BaseParser):
pp.Literal('[')
+ pp.Optional(register.setResultsName('base'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(register_index ^ immediate.setResultsName('offset'))
+ pp.Optional(register_index ^ (immediate ^ arith_immediate).setResultsName('offset'))
+ pp.Literal(']')
+ pp.Optional(
pp.Literal('!').setResultsName('pre_indexed')
@@ -177,6 +205,11 @@ class ParserAArch64v81(BaseParser):
+ pp.Optional(self.comment)
)
# for testing
self.predicate = predicate
self.vector = vector
self.register = register
def parse_line(self, line, line_number=None):
"""
Parse line and return instruction form.
@@ -193,7 +226,7 @@ class ParserAArch64v81(BaseParser):
self.DIRECTIVE_ID: None,
self.COMMENT_ID: None,
self.LABEL_ID: None,
'line': line.strip(),
'line': line,
'line_number': line_number,
}
)
@@ -317,14 +350,18 @@ class ParserAArch64v81(BaseParser):
return self.process_immediate(operand[self.IMMEDIATE_ID])
if self.LABEL_ID in operand:
return self.process_label(operand[self.LABEL_ID])
if self.IDENTIFIER_ID in operand:
return self.process_identifier(operand[self.IDENTIFIER_ID])
return operand
def process_memory_address(self, memory_address):
"""Post-process memory address operand"""
# Remove unnecessarily created dictionary entries during parsing
offset = None if 'offset' not in memory_address else memory_address['offset']
base = None if 'base' not in memory_address else memory_address['base']
index = None if 'index' not in memory_address else memory_address['index']
offset = memory_address.get('offset', None)
if isinstance(offset, list) and len(offset) == 1:
offset = offset[0]
base = memory_address.get('base', None)
index = memory_address.get('index', None)
scale = 1
if base is not None and 'name' in base and base['name'] == 'sp':
base['prefix'] = 'x'
@@ -351,18 +388,20 @@ class ParserAArch64v81(BaseParser):
def process_register_list(self, register_list):
"""Post-process register lists (e.g., {r0,r3,r5}) and register ranges (e.g., {r0-r7})"""
# Remove unnecessarily created dictionary entries during parsing
vlist = []
rlist = []
dict_name = ''
if 'list' in register_list:
dict_name = 'list'
if 'range' in register_list:
dict_name = 'range'
for v in register_list[dict_name]:
vlist.append(
AttrDict.convert_dict(self.list_element.parseString(v, parseAll=True).asDict())
for r in register_list[dict_name]:
rlist.append(
AttrDict.convert_dict(self.list_element.parseString(r, parseAll=True).asDict())
)
index = None if 'index' not in register_list else register_list['index']
new_dict = AttrDict({dict_name: vlist, 'index': index})
index = register_list.get('index', None)
new_dict = AttrDict({dict_name: rlist, 'index': index})
if len(new_dict[dict_name]) == 1:
return AttrDict({self.REGISTER_ID: new_dict[dict_name][0]})
return AttrDict({self.REGISTER_ID: new_dict})
def process_immediate(self, immediate):
@@ -375,7 +414,9 @@ class ParserAArch64v81(BaseParser):
# normal integer value, nothing to do
return AttrDict({self.IMMEDIATE_ID: immediate})
if 'base_immediate' in immediate:
# arithmetic immediate, nothing to do
# arithmetic immediate, add calculated value as value
immediate['shift'] = immediate['shift'][0]
immediate['value'] = int(immediate['base_immediate']['value']) << int(immediate['shift']['value'])
return AttrDict({self.IMMEDIATE_ID: immediate})
if 'float' in immediate:
dict_name = 'float'
@@ -396,6 +437,13 @@ class ParserAArch64v81(BaseParser):
label['name'] = label['name']['name']
return AttrDict({self.LABEL_ID: label})
def process_identifier(self, identifier):
"""Post-process identifier operand"""
# remove value if it consists of symbol+offset
if 'value' in identifier:
del identifier['value']
return AttrDict({self.IDENTIFIER_ID: identifier})
def get_full_reg_name(self, register):
"""Return one register name string including all attributes"""
if 'lanes' in register:
@@ -440,7 +488,7 @@ class ParserAArch64v81(BaseParser):
def is_vector_register(self, register):
"""Check if register is a vector register"""
if register['prefix'] in 'bhsdqv':
if register['prefix'] in 'bhsdqvz':
return True
return False
@@ -455,7 +503,7 @@ class ParserAArch64v81(BaseParser):
def is_reg_dependend_of(self, reg_a, reg_b):
"""Check if ``reg_a`` is dependent on ``reg_b``"""
prefixes_gpr = 'wx'
prefixes_vec = 'bhsdqv'
prefixes_vec = 'bhsdqvz'
if reg_a['name'] == reg_b['name']:
if reg_a['prefix'].lower() in prefixes_gpr and reg_b['prefix'].lower() in prefixes_gpr:
return True
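The __new__ override added to this parser (and to ParserX86ATT below), together with the _parser_constructed flag in BaseParser, means the costly pyparsing grammar is built only once per parser class; a quick check of the intended behaviour:

# Both parser classes are singletons now; repeated construction returns the same object.
from osaca.parser import ParserAArch64, ParserX86ATT, get_parser

assert ParserAArch64() is ParserAArch64()
assert ParserX86ATT() is ParserX86ATT()
assert get_parser('aarch64') is ParserAArch64()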

osaca/parser/parser_x86att.py

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import string
import re
import pyparsing as pp
@@ -8,6 +9,14 @@ from osaca.parser import AttrDict, BaseParser
class ParserX86ATT(BaseParser):
_instance = None
# Singelton pattern, as this is created very many times
def __new__(cls):
if cls._instance is None:
cls._instance = super(ParserX86ATT, cls).__new__(cls)
return cls._instance
def __init__(self):
super().__init__()
self.isa = 'x86'
@@ -33,12 +42,18 @@ class ParserX86ATT(BaseParser):
+ pp.Optional(relocation).setResultsName('relocation')
).setResultsName('identifier')
# Label
rest = pp.Word(pp.alphanums + '$_.+-()')
label_identifier = pp.Group(
pp.Optional(id_offset).setResultsName('offset')
+ pp.Combine(first + pp.Optional(rest)).setResultsName('name')
+ pp.Optional(relocation).setResultsName('relocation')
).setResultsName('identifier')
numeric_identifier = pp.Group(
pp.Word(pp.nums).setResultsName('name')
+ pp.Optional(pp.oneOf('b f', caseless=True).setResultsName('suffix'))
).setResultsName('identifier')
self.label = pp.Group(
(identifier | numeric_identifier).setResultsName('name')
(label_identifier | numeric_identifier).setResultsName('name')
+ pp.Literal(':')
+ pp.Optional(self.comment)
).setResultsName(self.LABEL_ID)
@@ -178,7 +193,7 @@ class ParserX86ATT(BaseParser):
self.DIRECTIVE_ID: None,
self.COMMENT_ID: None,
self.LABEL_ID: None,
'line': line.strip(),
'line': line,
'line_number': line_number,
}
)
@@ -297,9 +312,9 @@ class ParserX86ATT(BaseParser):
def process_memory_address(self, memory_address):
"""Post-process memory address operand"""
# Remove unecessarily created dictionary entries during memory address parsing
offset = None if 'offset' not in memory_address else memory_address['offset']
base = None if 'base' not in memory_address else memory_address['base']
index = None if 'index' not in memory_address else memory_address['index']
offset = memory_address.get('offset', None)
base = memory_address.get('base', None)
index = memory_address.get('index', None)
scale = 1 if 'scale' not in memory_address else int(memory_address['scale'])
if isinstance(offset, str) and base is None and index is None:
offset = {'value': offset}
@@ -348,45 +363,44 @@ class ParserX86ATT(BaseParser):
def is_reg_dependend_of(self, reg_a, reg_b):
"""Check if ``reg_a`` is dependent on ``reg_b``"""
# Normalize name
reg_a_name = reg_a['name'].upper()
reg_b_name = reg_b['name'].upper()
# Check if they are the same registers
if reg_a.name == reg_b.name:
if reg_a_name == reg_b_name:
return True
# Check vector registers first
if self.is_vector_register(reg_a):
if self.is_vector_register(reg_b):
if reg_a.name[1:] == reg_b.name[1:]:
if reg_a_name[1:] == reg_b_name[1:]:
# Registers in the same vector space
return True
return False
# Check basic GPRs
a_dep = ['RAX', 'EAX', 'AX', 'AH', 'AL']
b_dep = ['RBX', 'EBX', 'BX', 'BH', 'BL']
c_dep = ['RCX', 'ECX', 'CX', 'CH', 'CL']
d_dep = ['RDX', 'EDX', 'DX', 'DH', 'DL']
sp_dep = ['RSP', 'ESP', 'SP', 'SPL']
src_dep = ['RSI', 'ESI', 'SI', 'SIL']
dst_dep = ['RDI', 'EDI', 'DI', 'DIL']
basic_gprs = [a_dep, b_dep, c_dep, d_dep, sp_dep, src_dep, dst_dep]
gpr_groups = {
'A': ['RAX', 'EAX', 'AX', 'AH', 'AL'],
'B': ['RBX', 'EBX', 'BX', 'BH', 'BL'],
'C': ['RCX', 'ECX', 'CX', 'CH', 'CL'],
'D': ['RDX', 'EDX', 'DX', 'DH', 'DL'],
'SP': ['RSP', 'ESP', 'SP', 'SPL'],
'SRC': ['RSI', 'ESI', 'SI', 'SIL'],
'DST': ['RDI', 'EDI', 'DI', 'DIL']
}
if self.is_basic_gpr(reg_a):
if self.is_basic_gpr(reg_b):
for dep_group in basic_gprs:
if reg_a['name'].upper() in dep_group:
if reg_b['name'].upper() in dep_group:
for dep_group in gpr_groups.values():
if reg_a_name in dep_group:
if reg_b_name in dep_group:
return True
return False
# Check other GPRs
gpr_parser = (
pp.CaselessLiteral('R')
+ pp.Word(pp.nums).setResultsName('id')
+ pp.Optional(pp.Word('dwbDWB', exact=1))
)
try:
id_a = gpr_parser.parseString(reg_a['name'], parseAll=True).asDict()['id']
id_b = gpr_parser.parseString(reg_b['name'], parseAll=True).asDict()['id']
if id_a == id_b:
return True
except pp.ParseException:
return False
ma = re.match(r'R([0-9]+)[DWB]?', reg_a_name)
mb = re.match(r'R([0-9]+)[DWB]?', reg_b_name)
if ma and mb and ma.group(1) == mb.group(1):
return True
# No dependencies
return False
@@ -400,19 +414,11 @@ class ParserX86ATT(BaseParser):
"""Check if register is a general purpose register"""
if register is None:
return False
gpr_parser = (
pp.CaselessLiteral('R')
+ pp.Word(pp.nums).setResultsName('id')
+ pp.Optional(pp.Word('dwbDWB', exact=1))
)
if self.is_basic_gpr(register):
return True
else:
try:
gpr_parser.parseString(register['name'], parseAll=True)
return True
except pp.ParseException:
return False
return re.match(r'R([0-9]+)[DWB]?', register['name'], re.IGNORECASE)
def is_vector_register(self, register):
"""Check if register is a vector register"""

osaca/semantics/arch_semantics.py

@@ -53,9 +53,18 @@ class ArchSemantics(ISASemantics):
)
if len(set(port_sums)) > 1:
# balance ports
for _ in range(cycles * 100):
instr_ports[port_sums.index(max(port_sums))] -= INC
instr_ports[port_sums.index(min(port_sums))] += INC
# init list for keeping track of the current change
differences = [cycles / len(ports) for p in ports]
for _ in range(int(cycles * (1 / INC))):
if len(instr_ports) == 1:
# no balancing possible anymore
break
max_port_idx = port_sums.index(max(port_sums))
min_port_idx = port_sums.index(min(port_sums))
instr_ports[max_port_idx] -= INC
instr_ports[min_port_idx] += INC
differences[max_port_idx] -= INC
differences[min_port_idx] += INC
# instr_ports = [round(p, 2) for p in instr_ports]
self._itemsetter(*indices)(instruction_form['port_pressure'], *instr_ports)
# check if min port is zero
@@ -63,7 +72,12 @@ class ArchSemantics(ISASemantics):
# if port_pressure is not exactly 0.00, add the residual to
# the former port
if min(instr_ports) != 0.0:
instr_ports[port_sums.index(min(port_sums))] += min(instr_ports)
min_port_idx = port_sums.index(min(port_sums))
instr_ports[min_port_idx] += min(instr_ports)
differences[min_port_idx] += min(instr_ports)
# we don't need to decrease difference for other port, just
# delete it
del differences[instr_ports.index(min(instr_ports))]
self._itemsetter(*indices)(
instruction_form['port_pressure'], *instr_ports
)
@@ -80,6 +94,17 @@ class ArchSemantics(ISASemantics):
instr_ports = self._to_list(
itemgetter(*indices)(instruction_form['port_pressure'])
)
# never remove more than the fixed utilization per uop and port, i.e.,
# cycles/len(ports)
if round(min(differences), 2) <= 0:
# don't worry if port_pressure isn't exactly 0 and just
# remove from further balancing by deleting index since
# pressure is not 0
del indices[differences.index(min(differences))]
instr_ports = self._to_list(
itemgetter(*indices)(instruction_form['port_pressure'])
)
del differences[differences.index(min(differences))]
port_sums = self._to_list(
itemgetter(*indices)(self.get_throughput_sum(kernel))
)
@@ -373,9 +398,7 @@ class ArchSemantics(ISASemantics):
def g(obj, value):
obj[item] = value
else:
def g(obj, *values):
for item, value in zip(items, values):
obj[item] = value
@@ -391,9 +414,11 @@ class ArchSemantics(ISASemantics):
@staticmethod
def get_throughput_sum(kernel):
"""Get the overall throughput sum separated by port of all instructions of a kernel."""
tp_sum = reduce(
(lambda x, y: [sum(z) for z in zip(x, y)]),
[instr['port_pressure'] for instr in kernel],
)
tp_sum = [round(x, 2) for x in tp_sum]
# ignoring all lines with throughput == 0.0, because there won't be anything to sum up
# typically comment, label and non-instruction lines
port_pressures = [instr['port_pressure'] for instr in kernel if instr['throughput'] != 0.0]
# Essentially summing up each columns of port_pressures, where each column is one port
# and each row is one line of the kernel
# round is necessary to ensure termination of ArchsSemantics.assign_optimal_throughput
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
return tp_sum
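With this rewrite, lines whose throughput is 0.0 (comments, labels, directives) no longer contribute to the sum; a made-up three-line kernel illustrates the column-wise summation:

# Made-up port_pressure rows: one column per port, one row per kernel line.
kernel = [
    {'throughput': 0.5, 'port_pressure': [0.5, 0.5, 0.0, 1.0]},
    {'throughput': 0.0, 'port_pressure': []},                   # label/comment line
    {'throughput': 1.0, 'port_pressure': [0.0, 1.0, 1.0, 0.0]},
]
port_pressures = [i['port_pressure'] for i in kernel if i['throughput'] != 0.0]
tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
assert tp_sum == [0.5, 1.5, 1.0, 1.0]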

osaca/semantics/hw_model.py

@@ -1,12 +1,14 @@
#!/usr/bin/env python3
import base64
import os
import pickle
import re
import string
from copy import deepcopy
from itertools import product
import hashlib
from pathlib import Path
from collections import defaultdict
import ruamel.yaml
from ruamel.yaml.compat import StringIO
@@ -17,6 +19,7 @@ from osaca.parser import ParserX86ATT
class MachineModel(object):
WILDCARD = '*'
INTERNAL_VERSION = 1 # increase whenever self._data format changes to invalidate cache!
def __init__(self, arch=None, path_to_yaml=None, isa=None, lazy=False):
if not arch and not path_to_yaml:
@@ -39,7 +42,7 @@ class MachineModel(object):
'load_throughput_default': [],
'ports': [],
'port_model_scheme': None,
'instruction_forms': [],
'instruction_forms': []
}
else:
if arch and path_to_yaml:
@@ -49,7 +52,7 @@ class MachineModel(object):
yaml = self._create_yaml_object()
if arch:
self._arch = arch.lower()
self._path = utils.find_file(self._arch + '.yml')
self._path = utils.find_datafile(self._arch + '.yml')
# check if file is cached
cached = self._get_cached(self._path) if not lazy else False
if cached:
@@ -59,8 +62,6 @@ class MachineModel(object):
with open(self._path, 'r') as f:
if not lazy:
self._data = yaml.load(f)
# cache file for next call
self._write_in_cache(self._path, self._data)
else:
file_content = ''
line = f.readline()
@@ -69,21 +70,26 @@ class MachineModel(object):
line = f.readline()
self._data = yaml.load(file_content)
self._data['instruction_forms'] = []
# separate multi-alias instruction forms
for entry in [
x for x in self._data['instruction_forms'] if isinstance(x['name'], list)
]:
for name in entry['name']:
new_entry = {'name': name}
for k in [x for x in entry.keys() if x != 'name']:
new_entry[k] = entry[k]
self._data['instruction_forms'].append(new_entry)
# remove old entry
self._data['instruction_forms'].remove(entry)
# For use with dict instead of list as DB
# self._data['instruction_dict'] = (
# self._convert_to_dict(self._data['instruction_forms'])
# )
# separate multi-alias instruction forms
for entry in [x for x in self._data['instruction_forms']
if isinstance(x['name'], list)]:
for name in entry['name']:
new_entry = {'name': name}
for k in [x for x in entry.keys() if x != 'name']:
new_entry[k] = entry[k]
self._data['instruction_forms'].append(new_entry)
# remove old entry
self._data['instruction_forms'].remove(entry)
# Normalize instruction_form names (to UPPERCASE) and build dict for faster access:
self._data['instruction_forms_dict'] = defaultdict(list)
for iform in self._data['instruction_forms']:
iform['name'] = iform['name'].upper()
self._data['instruction_forms_dict'][iform['name']].append(iform)
self._data['internal_version'] = self.INTERNAL_VERSION
if not lazy:
# cache internal representation for future use
self._write_in_cache(self._path)
def __getitem__(self, key):
"""Return configuration entry."""
@@ -98,36 +104,21 @@ class MachineModel(object):
def get_instruction(self, name, operands):
"""Find and return instruction data from name and operands."""
# For use with dict instead of list as DB
# return self.get_instruction_from_dict(name, operands)
if name is None:
return None
name_matched_iforms = self._data['instruction_forms_dict'].get(name.upper(), [])
try:
return next(
instruction_form
for instruction_form in self._data['instruction_forms']
if instruction_form['name'].upper() == name.upper()
and self._match_operands(
for instruction_form in name_matched_iforms if self._match_operands(
instruction_form['operands'] if 'operands' in instruction_form else [],
operands,
)
)
operands))
except StopIteration:
return None
except TypeError as e:
print('\nname: {}\noperands: {}'.format(name, operands))
raise TypeError from e
def get_instruction_from_dict(self, name, operands):
"""Find and return instruction data from name and operands stored in dictionary."""
if name is None:
return None
try:
# Check if key is in dict
instruction_form = self._data['instruction_dict'][self._get_key(name, operands)]
return instruction_form
except KeyError:
return None
def average_port_pressure(self, port_pressure):
"""Construct average port pressure list from instruction data."""
port_list = self._data['ports']
@@ -234,12 +225,13 @@ class MachineModel(object):
for y in list(filter(lambda x: True if x != 'class' else False, op))
]
operands.append('{}({})'.format(op['class'], ','.join(op_attrs)))
return '{} {}'.format(instruction_form['name'], ','.join(operands))
return '{} {}'.format(instruction_form['name'].lower(), ','.join(operands))
@staticmethod
def get_isa_for_arch(arch):
"""Return ISA for given micro-arch ``arch``."""
arch_dict = {
'a64fx': 'aarch64',
'tx2': 'aarch64',
'n1': 'aarch64',
'zen1': 'x86',
@@ -293,7 +285,8 @@ class MachineModel(object):
{
k: v
for k, v in self._data.items()
if k not in ['instruction_forms', 'load_throughput']
if k not in ['instruction_forms', 'instruction_forms_dict', 'load_throughput',
'internal_version']
},
stream,
)
@@ -313,37 +306,54 @@ class MachineModel(object):
:type filepath: str
:returns: cached DB if existing, `False` otherwise
"""
hashname = self._get_hashname(filepath)
cachepath = utils.exists_cached_file(hashname + '.pickle')
if cachepath:
# Check if modification date of DB is older than cached version
if os.path.getmtime(filepath) < os.path.getmtime(cachepath):
# load cached version
with open(cachepath, 'rb') as f:
cached_db = pickle.load(f)
return cached_db
else:
# DB newer than cached version --> delete cached file and return False
os.remove(cachepath)
p = Path(filepath)
hexhash = hashlib.sha256(p.read_bytes()).hexdigest()
# 1. companion cachefile: same location, with '.<name>_<sha256hash>.pickle'
companion_cachefile = p.with_name('.' + p.stem + '_' + hexhash).with_suffix('.pickle')
if companion_cachefile.exists():
# companion file (must be up-to-date, due to equal hash)
with companion_cachefile.open('rb') as f:
data = pickle.load(f)
if data.get('internal_version') == self.INTERNAL_VERSION:
return data
# 2. home cachefile: ~/.osaca/cache/<name>_<sha256hash>.pickle
home_cachefile = (Path(utils.CACHE_DIR) / (p.stem + '_' + hexhash)).with_suffix('.pickle')
if home_cachefile.exists():
# home file (must be up-to-date, due to equal hash)
with home_cachefile.open('rb') as f:
data = pickle.load(f)
if data.get('internal_version') == self.INTERNAL_VERSION:
return data
return False
def _write_in_cache(self, filepath, data):
def _write_in_cache(self, filepath):
"""
Write machine model to cache
:param filepath: path to store DB
:type filepath: str
:param data: :class:`MachineModel` to store
:type data: :class:`dict`
"""
hashname = self._get_hashname(filepath)
filepath = os.path.join(utils.CACHE_DIR, hashname + '.pickle')
with open(filepath, 'wb') as f:
pickle.dump(data, f)
p = Path(filepath)
hexhash = hashlib.sha256(p.read_bytes()).hexdigest()
# 1. companion cachefile: same location, with '.<name>_<sha256hash>.pickle'
companion_cachefile = p.with_name('.' + p.stem + '_' + hexhash).with_suffix('.pickle')
if os.access(str(companion_cachefile.parent), os.W_OK):
with companion_cachefile.open('wb') as f:
pickle.dump(self._data, f)
return
def _get_hashname(self, name):
"""Returns unique hashname for machine model"""
return base64.b64encode(name.encode()).decode()
# 2. home cachefile: ~/.osaca/cache/<name>_<sha256hash>.pickle
cache_dir = Path(utils.CACHE_DIR)
try:
os.makedirs(cache_dir, exist_ok=True)
except OSError:
return
home_cachefile = (cache_dir / (p.stem + '_' + hexhash)).with_suffix('.pickle')
if os.access(str(home_cachefile.parent), os.W_OK):
with home_cachefile.open('wb') as f:
pickle.dump(self._data, f)
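Note: both cache locations are derived from a SHA-256 digest of the YAML file's contents, so a pickle can never be served for a modified model file; on read, the stored internal_version is additionally checked before a hit is trusted. A minimal sketch of the naming scheme used above (a standalone helper for illustration, not part of the module):
import hashlib
from pathlib import Path

def cache_candidates(yaml_path, cache_dir='~/.osaca/cache'):
    """Return the companion and home cache paths for a machine-model file."""
    p = Path(yaml_path)
    digest = hashlib.sha256(p.read_bytes()).hexdigest()
    # hidden companion file next to the YAML: .<name>_<sha256hash>.pickle
    companion = p.with_name('.' + p.stem + '_' + digest).with_suffix('.pickle')
    # fallback in the user's home: ~/.osaca/cache/<name>_<sha256hash>.pickle
    home = (Path(cache_dir).expanduser() / (p.stem + '_' + digest)).with_suffix('.pickle')
    return companion, home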
def _get_key(self, name, operands):
"""Get unique instruction form key for dict DB."""
@@ -353,18 +363,6 @@ class MachineModel(object):
key_string += '_'.join([self._get_operand_hash(op) for op in operands])
return key_string
def _convert_to_dict(self, instruction_forms):
"""Convert list DB to dict DB"""
instruction_dict = {}
for instruction_form in instruction_forms:
instruction_dict[
self._get_key(
instruction_form['name'],
instruction_form['operands'] if 'operands' in instruction_form else None,
)
] = instruction_form
return instruction_dict
def _get_operand_hash(self, operand):
"""Get unique key for operand for dict DB"""
operand_string = ''
@@ -493,6 +491,7 @@ class MachineModel(object):
if 'class' in operand:
# compare two DB entries
return self._compare_db_entries(i_operand, operand)
# TODO support class wildcards
# register
if 'register' in operand:
if i_operand['class'] != 'register':
@@ -504,12 +503,14 @@ class MachineModel(object):
return False
return self._is_AArch64_mem_type(i_operand, operand['memory'])
# immediate
# TODO support wildcards
if 'value' in operand or ('immediate' in operand and 'value' in operand['immediate']):
return i_operand['class'] == 'immediate' and i_operand['imd'] == 'int'
if 'float' in operand or ('immediate' in operand and 'float' in operand['immediate']):
return i_operand['class'] == 'immediate' and i_operand['imd'] == 'float'
if 'double' in operand or ('immediate' in operand and 'double' in operand['immediate']):
return i_operand['class'] == 'immediate' and i_operand['imd'] == 'double'
# identifier
if 'identifier' in operand or (
'immediate' in operand and 'identifier' in operand['immediate']
):
@@ -580,7 +581,11 @@ class MachineModel(object):
def _is_x86_reg_type(self, i_reg, reg, consider_masking=False):
"""Check if register type match."""
i_reg_name = i_reg if not consider_masking else i_reg['name']
i_reg_name = i_reg['name'] if i_reg and 'name' in i_reg else i_reg
if reg is None:
if i_reg is None:
return True
return False
# check for wildcards
if i_reg_name == self.WILDCARD or reg['name'] == self.WILDCARD:
return True

View File

@@ -2,7 +2,7 @@
from itertools import chain
from osaca import utils
from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
from .hw_model import MachineModel
@@ -26,12 +26,12 @@ class ISASemantics(object):
def __init__(self, isa, path_to_yaml=None):
self._isa = isa.lower()
path = utils.find_file('isa/' + self._isa + '.yml') if not path_to_yaml else path_to_yaml
path = path_to_yaml or utils.find_datafile('isa/' + self._isa + '.yml')
self._isa_model = MachineModel(path_to_yaml=path)
if self._isa == 'x86':
self._parser = ParserX86ATT()
elif self._isa == 'aarch64':
self._parser = ParserAArch64v81()
self._parser = ParserAArch64()
def process(self, instruction_forms):
"""Process a list of instruction forms."""
@@ -52,7 +52,6 @@ class ISASemantics(object):
return
# check if instruction form is in ISA yaml, otherwise apply standard operand assignment
# (one dest, others source)
# import pdb; pdb.set_trace()
isa_data = self._isa_model.get_instruction(
instruction_form['instruction'], instruction_form['operands']
)
@@ -103,14 +102,14 @@ class ISASemantics(object):
if ('post_indexed' in operand['memory'] and operand['memory']['post_indexed']) or (
'pre_indexed' in operand['memory'] and operand['memory']['pre_indexed']
):
op_dict['source'].remove(operand)
op_dict['src_dst'].append(operand)
op_dict['src_dst'].append(AttrDict.convert_dict(
{'register': operand['memory']['base']}))
for operand in [op for op in op_dict['destination'] if 'memory' in op]:
if ('post_indexed' in operand['memory'] and operand['memory']['post_indexed']) or (
'pre_indexed' in operand['memory'] and operand['memory']['pre_indexed']
):
op_dict['destination'].remove(operand)
op_dict['src_dst'].append(operand)
op_dict['src_dst'].append(AttrDict.convert_dict(
{'register': operand['memory']['base']}))
# store operand list in dict and reassign operand key/value pair
instruction_form['semantic_operands'] = AttrDict.convert_dict(op_dict)
# assign LD/ST flags
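Note: the two loops above handle AArch64 addressing modes with writeback. For a pre- or post-indexed access such as ldp q2, q3, [x11], #64, the base register is read and updated, so it now appears as an explicit source/destination operand in addition to the memory access itself. A rough sketch of the resulting bookkeeping (illustrative structures, not the exact parser output):
# post-indexed load: ldp q2, q3, [x11], #64
op_dict = {'source': [], 'destination': [], 'src_dst': []}

base_register = {'prefix': 'x', 'name': '11'}
# x11 is both read (address) and written (incremented by 64), hence src_dst
op_dict['src_dst'].append({'register': base_register})

print(op_dict['src_dst'])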

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
from collections import OrderedDict
from osaca.parser import ParserAArch64v81, ParserX86ATT, get_parser
from osaca.parser import ParserAArch64, ParserX86ATT, get_parser
COMMENT_MARKER = {'start': 'OSACA-BEGIN', 'end': 'OSACA-END'}
@@ -22,9 +22,9 @@ def reduce_to_section(kernel, isa):
else:
raise ValueError('ISA not supported.')
if start == -1:
raise LookupError('Could not find START MARKER. Make sure it is inserted!')
start = 0
if end == -1:
raise LookupError('Could not find END MARKER. Make sure it is inserted!')
end = len(kernel)
return kernel[start:end]
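Note: with this change reduce_to_section no longer raises a LookupError when a marker is missing; a missing start marker falls back to the first line and a missing end marker to the end of the kernel. A tiny illustration of that fallback (the helper name is made up for this sketch; the marker-finding functions signal "not found" with -1):
def clamp_marker_bounds(start, end, num_lines):
    """Fallback applied when IACA/OSACA markers are missing (-1 = not found)."""
    if start == -1:
        start = 0            # analyze from the beginning of the file
    if end == -1:
        end = num_lines      # analyze until the end of the file
    return start, end

print(clamp_marker_bounds(-1, 42, 100))  # -> (0, 42)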
@@ -38,7 +38,7 @@ def find_marked_kernel_AArch64(lines):
nop_bytes = ['213', '3', '32', '31']
return find_marked_section(
lines,
ParserAArch64v81(),
ParserAArch64(),
['mov'],
'x1',
[111, 222],
@@ -277,6 +277,11 @@ def find_basic_loop_bodies(lines):
current_block.append(line)
# Find end of block by searching for references to valid jump labels
if line['instruction'] and line['operands']:
# Ignore `b.none` instructions (relevant for ARM SVE code)
# This branch instruction is often present _within_ inner loop blocks, but usually
# does not terminate the block
if line['instruction'] == 'b.none':
continue
for operand in [o for o in line['operands'] if 'identifier' in o]:
if operand['identifier']['name'] in valid_jump_labels:
if operand['identifier']['name'] == label:

View File

@@ -1,28 +1,14 @@
#!/usr/bin/env python3
import os.path
DATA_DIRS = [os.path.expanduser('~/.osaca/data'), os.path.join(os.path.dirname(__file__), 'data')]
CACHE_DIR = os.path.expanduser('~/.osaca/cache')
def find_file(name):
def find_datafile(name):
"""Check for existence of name in user or package data folders and return path."""
search_paths = [os.path.expanduser('~/.osaca/data'),
os.path.join(os.path.dirname(__file__), 'data')]
for dir in search_paths:
for dir in DATA_DIRS:
path = os.path.join(dir, name)
if os.path.exists(path):
return path
raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, search_paths))
def exists_cached_file(name):
"""Check for existence of file in cache dir. Returns path if it exists and False otherwise."""
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
return False
search_paths = [CACHE_DIR]
for dir in search_paths:
path = os.path.join(dir, name)
if os.path.exists(path):
return path
return False
raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, DATA_DIRS))
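Note: find_file is now find_datafile and the search order lives in the module-level DATA_DIRS (user directory first, packaged data second); the cache lookup moved into MachineModel itself. Typical use, assuming the packaged isa/x86.yml is present:
from osaca import utils

# Resolves to ~/.osaca/data/isa/x86.yml if the user ships a local copy,
# otherwise to the file bundled with the osaca package.
path = utils.find_datafile('isa/x86.yml')
print(path)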

View File

@@ -2,11 +2,14 @@
# Always prefer setuptools over distutils
from setuptools import setup, find_packages
from setuptools.command.install import install as _install
from setuptools.command.sdist import sdist as _sdist
# To use a consistent encoding
from codecs import open
import os
import io
import re
import sys
here = os.path.abspath(os.path.dirname(__file__))
@@ -27,6 +30,27 @@ def find_version(*file_paths):
raise RuntimeError("Unable to find version string.")
def _run_build_cache(dir):
from subprocess import check_call
# This is run inside the install staging directory (which has no .pyc files)
# We don't want to generate any.
# https://github.com/eliben/pycparser/pull/135
check_call([sys.executable, '-B', '_build_cache.py'],
cwd=os.path.join(dir, 'osaca', 'data'))
class install(_install):
def run(self):
_install.run(self)
self.execute(_run_build_cache, (self.install_lib,), msg="Build ISA and architecture cache")
class sdist(_sdist):
def make_release_tree(self, basedir, files):
_sdist.make_release_tree(self, basedir, files)
self.execute(_run_build_cache, (basedir,), msg="Build ISA and architecture cache")
# Get the long description from the README file
with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
long_description = f.read()
@@ -59,7 +83,7 @@ setup(
# 3 - Alpha
# 4 - Beta
# 5 - Production/Stable
'Development Status :: 3 - Alpha',
'Development Status :: 4 - Beta',
# Indicate who your project is intended for
'Intended Audience :: Developers',
@@ -76,6 +100,9 @@ setup(
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
],
# What does your project relate to?
@@ -124,4 +151,7 @@ setup(
'osaca=osaca.osaca:main',
],
},
# Overwriting install and sdist to enforce cache distribution with package
cmdclass={'install': install, 'sdist': sdist},
)

View File

@@ -8,7 +8,7 @@ suite = unittest.TestLoader().loadTestsFromNames(
[
'test_base_parser',
'test_parser_x86att',
'test_parser_AArch64v81',
'test_parser_AArch64',
'test_marker_utils',
'test_semantics',
'test_frontend',

View File

@@ -18,6 +18,12 @@ class TestBaseParser(unittest.TestCase):
pass
with open(self._find_file('triad_x86_iaca.s')) as f:
self.triad_code = f.read()
with open(self._find_file('triad_arm_iaca.s')) as f:
self.triad_code_arm = f.read()
with open(self._find_file('kernel_x86.s')) as f:
self.x86_code = f.read()
with open(self._find_file('kernel_aarch64.s')) as f:
self.aarch64_code = f.read()
##################
# Test
@@ -59,6 +65,12 @@ class TestBaseParser(unittest.TestCase):
with self.assertRaises(NotImplementedError):
self.parser.normalize_imd(imd_hex_1)
def test_detect_ISA(self):
self.assertEqual(BaseParser.detect_ISA(self.triad_code), 'x86')
self.assertEqual(BaseParser.detect_ISA(self.triad_code_arm), 'aarch64')
self.assertEqual(BaseParser.detect_ISA(self.x86_code), 'x86')
self.assertEqual(BaseParser.detect_ISA(self.aarch64_code), 'aarch64')
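Note: detect_ISA is what lets OSACA run without an explicit --arch flag; it guesses the ISA from the assembly text itself. A minimal usage sketch (the import path is assumed here; the test above relies on its own imports):
from osaca.parser import BaseParser  # import path assumed for this sketch

code = 'movl $111, %ebx\nvmovsd %xmm0, (%r15,%rcx,8)\n'
print(BaseParser.detect_ISA(code))   # expected: 'x86'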
##################
# Helper functions
##################

View File

@@ -11,7 +11,7 @@ from shutil import copyfile
from unittest.mock import patch
import osaca.osaca as osaca
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.parser import ParserAArch64, ParserX86ATT
from osaca.semantics import MachineModel
@@ -71,7 +71,7 @@ class TestCLI(unittest.TestCase):
def test_get_parser(self):
self.assertTrue(isinstance(osaca.get_asm_parser('csx'), ParserX86ATT))
self.assertTrue(isinstance(osaca.get_asm_parser('tx2'), ParserAArch64v81))
self.assertTrue(isinstance(osaca.get_asm_parser('tx2'), ParserAArch64))
with self.assertRaises(ValueError):
osaca.get_asm_parser('UNKNOWN')
@@ -153,6 +153,64 @@ class TestCLI(unittest.TestCase):
output = StringIO()
osaca.run(args, output_file=output)
def test_without_arch(self):
# Run test kernels without --arch flag
parser = osaca.create_parser()
# x86
kernel_x86 = 'kernel_x86.s'
args = parser.parse_args([self._find_test_file(kernel_x86)])
output = StringIO()
osaca.run(args, output_file=output)
# AArch64
kernel_aarch64 = 'kernel_aarch64.s'
args = parser.parse_args([self._find_test_file(kernel_aarch64)])
osaca.run(args, output_file=output)
def test_user_warnings(self):
parser = osaca.create_parser()
kernel = 'triad_x86_unmarked.s'
args = parser.parse_args(
['--arch', 'csx', '--ignore-unknown', self._find_test_file(kernel)]
)
output = StringIO()
osaca.run(args, output_file=output)
# WARNING for length
self.assertTrue(output.getvalue().count('WARNING') == 1)
args = parser.parse_args(
['--lines', '100-199', '--ignore-unknown', self._find_test_file(kernel)]
)
output = StringIO()
osaca.run(args, output_file=output)
# WARNING for arch
self.assertTrue(output.getvalue().count('WARNING') == 1)
def test_lines_arg(self):
# Run tests with --lines option
parser = osaca.create_parser()
kernel_x86 = 'triad_x86_iaca.s'
args_base = parser.parse_args(
['--arch', 'csx', self._find_test_file(kernel_x86)]
)
output_base = StringIO()
osaca.run(args_base, output_file=output_base)
output_base = output_base.getvalue().split('\n')[8:]
args = []
args.append(parser.parse_args(
['--lines', '146-154', '--arch', 'csx', self._find_test_file(kernel_x86)]
))
args.append(parser.parse_args(
['--lines', '146:154', '--arch', 'csx', self._find_test_file(kernel_x86)]
))
args.append(parser.parse_args(
['--lines', '146,147:148,149-154', '--arch', 'csx', self._find_test_file(kernel_x86)]
))
for a in args:
with self.subTest(params=a):
output = StringIO()
osaca.run(a, output_file=output)
self.assertEqual(output.getvalue().split('\n')[8:], output_base)
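Note: the three argument variants exercised above ('146-154', '146:154', '146,147:148,149-154') are expected to select the same source lines. A hypothetical helper, not OSACA's actual implementation, that expands such a --lines specification:
def expand_line_spec(spec):
    """Expand e.g. '146,147:148,149-154' into a sorted list of line numbers."""
    lines = set()
    for part in spec.split(','):
        sep = '-' if '-' in part else (':' if ':' in part else None)
        if sep is None:
            lines.add(int(part))
        else:
            lo, hi = part.split(sep)
            lines.update(range(int(lo), int(hi) + 1))
    return sorted(lines)

print(expand_line_spec('146,147:148,149-154'))  # -> [146, 147, ..., 154]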
##################
# Helper functions
##################

View File

@@ -0,0 +1,345 @@
.file "triad.c"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC9:
.string "%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
.text
.p2align 4,,15
.globl triad
.type triad, @function
triad:
.LFB24:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
movslq %edi, %rax
movl $64, %edi
leaq 16(%rsp), %r13
.cfi_def_cfa 13, 0
andq $-32, %rsp
pushq -8(%r13)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r15
.cfi_escape 0x10,0xf,0x2,0x76,0x78
leaq 0(,%rax,8), %r15
pushq %r14
movq %r15, %rsi
pushq %r13
.cfi_escape 0xf,0x3,0x76,0x68,0x6
.cfi_escape 0x10,0xe,0x2,0x76,0x70
pushq %r12
pushq %rbx
.cfi_escape 0x10,0xc,0x2,0x76,0x60
.cfi_escape 0x10,0x3,0x2,0x76,0x58
movq %rax, %rbx
subq $72, %rsp
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r14
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r12
call aligned_alloc
movq %r15, %rsi
movl $64, %edi
movq %rax, %r13
call aligned_alloc
movq %rax, %r15
leal -1(%rbx), %eax
movl %eax, -96(%rbp)
testl %ebx, %ebx
jle .L2
cmpl $2, %eax
jbe .L14
movl %ebx, %esi
vmovapd .LC0(%rip), %ymm0
xorl %eax, %eax
xorl %ecx, %ecx
shrl $2, %esi
.p2align 4,,10
.p2align 3
.L4:
addl $1, %ecx
vmovapd %ymm0, (%r15,%rax)
vmovapd %ymm0, 0(%r13,%rax)
vmovapd %ymm0, (%r12,%rax)
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %esi
ja .L4
movl %ebx, %eax
andl $-4, %eax
cmpl %eax, %ebx
je .L26
vzeroupper
.L3:
vmovsd .LC1(%rip), %xmm0
movslq %eax, %rcx
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ecx, %ebx
jle .L2
movslq %ecx, %rcx
addl $2, %eax
vmovsd %xmm0, (%r15,%rcx,8)
vmovsd %xmm0, 0(%r13,%rcx,8)
vmovsd %xmm0, (%r12,%rcx,8)
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L2
cltq
vmovsd %xmm0, (%r15,%rax,8)
vmovsd %xmm0, 0(%r13,%rax,8)
vmovsd %xmm0, (%r12,%rax,8)
vmovsd %xmm0, (%r14,%rax,8)
.L2:
movl %ebx, %eax
movl $1, -84(%rbp)
movl %ebx, %r10d
andl $-4, %eax
shrl $2, %r10d
movl %eax, -100(%rbp)
.p2align 4,,10
.p2align 3
.L13:
leaq -56(%rbp), %rsi
leaq -72(%rbp), %rdi
movl %r10d, -88(%rbp)
call timing
movl -88(%rbp), %r10d
xorl %r11d, %r11d
.p2align 4,,10
.p2align 3
.L12:
vmovsd (%r14), %xmm0
vxorpd %xmm7, %xmm7, %xmm7
vucomisd %xmm7, %xmm0
jbe .L6
movq %r14, %rdi
movl %r11d, -92(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call dummy
movl -92(%rbp), %r11d
movl -88(%rbp), %r10d
.L6:
testl %ebx, %ebx
jle .L8
cmpl $2, -96(%rbp)
jbe .L15
xorl %eax, %eax
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L10:
vmovapd (%r15,%rax), %ymm0
vmovapd (%r12,%rax), %ymm3
addl $1, %ecx
vfmadd132pd 0(%r13,%rax), %ymm3, %ymm0
vmovapd %ymm0, (%r14,%rax)
addq $32, %rax
cmpl %ecx, %r10d
ja .L10
movl -100(%rbp), %eax
cmpl %ebx, %eax
je .L8
.L9:
movslq %eax, %rcx
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm5
vfmadd132sd (%r15,%rcx,8), %xmm5, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
leal 1(%rax), %ecx
cmpl %ebx, %ecx
jge .L8
movslq %ecx, %rcx
addl $2, %eax
vmovsd 0(%r13,%rcx,8), %xmm0
vmovsd (%r12,%rcx,8), %xmm6
vfmadd132sd (%r15,%rcx,8), %xmm6, %xmm0
vmovsd %xmm0, (%r14,%rcx,8)
cmpl %eax, %ebx
jle .L8
cltq
vmovsd (%r15,%rax,8), %xmm0
vmovsd (%r12,%rax,8), %xmm4
vfmadd132sd 0(%r13,%rax,8), %xmm4, %xmm0
vmovsd %xmm0, (%r14,%rax,8)
.L8:
addl $1, %r11d
cmpl -84(%rbp), %r11d
jne .L12
leaq -56(%rbp), %rsi
leaq -64(%rbp), %rdi
movl %r11d, -84(%rbp)
movl %r10d, -88(%rbp)
vzeroupper
call timing
vmovsd -64(%rbp), %xmm1
vsubsd -72(%rbp), %xmm1, %xmm1
vmovsd .LC3(%rip), %xmm2
movl -84(%rbp), %r11d
movl -88(%rbp), %r10d
vucomisd %xmm1, %xmm2
leal (%r11,%r11), %eax
movl %eax, -84(%rbp)
ja .L13
movl %eax, %esi
vxorpd %xmm6, %xmm6, %xmm6
vxorpd %xmm0, %xmm0, %xmm0
movl %ebx, %edx
sarl %esi
vcvtsi2sd %ebx, %xmm0, %xmm0
movl $.LC9, %edi
movl $5, %eax
vcvtsi2sd %esi, %xmm6, %xmm6
vmulsd .LC5(%rip), %xmm6, %xmm2
vmovsd .LC4(%rip), %xmm5
vmovsd .LC6(%rip), %xmm7
vmulsd %xmm0, %xmm6, %xmm4
vmulsd %xmm0, %xmm2, %xmm2
vdivsd %xmm1, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
vdivsd %xmm5, %xmm4, %xmm4
vmulsd %xmm7, %xmm2, %xmm3
vaddsd %xmm0, %xmm0, %xmm2
vmulsd .LC8(%rip), %xmm0, %xmm0
vmulsd %xmm6, %xmm2, %xmm2
vmulsd .LC7(%rip), %xmm2, %xmm2
vmulsd %xmm7, %xmm3, %xmm3
vdivsd %xmm5, %xmm0, %xmm0
vdivsd %xmm5, %xmm4, %xmm4
vdivsd %xmm1, %xmm2, %xmm2
call printf
movq %r14, %rdi
call free
movq %r12, %rdi
call free
movq %r13, %rdi
call free
addq $72, %rsp
movq %r15, %rdi
popq %rbx
popq %r12
popq %r13
.cfi_remember_state
.cfi_def_cfa 13, 0
popq %r14
popq %r15
popq %rbp
leaq -16(%r13), %rsp
.cfi_def_cfa 7, 16
popq %r13
.cfi_def_cfa_offset 8
jmp free
.p2align 4,,10
.p2align 3
.L15:
.cfi_restore_state
xorl %eax, %eax
jmp .L9
.L26:
vzeroupper
jmp .L2
.L14:
xorl %eax, %eax
jmp .L3
.cfi_endproc
.LFE24:
.size triad, .-triad
.section .rodata.str1.8
.align 8
.LC10:
.string "TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
.align 8
.LC11:
.string "Size (KByte) | runtime | MFlop/s | MB/s | MLUP/s | repeat | size"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB25:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movl $.LC10, %edi
movl $20, %ebx
call puts
movl $.LC11, %edi
call puts
.p2align 4,,10
.p2align 3
.L28:
vxorpd %xmm1, %xmm1, %xmm1
movq .LC12(%rip), %rax
vcvtsi2sd %ebx, %xmm1, %xmm1
addl $1, %ebx
vmovq %rax, %xmm0
call pow
vcvttsd2si %xmm0, %edi
call triad
cmpl $36, %ebx
jne .L28
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE25:
.size main, .-main
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0:
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.long 1907715710
.long 1048610426
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LC1:
.long 1907715710
.long 1048610426
.align 8
.LC3:
.long 2576980378
.long 1070176665
.align 8
.LC4:
.long 0
.long 1083129856
.align 8
.LC5:
.long 0
.long 1077936128
.align 8
.LC6:
.long 0
.long 1062207488
.align 8
.LC7:
.long 2696277389
.long 1051772663
.align 8
.LC8:
.long 0
.long 1075838976
.align 8
.LC12:
.long 3435973837
.long 1073007820
.ident "GCC: (GNU) 7.2.0"
.section .note.GNU-stack,"",@progbits

View File

@@ -7,7 +7,7 @@ import os
import unittest
from osaca.frontend import Frontend
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.parser import ParserAArch64, ParserX86ATT
from osaca.semantics import ArchSemantics, KernelDG, MachineModel
@@ -20,7 +20,7 @@ class TestFrontend(unittest.TestCase):
def setUpClass(self):
# set up parser and kernels
self.parser_x86 = ParserX86ATT()
self.parser_AArch64 = ParserAArch64v81()
self.parser_AArch64 = ParserAArch64()
with open(self._find_file('kernel_x86.s')) as f:
code_x86 = f.read()
with open(self._find_file('kernel_aarch64.s')) as f:
@@ -33,7 +33,7 @@ class TestFrontend(unittest.TestCase):
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
)
self.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml')
arch='tx2'
)
self.semantics_csx = ArchSemantics(
self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/x86.yml')

View File

@@ -9,7 +9,7 @@ import unittest
from collections import OrderedDict
from osaca.api import KerncraftAPI
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.parser import ParserAArch64, ParserX86ATT
class TestKerncraftAPI(unittest.TestCase):
@@ -17,7 +17,7 @@ class TestKerncraftAPI(unittest.TestCase):
def setUpClass(self):
# set up parser and kernels
self.parser_x86 = ParserX86ATT()
self.parser_AArch64 = ParserAArch64v81()
self.parser_AArch64 = ParserAArch64()
with open(self._find_file('triad_x86_iaca.s')) as f:
self.code_x86 = f.read()
with open(self._find_file('triad_arm_iaca.s')) as f:
@@ -63,7 +63,7 @@ class TestKerncraftAPI(unittest.TestCase):
('0DV', 0.0),
('1', 34.0),
('1DV', 0.0),
('2', 2.0),
('2', 3.0),
('3', 64.0),
('4', 64.0),
('5', 32.0),

View File

@@ -8,13 +8,13 @@ from collections import OrderedDict
from osaca.semantics import reduce_to_section, find_basic_blocks, find_jump_labels, \
find_basic_loop_bodies
from osaca.parser import ParserAArch64v81, ParserX86ATT
from osaca.parser import ParserAArch64, ParserX86ATT
class TestMarkerUtils(unittest.TestCase):
@classmethod
def setUpClass(self):
self.parser_AArch = ParserAArch64v81()
self.parser_AArch = ParserAArch64()
self.parser_x86 = ParserX86ATT()
with open(self._find_file('triad_arm_iaca.s')) as f:
triad_code_arm = f.read()
@@ -178,120 +178,115 @@ class TestMarkerUtils(unittest.TestCase):
def test_marker_special_cases_AArch(self):
bytes_line = '.byte 213,3,32,31\n'
mov_start = 'mov x1, #111\n'
mov_end = 'mov x1, #222\n'
prologue = 'dup v0.2d, x14\n' + ' neg x9, x9\n' + ' .p2align 6\n'
start_marker = 'mov x1, #111\n' + bytes_line
end_marker = 'mov x1, #222\n' + bytes_line
prologue = (
'dup v0.2d, x14\n'
'neg x9, x9\n'
'.p2align 6\n')
kernel = (
'.LBB0_28:\n'
+ 'fmul v7.2d, v7.2d, v19.2d\n'
+ 'stp q0, q1, [x10, #-32]\n'
+ 'b.ne .LBB0_28\n'
)
epilogue = '.LBB0_29: // Parent Loop BB0_20 Depth=1\n' + 'bl dummy\n'
kernel_length = len(list(filter(None, kernel.split('\n'))))
+ 'b.ne .LBB0_28\n')
epilogue = (
'.LBB0_29: // Parent Loop BB0_20 Depth=1\n'
'bl dummy\n')
# marker directly at the beginning
code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
beginning_parsed = self.parser_AArch.parse_file(code_beginning)
test_kernel = reduce_to_section(beginning_parsed, 'AArch64')
self.assertEqual(len(test_kernel), kernel_length)
kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
self.assertEqual(test_kernel, parsed_kernel)
samples = [
# (test name,
# ignored prologue, section to be extracted, ignored epilogue)
("markers",
prologue + start_marker, kernel, end_marker + epilogue),
("marker at file start",
start_marker, kernel, end_marker + epilogue),
("no start marker",
'', prologue + kernel, end_marker + epilogue),
("marker at file end",
prologue + start_marker, kernel, end_marker),
("no end marker",
prologue + start_marker, kernel + epilogue, ''),
("empty kernel",
prologue + start_marker, '', end_marker + epilogue),
]
# marker at the end
code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
end_parsed = self.parser_AArch.parse_file(code_end)
test_kernel = reduce_to_section(end_parsed, 'AArch64')
self.assertEqual(len(test_kernel), kernel_length)
kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
self.assertEqual(test_kernel, parsed_kernel)
# no kernel
code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
empty_parsed = self.parser_AArch.parse_file(code_empty)
test_kernel = reduce_to_section(empty_parsed, 'AArch64')
self.assertEqual(len(test_kernel), 0)
kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
self.assertEqual(test_kernel, [])
# no start marker
code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
no_start_parsed = self.parser_AArch.parse_file(code_no_start)
with self.assertRaises(LookupError):
reduce_to_section(no_start_parsed, 'AArch64')
# no end marker
code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
no_end_parsed = self.parser_AArch.parse_file(code_no_end)
with self.assertRaises(LookupError):
reduce_to_section(no_end_parsed, 'AArch64')
# no marker at all
code_no_marker = prologue + kernel + epilogue
no_marker_parsed = self.parser_AArch.parse_file(code_no_marker)
with self.assertRaises(LookupError):
reduce_to_section(no_marker_parsed, 'AArch64')
for test_name, pro, kernel, epi in samples:
code = pro + kernel + epi
parsed = self.parser_AArch.parse_file(code)
test_kernel = reduce_to_section(parsed, 'AArch64')
if kernel:
kernel_length = len(kernel.strip().split('\n'))
else:
kernel_length = 0
self.assertEqual(
len(test_kernel), kernel_length,
msg="Invalid exctracted kernel length on {!r} sample".format(test_name))
if pro:
kernel_start = len((pro).strip().split('\n'))
else:
kernel_start = 0
parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
self.assertEqual(
test_kernel, parsed_kernel,
msg="Invalid exctracted kernel on {!r}".format(test_name))
def test_marker_special_cases_x86(self):
bytes_line = '.byte 100\n.byte 103\n.byte 144\n'
mov_start = 'movl $111, %ebx\n'
mov_end = 'movl $222, %ebx\n'
prologue = 'movl -88(%rbp), %r10d\n' + 'xorl %r11d, %r11d\n' + '.p2align 4,,10\n'
bytes_line = (
'.byte 100\n'
'.byte 103\n'
'.byte 144\n')
start_marker = 'movl $111, %ebx\n' + bytes_line
end_marker = 'movl $222, %ebx\n' + bytes_line
prologue = (
'movl -88(%rbp), %r10d\n'
'xorl %r11d, %r11d\n'
'.p2align 4,,10\n')
kernel = (
'.L3: #L3\n'
+ 'vmovsd .LC1(%rip), %xmm0\n'
+ 'vmovsd %xmm0, (%r15,%rcx,8)\n'
+ 'cmpl %ecx, %ebx\n'
+ 'jle .L3\n'
)
epilogue = 'leaq -56(%rbp), %rsi\n' + 'movl %r10d, -88(%rbp)\n' + 'call timing\n'
kernel_length = len(list(filter(None, kernel.split('\n'))))
'vmovsd .LC1(%rip), %xmm0\n'
'vmovsd %xmm0, (%r15,%rcx,8)\n'
'cmpl %ecx, %ebx\n'
'jle .L3\n')
epilogue = (
'leaq -56(%rbp), %rsi\n'
'movl %r10d, -88(%rbp)\n'
'call timing\n')
samples = [
# (test name,
# ignored prologue, section to be extracted, ignored epilogue)
("markers",
prologue + start_marker, kernel, end_marker + epilogue),
("marker at file start",
start_marker, kernel, end_marker + epilogue),
("no start marker",
'', prologue + kernel, end_marker + epilogue),
("marker at file end",
prologue + start_marker, kernel, end_marker),
("no end marker",
prologue + start_marker, kernel + epilogue, ''),
("empty kernel",
prologue + start_marker, '', end_marker + epilogue),
]
# marker directly at the beginning
code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
beginning_parsed = self.parser_x86.parse_file(code_beginning)
test_kernel = reduce_to_section(beginning_parsed, 'x86')
self.assertEqual(len(test_kernel), kernel_length)
kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
self.assertEqual(test_kernel, parsed_kernel)
# marker at the end
code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
end_parsed = self.parser_x86.parse_file(code_end)
test_kernel = reduce_to_section(end_parsed, 'x86')
self.assertEqual(len(test_kernel), kernel_length)
kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
self.assertEqual(test_kernel, parsed_kernel)
# no kernel
code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
empty_parsed = self.parser_x86.parse_file(code_empty)
test_kernel = reduce_to_section(empty_parsed, 'x86')
self.assertEqual(len(test_kernel), 0)
kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
self.assertEqual(test_kernel, [])
# no start marker
code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
no_start_parsed = self.parser_x86.parse_file(code_no_start)
with self.assertRaises(LookupError):
reduce_to_section(no_start_parsed, 'x86')
# no end marker
code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
no_end_parsed = self.parser_x86.parse_file(code_no_end)
with self.assertRaises(LookupError):
reduce_to_section(no_end_parsed, 'x86')
# no marker at all
code_no_marker = prologue + kernel + epilogue
no_marker_parsed = self.parser_x86.parse_file(code_no_marker)
with self.assertRaises(LookupError):
reduce_to_section(no_marker_parsed, 'x86')
for test_name, pro, kernel, epi in samples:
code = pro + kernel + epi
parsed = self.parser_x86.parse_file(code)
test_kernel = reduce_to_section(parsed, 'x86')
if kernel:
kernel_length = len(kernel.strip().split('\n'))
else:
kernel_length = 0
self.assertEqual(
len(test_kernel), kernel_length,
msg="Invalid exctracted kernel length on {!r} sample".format(test_name))
if pro:
kernel_start = len((pro).strip().split('\n'))
else:
kernel_start = 0
parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
self.assertEqual(
test_kernel, parsed_kernel,
msg="Invalid exctracted kernel on {!r}".format(test_name))
def test_find_jump_labels(self):
self.assertEqual(find_jump_labels(self.parsed_x86),

View File

@@ -8,13 +8,13 @@ import unittest
from pyparsing import ParseException
from osaca.parser import AttrDict, ParserAArch64v81
from osaca.parser import AttrDict, ParserAArch64
class TestParserAArch64v81(unittest.TestCase):
class TestParserAArch64(unittest.TestCase):
@classmethod
def setUpClass(self):
self.parser = ParserAArch64v81()
self.parser = ParserAArch64()
with open(self._find_file('triad_arm_iaca.s')) as f:
self.triad_code = f.read()
@@ -146,8 +146,8 @@ class TestParserAArch64v81(unittest.TestCase):
def test_parse_line(self):
line_comment = '// -- Begin main'
line_label = '.LBB0_1: // =>This Inner Loop Header: Depth=1'
line_directive = '\t.cfi_def_cfa w29, -16'
line_instruction = '\tldr s0, [x11, w10, sxtw #2]\t\t// = <<2'
line_directive = '.cfi_def_cfa w29, -16'
line_instruction = 'ldr s0, [x11, w10, sxtw #2] // = <<2'
line_prefetch = 'prfm pldl1keep, [x26, #2048] //HPL'
line_preindexed = 'stp x29, x30, [sp, #-16]!'
line_postindexed = 'ldp q2, q3, [x11], #64'
@@ -201,7 +201,7 @@ class TestParserAArch64v81(unittest.TestCase):
'directive': None,
'comment': '= <<2',
'label': None,
'line': 'ldr s0, [x11, w10, sxtw #2]\t\t// = <<2',
'line': 'ldr s0, [x11, w10, sxtw #2] // = <<2',
'line_number': 4,
}
instruction_form_5 = {
@@ -309,23 +309,23 @@ class TestParserAArch64v81(unittest.TestCase):
self.assertEqual(self.parser.normalize_imd(identifier), identifier)
def test_multiple_regs(self):
instr_range = 'PUSH {r5-r7}'
instr_range = 'PUSH {x5-x7}'
reg_range = AttrDict({
'register': {
'range': [
{'prefix': 'r', 'name': '5'},
{'prefix': 'r', 'name': '7'}
{'prefix': 'x', 'name': '5'},
{'prefix': 'x', 'name': '7'}
],
'index': None
}
})
instr_list = 'POP {r5, r7, r9}'
instr_list = 'POP {x5, x7, x9}'
reg_list = AttrDict({
'register': {
'list': [
{'prefix': 'r', 'name': '5'},
{'prefix': 'r', 'name': '7'},
{'prefix': 'r', 'name': '9'}
{'prefix': 'x', 'name': '5'},
{'prefix': 'x', 'name': '7'},
{'prefix': 'x', 'name': '9'}
],
'index': None
}
@@ -411,5 +411,5 @@ class TestParserAArch64v81(unittest.TestCase):
if __name__ == '__main__':
suite = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64v81)
suite = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64)
unittest.TextTestRunner(verbosity=2).run(suite)

View File

@@ -156,8 +156,8 @@ class TestParserX86ATT(unittest.TestCase):
def test_parse_line(self):
line_comment = '# -- Begin main'
line_label = '..B1.7: # Preds ..B1.6'
line_directive = '\t\t.quad .2.3_2__kmpc_loc_pack.2 #qed'
line_instruction = '\t\tlea 2(%rax,%rax), %ecx #12.9'
line_directive = '.quad .2.3_2__kmpc_loc_pack.2 #qed'
line_instruction = 'lea 2(%rax,%rax), %ecx #12.9'
instruction_form_1 = {
'instruction': None,

View File

@@ -11,7 +11,7 @@ from subprocess import call
import networkx as nx
from osaca.osaca import get_unmatched_instruction_ratio
from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
from osaca.semantics import (INSTR_FLAGS, ArchSemantics, KernelDG,
MachineModel, reduce_to_section)
@@ -20,48 +20,43 @@ class TestSemanticTools(unittest.TestCase):
MODULE_DATA_DIR = os.path.join(
os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
)
USER_DATA_DIR = os.path.join(os.path.expanduser('~'), '.osaca/')
@classmethod
def setUpClass(self):
# copy db files in user directory
if not os.path.isdir(os.path.join(self.USER_DATA_DIR, 'data')):
os.makedirs(os.path.join(self.USER_DATA_DIR, 'data'))
call(['cp', '-r', self.MODULE_DATA_DIR, self.USER_DATA_DIR])
def setUpClass(cls):
# set up parser and kernels
self.parser_x86 = ParserX86ATT()
self.parser_AArch64 = ParserAArch64v81()
with open(self._find_file('kernel_x86.s')) as f:
self.code_x86 = f.read()
with open(self._find_file('kernel_aarch64.s')) as f:
self.code_AArch64 = f.read()
self.kernel_x86 = reduce_to_section(self.parser_x86.parse_file(self.code_x86), 'x86')
self.kernel_AArch64 = reduce_to_section(
self.parser_AArch64.parse_file(self.code_AArch64), 'aarch64'
cls.parser_x86 = ParserX86ATT()
cls.parser_AArch64 = ParserAArch64()
with open(cls._find_file('kernel_x86.s')) as f:
cls.code_x86 = f.read()
with open(cls._find_file('kernel_aarch64.s')) as f:
cls.code_AArch64 = f.read()
cls.kernel_x86 = reduce_to_section(cls.parser_x86.parse_file(cls.code_x86), 'x86')
cls.kernel_AArch64 = reduce_to_section(
cls.parser_AArch64.parse_file(cls.code_AArch64), 'aarch64'
)
# set up machine models
self.machine_model_csx = MachineModel(
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
cls.machine_model_csx = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'csx.yml')
)
self.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml')
cls.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'tx2.yml')
)
self.semantics_csx = ArchSemantics(
self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/x86.yml')
cls.semantics_csx = ArchSemantics(
cls.machine_model_csx, path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/x86.yml')
)
self.semantics_tx2 = ArchSemantics(
self.machine_model_tx2,
path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/aarch64.yml'),
cls.semantics_tx2 = ArchSemantics(
cls.machine_model_tx2,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/aarch64.yml'),
)
self.machine_model_zen = MachineModel(arch='zen1')
cls.machine_model_zen = MachineModel(arch='zen1')
for i in range(len(self.kernel_x86)):
self.semantics_csx.assign_src_dst(self.kernel_x86[i])
self.semantics_csx.assign_tp_lt(self.kernel_x86[i])
for i in range(len(self.kernel_AArch64)):
self.semantics_tx2.assign_src_dst(self.kernel_AArch64[i])
self.semantics_tx2.assign_tp_lt(self.kernel_AArch64[i])
for i in range(len(cls.kernel_x86)):
cls.semantics_csx.assign_src_dst(cls.kernel_x86[i])
cls.semantics_csx.assign_tp_lt(cls.kernel_x86[i])
for i in range(len(cls.kernel_AArch64)):
cls.semantics_tx2.assign_src_dst(cls.kernel_AArch64[i])
cls.semantics_tx2.assign_tp_lt(cls.kernel_AArch64[i])
###########
# Tests
@@ -88,28 +83,21 @@ class TestSemanticTools(unittest.TestCase):
self.assertIsNone(test_mm_x86.get_instruction(None, []))
self.assertIsNone(test_mm_arm.get_instruction(None, []))
# test dict DB creation
test_mm_x86._data['instruction_dict'] = test_mm_x86._convert_to_dict(
test_mm_x86._data['instruction_forms']
)
test_mm_arm._data['instruction_dict'] = test_mm_arm._convert_to_dict(
test_mm_arm._data['instruction_forms']
)
# test get_instruction from dict DB
self.assertIsNone(test_mm_x86.get_instruction_from_dict(None, []))
self.assertIsNone(test_mm_arm.get_instruction_from_dict(None, []))
self.assertIsNone(test_mm_x86.get_instruction_from_dict('NOT_IN_DB', []))
self.assertIsNone(test_mm_arm.get_instruction_from_dict('NOT_IN_DB', []))
# test get_instruction from DB
self.assertIsNone(test_mm_x86.get_instruction(None, []))
self.assertIsNone(test_mm_arm.get_instruction(None, []))
self.assertIsNone(test_mm_x86.get_instruction('NOT_IN_DB', []))
self.assertIsNone(test_mm_arm.get_instruction('NOT_IN_DB', []))
name_x86_1 = 'vaddpd'
operands_x86_1 = [
{'class': 'register', 'name': 'xmm'},
{'class': 'register', 'name': 'xmm'},
{'class': 'register', 'name': 'xmm'},
]
instr_form_x86_1 = test_mm_x86.get_instruction_from_dict(name_x86_1, operands_x86_1)
instr_form_x86_1 = test_mm_x86.get_instruction(name_x86_1, operands_x86_1)
self.assertEqual(instr_form_x86_1, test_mm_x86.get_instruction(name_x86_1, operands_x86_1))
self.assertEqual(
test_mm_x86.get_instruction_from_dict('jg', [{'class': 'identifier'}]),
test_mm_x86.get_instruction('jg', [{'class': 'identifier'}]),
test_mm_x86.get_instruction('jg', [{'class': 'identifier'}]),
)
name_arm_1 = 'fadd'
@@ -118,10 +106,10 @@ class TestSemanticTools(unittest.TestCase):
{'class': 'register', 'prefix': 'v', 'shape': 's'},
{'class': 'register', 'prefix': 'v', 'shape': 's'},
]
instr_form_arm_1 = test_mm_arm.get_instruction_from_dict(name_arm_1, operands_arm_1)
instr_form_arm_1 = test_mm_arm.get_instruction(name_arm_1, operands_arm_1)
self.assertEqual(instr_form_arm_1, test_mm_arm.get_instruction(name_arm_1, operands_arm_1))
self.assertEqual(
test_mm_arm.get_instruction_from_dict('b.ne', [{'class': 'identifier'}]),
test_mm_arm.get_instruction('b.ne', [{'class': 'identifier'}]),
test_mm_arm.get_instruction('b.ne', [{'class': 'identifier'}]),
)

View File

@@ -1,5 +1,5 @@
[tox]
envlist = py35,py36
envlist = py35,py36,py37,py38,py39
[testenv]
commands=
python tests/all_tests.py