version bump

fixed typo
fixed test after changing TP value of instruction
2025-12-16 00:50:06 +01:00 · 2020-11-11 15:14:27 +01:00 · 2020-11-11 14:11:00 +01:00 · 2020-11-11 14:04:07 +01:00 · 2020-11-11 13:54:23 +01:00 · 2020-11-11 12:27:49 +01:00
49 changed files with 75330 additions and 75126 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 # OSACA specific files and folders
-osaca/taxCalc/
+*.*.pickle

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,11 +3,13 @@ language: python
 python:
    - "3.5"
    - "3.6"
-# Python 3.7 not working yet
-#    - "3.7"
+    - "3.7"
+    - "3.8"
+    - "3.9"
 before_install: 
 #  - pip install tox-travis
  - pip install codecov
+  - pip install bs4
  - pip install pygraphviz
  - pip install kerncraft
 install:
@@ -24,7 +26,7 @@ deploy:
  username: "__token__"
  password:
    secure: "fRRCETOwDkJ4pFacYZghPfCQ9mSsV4PlD3sTDp8rDHoCnebPjvFYc1tIdv+Wds0ae162KNUaj9GbxjK0MTGiRcy4pD08n7ufv8snmBQ2rtOLkj7RCRg1hw30WcMHjzqScFJgQcBrpjdPmR5AlesUufh6OadGvF1NspmVRWKr8ir3KQhmNV+itAliYoqaSTRTg1zC/znm+49l5gkzlLxd+mPj5/dtcc8vZ/i2M2+nNTTjDxq71q4Ddqv+bgZV1y7OZY2YuvjEDPflUbwc3fjOxpj891uMDHodsGmEHBu8WsLpF2tAO0C/x63S0jXamkV+/4cAQqQAwWr0Lby9/BjCfUwyUMOEgZ0S+z9WoFpBpQTQEfkD2JH/UFrv4CMnLFqgDkVMcx0vc/rT4Od8eJ5wOSG5+VdniJNOLpodFOXuKc09eJMk2lE9vk9OBrcsZ09UOTPTUCMZSIP4cBDxaIkx+RHQEy63TQdJZcElRBEWGEgj2e9hbiktvIoOvbFGQDscpz7ShBDklXIpu9hnxcKHtNDEjyywTUJmx7lTMILL05DPUnpUmnMb1Gyx5lbHzhSExc9re0cxEA354UUQKBS5HwHQcEBw9stMfsaForiBAUOocUKdGqlGP9cOXFoxdC9M+ff5FNstgbjPYSowb/JbATMlmCWKgH/bXXcTGCO10sk="
-  distributions: sdist
+  distributions: "sdist bdist_wheel"
  skip_existing: true
  cleanup: false
  on:
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,6 +2,8 @@ include README.rst
 include LICENSE
 include tox.ini
 recursive-include osaca/data/ *.yml
+recursive-include osaca/data/ *.pickle
+include osaca/data/_build_cache.py
 include examples/*
 recursive-include tests *.py *.out
 recursive-include tests/testfiles/ *
--- a/README.rst
+++ b/README.rst
@@ -10,8 +10,8 @@ Open Source Architecture Code Analyzer

 For an innermost loop kernel in assembly, this tool allows automatic instruction fetching of assembly code and automatic runtime prediction including throughput analysis and detection of critical path and loop-carried dependencies.

-.. image:: https://travis-ci.org/RRZE-HPC/OSACA.svg?branch=master
-    :target: https://travis-ci.org/RRZE-HPC/OSACA
+.. image:: https://travis-ci.com/RRZE-HPC/OSACA.svg?branch=master
+    :target: https://travis-ci.com/github/RRZE-HPC/OSACA
    :alt: Build Status

 .. image:: https://codecov.io/github/RRZE-HPC/OSACA/coverage.svg?branch=master
@@ -57,8 +57,12 @@ Additional requirements are:

 -  `Python3 <https://www.python.org/>`__
 -  `Graphviz <https://www.graphviz.org/>`__ for dependency graph creation (minimal dependency is `libgraphviz-dev` on Ubuntu)
+
+Optional requirements are:
+
 -  `Kerncraft <https://github.com/RRZE-HPC/kerncraft>`__ >=v0.8.4 for marker insertion
 -   `ibench <https://github.com/RRZE-HPC/ibench>`__ or `asmbench <https://github.com/RRZE-HPC/asmbench/>`__ for throughput/latency measurements
+- `BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ for scraping instruction form information for the x86 ISA (experimental)

 Design
 ======
--- a/osaca/init.py
+++ b/osaca/init.py
@@ -1,6 +1,6 @@
 """Open Source Architecture Code Analyzer"""
 name = 'osaca'
-__version__ = '0.3.3.dev0'
+__version__ = '0.3.12'

 # To trigger travis deployment to pypi, do the following:
 # 1. Increment __version__
--- a/osaca/main.py
+++ b/osaca/main.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+from .osaca import main
+
+main()
--- a/osaca/api/kerncraft_interface.py
+++ b/osaca/api/kerncraft_interface.py
@@ -5,7 +5,7 @@ import sys
 from io import StringIO

 from osaca.frontend import Frontend
-from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.parser import ParserAArch64, ParserX86ATT
 from osaca.semantics import (INSTR_FLAGS, KernelDG, MachineModel,
                             ArchSemantics, reduce_to_section)

@@ -29,7 +29,7 @@ class KerncraftAPI(object):
        self.semantics = ArchSemantics(self.machine_model)
        isa = self.machine_model.get_ISA().lower()
        if isa == 'aarch64':
-            self.parser = ParserAArch64v81()
+            self.parser = ParserAArch64()
        elif isa == 'x86':
            self.parser = ParserX86ATT()

--- a/osaca/data/_build_cache.py
+++ b/osaca/data/_build_cache.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+from glob import glob
+import os.path
+import sys
+sys.path[0:0] = ['../..']
+
+failed = False
+try:
+    from osaca.semantics.hw_model import MachineModel
+except ModuleNotFoundError:
+    print("Unable to import MachineModel, probably some dependency is not yet installed. SKIPPING. "
+          "First run of OSACA may take a while to build caches, subsequent runs will be as fast as "
+          "ever.")
+    sys.exit()
+
+print('Building cache: ', end='')
+sys.stdout.flush()
+
+# Iterating architectures
+for f in glob(os.path.join(os.path.dirname(__file__), '*.yml')):
+    MachineModel(path_to_yaml=f)
+    print('.', end='')
+    sys.stdout.flush()
+
+# Iterating ISAs
+for f in glob(os.path.join(os.path.dirname(__file__), 'isa/*.yml')):
+    MachineModel(path_to_yaml=f)
+    print('+', end='')
+    sys.stdout.flush()
+
+print()
--- a/osaca/data/a64fx.yml
+++ b/osaca/data/a64fx.yml
--- a/osaca/data/bdw.yml
+++ b/osaca/data/bdw.yml
--- a/osaca/data/csx.yml
+++ b/osaca/data/csx.yml
--- a/osaca/data/generate_mov_entries.py
+++ b/osaca/data/generate_mov_entries.py
@@ -9,8 +9,8 @@ class MOVEntryBuilder:
        port_occupancy = defaultdict(Fraction)
        for uops, ports in port_pressure:
            for p in ports:
-              port_occupancy[p] += Fraction(uops, len(ports))
-        return float(max(list(port_occupancy.values())+[0]))
+                port_occupancy[p] += Fraction(uops, len(ports))
+        return float(max(list(port_occupancy.values()) + [0]))

    @staticmethod
    def classify(operands_types):
@@ -18,10 +18,10 @@ class MOVEntryBuilder:
        store = 'mem' in operands_types[-1:]
        assert not (load and store), "Can not process a combined load-store instruction."
        return load, store
-    
+
    def build_description(
-            self, instruction_name, operand_types, 
-            port_pressure=[], latency=0, comment=None):
+        self, instruction_name, operand_types, port_pressure=[], latency=0, comment=None
+    ):
        if comment:
            comment = "  # " + comment
        else:
@@ -32,10 +32,7 @@ class MOVEntryBuilder:
            if ot == 'imd':
                description += '  - class: immediate\n    imd: int\n'
            elif ot.startswith('mem'):
-                description += (
-                    '  - class: memory\n'
-                    '    base: "*"\n'
-                    '    offset: "*"\n')
+                description += '  - class: memory\n' '    base: "*"\n' '    offset: "*"\n'
                if ot == 'mem_simple':
                    description += '    index: ~\n'
                elif ot == 'mem_complex':
@@ -45,18 +42,20 @@ class MOVEntryBuilder:
                description += '    scale: "*"\n'
            else:
                description += '  - class: register\n    name: {}\n'.format(ot)
-        
+
        description += (
            '  latency: {latency}\n'
            '  port_pressure: {port_pressure!r}\n'
            '  throughput: {throughput}\n'
-            '  uops: {uops}\n').format(
-                latency=latency,
-                port_pressure=port_pressure,
-                throughput=self.compute_throughput(port_pressure),
-                uops=sum([i for i,p in port_pressure]))
+            '  uops: {uops}\n'
+        ).format(
+            latency=latency,
+            port_pressure=port_pressure,
+            throughput=self.compute_throughput(port_pressure),
+            uops=sum([i for i, p in port_pressure]),
+        )
        return description
-    
+
    def parse_port_pressure(self, port_pressure_str):
        """
        Example:
@@ -68,7 +67,7 @@ class MOVEntryBuilder:
                cycles, ports = p.split('*p')
                port_pressure.append([int(cycles), ports])
        return port_pressure
-    
+
    def process_item(self, instruction_form, resources):
        """
        Example:
@@ -84,9 +83,7 @@ class MOVEntryBuilder:

 class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
    # for SNB and IVB
-    def build_description(
-            self, instruction_name, operand_types, 
-            port_pressure=[], latency=0):
+    def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
        load, store = self.classify(operand_types)

        comment = None
@@ -100,15 +97,14 @@ class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
            comment = "with store"

        return MOVEntryBuilder.build_description(
-            self, instruction_name, operand_types, port_pressure, latency, comment)
+            self, instruction_name, operand_types, port_pressure, latency, comment
+        )


 class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
    # for HSW, BDW, SKX and CSX

-    def build_description(
-            self, instruction_name, operand_types, 
-            port_pressure=[], latency=0):
+    def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
        load, store = self.classify(operand_types)

        if load:
@@ -116,7 +112,8 @@ class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
            latency += 4
            comment = "with load"
            return MOVEntryBuilder.build_description(
-                self, instruction_name, operand_types, port_pressure, latency, comment)
+                self, instruction_name, operand_types, port_pressure, latency, comment
+            )
        if store:
            port_pressure_simple = port_pressure + [[1, '237'], [1, '4']]
            operands_simple = ['mem_simple' if o == 'mem' else o for o in operand_types]
@@ -125,16 +122,28 @@ class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
            latency += 0
            return (
                MOVEntryBuilder.build_description(
-                    self, instruction_name, operands_simple, port_pressure_simple, latency,
-                    "with store, simple AGU") +
-                '\n' +
-                MOVEntryBuilder.build_description(
-                    self, instruction_name, operands_complex, port_pressure_complex, latency,
-                    "with store, complex AGU"))
-        
+                    self,
+                    instruction_name,
+                    operands_simple,
+                    port_pressure_simple,
+                    latency,
+                    "with store, simple AGU",
+                )
+                + '\n'
+                + MOVEntryBuilder.build_description(
+                    self,
+                    instruction_name,
+                    operands_complex,
+                    port_pressure_complex,
+                    latency,
+                    "with store, complex AGU",
+                )
+            )
+
        # Register only:
        return MOVEntryBuilder.build_description(
-            self, instruction_name, operand_types, port_pressure, latency)
+            self, instruction_name, operand_types, port_pressure, latency
+        )


 np7 = MOVEntryBuilderIntelNoPort7AGU()
@@ -149,7 +158,6 @@ snb_mov_instructions = [
    ('mov imd gpr', ('1*p015', 1)),
    ('mov imd mem', ('', 0)),
    ('movabs imd gpr', ('1*p015', 1)),  # AT&T version
-
    # https://www.felixcloutier.com/x86/movapd
    ('movapd xmm xmm', ('1*p5', 1)),
    ('movapd xmm mem', ('', 0)),
@@ -160,7 +168,6 @@ snb_mov_instructions = [
    ('vmovapd ymm ymm', ('1*p5', 1)),
    ('vmovapd ymm mem', ('', 0)),
    ('vmovapd mem ymm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movaps
    ('movaps xmm xmm', ('1*p5', 1)),
    ('movaps xmm mem', ('', 0)),
@@ -171,7 +178,6 @@ snb_mov_instructions = [
    ('vmovaps ymm ymm', ('1*p5', 1)),
    ('movaps ymm mem', ('', 0)),
    ('movaps mem ymm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movd:movq
    ('movd gpr mm', ('1*p5', 1)),
    ('movd mem mm', ('', 0)),
@@ -197,7 +203,6 @@ snb_mov_instructions = [
    ('vmovd xmm mem', ('', 0)),
    ('vmovq xmm gpr', ('1*p0', 1)),
    ('vmovq xmm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movddup
    ('movddup xmm xmm', ('1*p5', 1)),
    ('movddup mem xmm', ('', 0)),
@@ -205,10 +210,8 @@ snb_mov_instructions = [
    ('vmovddup mem xmm', ('', 0)),
    ('vmovddup ymm ymm', ('1*p5', 1)),
    ('vmovddup mem ymm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movdq2q
    ('movdq2q xmm mm', ('1*p015+1*p5', 1)),
-
    # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
    ('movdqa xmm xmm', ('1*p015', 1)),
    ('movdqa mem xmm', ('', 0)),
@@ -219,7 +222,6 @@ snb_mov_instructions = [
    ('vmovdqa ymm ymm', ('1*p05', 1)),
    ('vmovdqa mem ymm', ('', 0)),
    ('vmovdqa ymm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
    ('movdqu xmm xmm', ('1*p015', 1)),
    ('movdqu mem xmm', ('', 0)),
@@ -230,75 +232,60 @@ snb_mov_instructions = [
    ('vmovdqu ymm ymm', ('1*p05', 1)),
    ('vmovdqu mem ymm', ('', 0)),
    ('vmovdqu ymm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movhlps
    ('movhlps xmm xmm', ('1*p5', 1)),
    ('vmovhlps xmm xmm xmm', ('1*p5', 1)),
-
    # https://www.felixcloutier.com/x86/movhpd
    ('movhpd mem xmm', ('1*p5', 1)),
    ('vmovhpd mem xmm xmm', ('1*p5', 1)),
    ('movhpd xmm mem', ('', 0)),
    ('vmovhpd mem xmm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movhps
    ('movhps mem xmm', ('1*p5', 1)),
    ('vmovhps mem xmm xmm', ('1*p5', 1)),
    ('movhps xmm mem', ('', 0)),
    ('vmovhps mem xmm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movlhps
    ('movlhps xmm xmm', ('1*p5', 1)),
    ('vmovlhps xmm xmm xmm', ('1*p5', 1)),
-
    # https://www.felixcloutier.com/x86/movlpd
    ('movlpd mem xmm', ('1*p5', 1)),
    ('vmovlpd mem xmm xmm', ('1*p5', 1)),
    ('movlpd xmm mem', ('', 0)),
    ('vmovlpd mem xmm', ('1*p5', 1)),
-    
    # https://www.felixcloutier.com/x86/movlps
    ('movlps mem xmm', ('1*p5', 1)),
    ('vmovlps mem xmm xmm', ('1*p5', 1)),
    ('movlps xmm mem', ('', 0)),
    ('vmovlps mem xmm', ('1*p5', 1)),
-
    # https://www.felixcloutier.com/x86/movmskpd
    ('movmskpd xmm gpr', ('1*p0', 2)),
    ('vmovmskpd xmm gpr', ('1*p0', 2)),
    ('vmovmskpd ymm gpr', ('1*p0', 2)),
-
    # https://www.felixcloutier.com/x86/movmskps
    ('movmskps xmm gpr', ('1*p0', 1)),
    ('vmovmskps xmm gpr', ('1*p0', 1)),
    ('vmovmskps ymm gpr', ('1*p0', 1)),
-
    # https://www.felixcloutier.com/x86/movntdq
    ('movntdq xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntdq xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntdq ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
    # https://www.felixcloutier.com/x86/movntdqa
    ('movntdqa mem xmm', ('', 0)),
    ('vmovntdqa mem xmm', ('', 0)),
    ('vmovntdqa mem ymm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movnti
    ('movnti gpr mem', ('', 0)),  # TODO NT-store: what latency to use?
-
    # https://www.felixcloutier.com/x86/movntpd
    ('movntpd xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntpd xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntpd ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
    # https://www.felixcloutier.com/x86/movntps
    ('movntps xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntps xmm mem', ('', 0)),  # TODO NT-store: what latency to use?
    ('vmovntps ymm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
    # https://www.felixcloutier.com/x86/movntq
    ('movntq mm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
    # https://www.felixcloutier.com/x86/movq
    ('movq mm mm', ('', 0)),
    ('movq mem mm', ('', 0)),
@@ -309,14 +296,11 @@ snb_mov_instructions = [
    ('vmovq xmm xmm', ('1*p015', 1)),
    ('vmovq mem xmm', ('', 0)),
    ('vmovq xmm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movq2dq
    ('movq2dq mm xmm', ('1*p015', 1)),
-
    # https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
    # TODO combined load-store is currently not supported
    # ('movs mem mem', ()),
-
    # https://www.felixcloutier.com/x86/movsd
    ('movsd xmm xmm', ('1*p5', 1)),
    ('movsd mem xmm', ('', 0)),
@@ -324,7 +308,6 @@ snb_mov_instructions = [
    ('vmovsd xmm xmm xmm', ('1*p5', 1)),
    ('vmovsd mem xmm', ('', 0)),
    ('vmovsd xmm mem', ('', 0)),
-    
    # https://www.felixcloutier.com/x86/movshdup
    ('movshdup xmm xmm', ('1*p5', 1)),
    ('movshdup mem xmm', ('', 0)),
@@ -332,7 +315,6 @@ snb_mov_instructions = [
    ('vmovshdup mem xmm', ('', 0)),
    ('vmovshdup ymm ymm', ('1*p5', 1)),
    ('vmovshdup mem ymm', ('', 0)),
-    
    # https://www.felixcloutier.com/x86/movsldup
    ('movsldup xmm xmm', ('1*p5', 1)),
    ('movsldup mem xmm', ('', 0)),
@@ -340,7 +322,6 @@ snb_mov_instructions = [
    ('vmovsldup mem xmm', ('', 0)),
    ('vmovsldup ymm ymm', ('1*p5', 1)),
    ('vmovsldup mem ymm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movss
    ('movss xmm xmm', ('1*p5', 1)),
    ('movss mem xmm', ('', 0)),
@@ -349,7 +330,6 @@ snb_mov_instructions = [
    ('vmovss xmm xmm', ('1*p5', 1)),
    ('vmovss xmm mem', ('', 0)),
    ('movss mem xmm', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movsx:movsxd
    ('movsx gpr gpr', ('1*p015', 1)),
    ('movsx mem gpr', ('', 0)),
@@ -363,7 +343,6 @@ snb_mov_instructions = [
    ('movsl mem gpr', ('', 0)),  # AT&T version
    ('movsq gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movsq mem gpr', ('', 0)),  # AT&T version
-
    # https://www.felixcloutier.com/x86/movupd
    ('movupd xmm xmm', ('1*p5', 1)),
    ('movupd mem xmm', ('', 0)),
@@ -374,7 +353,6 @@ snb_mov_instructions = [
    ('vmovupd ymm ymm', ('1*p5', 1)),
    ('vmovupd mem ymm', ('', 0)),
    ('vmovupd ymm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movups
    ('movups xmm xmm', ('1*p5', 1)),
    ('movups mem xmm', ('', 0)),
@@ -385,7 +363,6 @@ snb_mov_instructions = [
    ('vmovups ymm ymm', ('1*p5', 1)),
    ('vmovups mem ymm', ('', 0)),
    ('vmovups ymm mem', ('', 0)),
-
    # https://www.felixcloutier.com/x86/movzx
    ('movzx gpr gpr', ('1*p015', 1)),
    ('movzx mem gpr', ('', 0)),
@@ -397,7 +374,6 @@ snb_mov_instructions = [
    ('movzl mem gpr', ('', 0)),  # AT&T version
    ('movzq gpr gpr', ('1*p015', 1)),  # AT&T version
    ('movzq mem gpr', ('', 0)),  # AT&T version
-
    # https://www.felixcloutier.com/x86/cmovcc
    ('cmova gpr gpr', ('1*p015+2*p05', 2)),
    ('cmova mem gpr', ('1*p015+2*p05', 2)),
@@ -459,12 +435,10 @@ snb_mov_instructions = [
    ('cmovs mem gpr', ('1*p015+1*p05', 2)),
    ('cmovz gpr gpr', ('1*p015+1*p05', 2)),
    ('cmovz mem gpr', ('1*p015+1*p05', 2)),
-
    # https://www.felixcloutier.com/x86/pmovmskb
    ('pmovmskb mm gpr', ('1*p0', 2)),
    ('pmovmskb xmm gpr', ('1*p0', 2)),
    ('vpmovmskb xmm gpr', ('1*p0', 2)),
-    
    # https://www.felixcloutier.com/x86/pmovsx
    ('pmovsxbw xmm xmm', ('1*p15', 1)),
    ('pmovsxbw mem xmm', ('1*p15', 1)),
@@ -484,7 +458,6 @@ snb_mov_instructions = [
    ('vpmovsxbd mem ymm', ('1*p15', 1)),
    ('vpmovsxbq ymm ymm', ('1*p15', 1)),
    ('vpmovsxbq mem ymm', ('1*p15', 1)),
-
    # https://www.felixcloutier.com/x86/pmovzx
    ('pmovzxbw xmm xmm', ('1*p15', 1)),
    ('pmovzxbw mem xmm', ('1*p15', 1)),
@@ -494,307 +467,294 @@ snb_mov_instructions = [
    ('vpmovzxbw mem ymm', ('1*p15', 1)),
 ]

-ivb_mov_instructions = list(OrderedDict(snb_mov_instructions + [
-    # https://www.felixcloutier.com/x86/mov
-    ('mov gpr gpr', ('', 0)),
-    ('mov imd gpr', ('', 0)),
+ivb_mov_instructions = list(
+    OrderedDict(
+        snb_mov_instructions
+        + [
+            # https://www.felixcloutier.com/x86/mov
+            ('mov gpr gpr', ('', 0)),
+            ('mov imd gpr', ('', 0)),
+            # https://www.felixcloutier.com/x86/movapd
+            ('movapd xmm xmm', ('', 0)),
+            ('vmovapd xmm xmm', ('', 0)),
+            ('vmovapd ymm ymm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movaps
+            ('movaps xmm xmm', ('', 0)),
+            ('vmovaps xmm xmm', ('', 0)),
+            ('vmovaps ymm ymm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
+            ('movdqa xmm xmm', ('', 0)),
+            ('vmovdqa xmm xmm', ('', 0)),
+            ('vmovdqa ymm ymm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
+            ('movdqu xmm xmm', ('', 0)),
+            ('vmovdqu xmm xmm', ('', 0)),
+            ('vmovdqu ymm ymm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movupd
+            ('movupd xmm xmm', ('', 0)),
+            ('vmovupd xmm xmm', ('', 0)),
+            ('vmovupd ymm ymm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movups
+            ('movups xmm xmm', ('', 0)),
+            ('vmovups xmm xmm', ('', 0)),
+            ('vmovups ymm ymm', ('', 0)),
+        ]
+    ).items()
+)

-    # https://www.felixcloutier.com/x86/movapd
-    ('movapd xmm xmm', ('', 0)),
-    ('vmovapd xmm xmm', ('', 0)),
-    ('vmovapd ymm ymm', ('', 0)),
+hsw_mov_instructions = list(
+    OrderedDict(
+        ivb_mov_instructions
+        + [
+            # https://www.felixcloutier.com/x86/mov
+            ('mov imd gpr', ('1*p0156', 1)),
+            ('mov gpr gpr', ('1*p0156', 1)),
+            ('movabs imd gpr', ('1*p0156', 1)),  # AT&T version
+            # https://www.felixcloutier.com/x86/movbe
+            ('movbe gpr mem', ('1*p15', 6)),
+            ('movbe mem gpr', ('1*p15', 6)),
+            # https://www.felixcloutier.com/x86/movmskpd
+            ('movmskpd xmm gpr', ('1*p0', 3)),
+            ('vmovmskpd xmm gpr', ('1*p0', 3)),
+            ('vmovmskpd ymm gpr', ('1*p0', 3)),
+            # https://www.felixcloutier.com/x86/movmskps
+            ('movmskps xmm gpr', ('1*p0', 3)),
+            ('vmovmskps xmm gpr', ('1*p0', 3)),
+            ('vmovmskps ymm gpr', ('1*p0', 3)),
+            # https://www.felixcloutier.com/x86/movsx:movsxd
+            ('movsx gpr gpr', ('1*p0156', 1)),
+            ('movsb gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movsw gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movsl gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movsq gpr gpr', ('1*p0156', 1)),  # AT&T version
+            # https://www.felixcloutier.com/x86/movzx
+            ('movzx gpr gpr', ('1*p0156', 1)),
+            ('movzb gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movzw gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movzl gpr gpr', ('1*p0156', 1)),  # AT&T version
+            ('movzq gpr gpr', ('1*p0156', 1)),  # AT&T version
+            # https://www.felixcloutier.com/x86/cmovcc
+            ('cmova gpr gpr', ('1*p0156+2*p06', 2)),
+            ('cmova mem gpr', ('1*p0156+2*p06', 2)),
+            ('cmovae gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovae mem gpr', ('1*p0156+2*p06', 2)),
+            ('cmovb gpr gpr', ('1*p0156+2*p06', 2)),
+            ('cmovb mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovbe gpr gpr', ('1*p0156+2*p06', 2)),
+            ('cmovbe mem gpr', ('1*p0156+2*p06', 2)),
+            ('cmovc gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovc mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmove gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmove mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovg gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovg mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovge gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovge mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovl gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovl mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovle gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovle mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovna gpr gpr', ('1*p0156+2*p06', 2)),
+            ('cmovna mem gpr', ('1*p0156+2*p06', 2)),
+            ('cmovnae gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnae mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnb gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnb mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnbe gpr gpr', ('1*p0156+2*p06', 2)),
+            ('cmovnbe mem gpr', ('1*p0156+2*p06', 2)),
+            ('cmovnb gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnb mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnc gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnc mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovne gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovne mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovng gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovng mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnge gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnge mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnl gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnl mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovno gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovno mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnp gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnp mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovns gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovns mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnz gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovnz mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovo gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovo mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovp gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovp mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovpe gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovpe mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovpo gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovpo mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovs gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovs mem gpr', ('1*p0156+1*p06', 2)),
+            ('cmovz gpr gpr', ('1*p0156+1*p06', 2)),
+            ('cmovz mem gpr', ('1*p0156+1*p06', 2)),
+            # https://www.felixcloutier.com/x86/pmovmskb
+            ('pmovmskb mm gpr', ('1*p0', 3)),
+            ('pmovmskb xmm gpr', ('1*p0', 3)),
+            ('vpmovmskb xmm gpr', ('1*p0', 3)),
+            ('vpmovmskb ymm gpr', ('1*p0', 3)),
+            # https://www.felixcloutier.com/x86/pmovsx
+            ('pmovsxbw xmm xmm', ('1*p5', 1)),
+            ('pmovsxbw mem xmm', ('1*p5', 1)),
+            ('pmovsxbd xmm xmm', ('1*p5', 1)),
+            ('pmovsxbd mem xmm', ('1*p5', 1)),
+            ('pmovsxbq xmm xmm', ('1*p5', 1)),
+            ('pmovsxbq mem xmm', ('1*p5', 1)),
+            ('vpmovsxbw xmm xmm', ('1*p5', 1)),
+            ('vpmovsxbw mem xmm', ('1*p5', 1)),
+            ('vpmovsxbd xmm xmm', ('1*p5', 1)),
+            ('vpmovsxbd mem xmm', ('1*p5', 1)),
+            ('vpmovsxbq xmm xmm', ('1*p5', 1)),
+            ('vpmovsxbq mem xmm', ('1*p5', 1)),
+            ('vpmovsxbw ymm ymm', ('1*p5', 1)),
+            ('vpmovsxbw mem ymm', ('1*p5', 1)),
+            ('vpmovsxbd ymm ymm', ('1*p5', 1)),
+            ('vpmovsxbd mem ymm', ('1*p5', 1)),
+            ('vpmovsxbq ymm ymm', ('1*p5', 1)),
+            ('vpmovsxbq mem ymm', ('1*p5', 1)),
+            # https://www.felixcloutier.com/x86/pmovzx
+            ('pmovzxbw xmm xmm', ('1*p5', 1)),
+            ('pmovzxbw mem xmm', ('1*p5', 1)),
+            ('vpmovzxbw xmm xmm', ('1*p5', 1)),
+            ('vpmovzxbw mem xmm', ('1*p5', 1)),
+            ('vpmovzxbw ymm ymm', ('1*p5', 1)),
+            ('vpmovzxbw mem ymm', ('1*p5', 1)),
+        ]
+    ).items()
+)

-    # https://www.felixcloutier.com/x86/movaps
-    ('movaps xmm xmm', ('', 0)),
-    ('vmovaps xmm xmm', ('', 0)),
-    ('vmovaps ymm ymm', ('', 0)),
+bdw_mov_instructions = list(
+    OrderedDict(
+        hsw_mov_instructions
+        + [
+            # https://www.felixcloutier.com/x86/cmovcc
+            ('cmova gpr gpr', ('2*p06', 1)),
+            ('cmova mem gpr', ('2*p06', 1)),
+            ('cmovae gpr gpr', ('1*p06', 1)),
+            ('cmovae mem gpr', ('2*p06', 1)),
+            ('cmovb gpr gpr', ('2*p06', 1)),
+            ('cmovb mem gpr', ('1*p06', 1)),
+            ('cmovbe gpr gpr', ('2*p06', 1)),
+            ('cmovbe mem gpr', ('2*p06', 1)),
+            ('cmovc gpr gpr', ('1*p06', 1)),
+            ('cmovc mem gpr', ('1*p06', 1)),
+            ('cmove gpr gpr', ('1*p06', 1)),
+            ('cmove mem gpr', ('1*p06', 1)),
+            ('cmovg gpr gpr', ('1*p06', 1)),
+            ('cmovg mem gpr', ('1*p06', 1)),
+            ('cmovge gpr gpr', ('1*p06', 1)),
+            ('cmovge mem gpr', ('1*p06', 1)),
+            ('cmovl gpr gpr', ('1*p06', 1)),
+            ('cmovl mem gpr', ('1*p06', 1)),
+            ('cmovle gpr gpr', ('1*p06', 1)),
+            ('cmovle mem gpr', ('1*p06', 1)),
+            ('cmovna gpr gpr', ('2*p06', 1)),
+            ('cmovna mem gpr', ('2*p06', 1)),
+            ('cmovnae gpr gpr', ('1*p06', 1)),
+            ('cmovnae mem gpr', ('1*p06', 1)),
+            ('cmovnb gpr gpr', ('1*p06', 1)),
+            ('cmovnb mem gpr', ('1*p06', 1)),
+            ('cmovnbe gpr gpr', ('2*p06', 1)),
+            ('cmovnbe mem gpr', ('2*p06', 1)),
+            ('cmovnb gpr gpr', ('1*p06', 1)),
+            ('cmovnb mem gpr', ('1*p06', 1)),
+            ('cmovnc gpr gpr', ('1*p06', 1)),
+            ('cmovnc mem gpr', ('1*p06', 1)),
+            ('cmovne gpr gpr', ('1*p06', 1)),
+            ('cmovne mem gpr', ('1*p06', 1)),
+            ('cmovng gpr gpr', ('1*p06', 1)),
+            ('cmovng mem gpr', ('1*p06', 1)),
+            ('cmovnge gpr gpr', ('1*p06', 1)),
+            ('cmovnge mem gpr', ('1*p06', 1)),
+            ('cmovnl gpr gpr', ('1*p06', 1)),
+            ('cmovnl mem gpr', ('1*p06', 1)),
+            ('cmovno gpr gpr', ('1*p06', 1)),
+            ('cmovno mem gpr', ('1*p06', 1)),
+            ('cmovnp gpr gpr', ('1*p06', 1)),
+            ('cmovnp mem gpr', ('1*p06', 1)),
+            ('cmovns gpr gpr', ('1*p06', 1)),
+            ('cmovns mem gpr', ('1*p06', 1)),
+            ('cmovnz gpr gpr', ('1*p06', 1)),
+            ('cmovnz mem gpr', ('1*p06', 1)),
+            ('cmovo gpr gpr', ('1*p06', 1)),
+            ('cmovo mem gpr', ('1*p06', 1)),
+            ('cmovp gpr gpr', ('1*p06', 1)),
+            ('cmovp mem gpr', ('1*p06', 1)),
+            ('cmovpe gpr gpr', ('1*p06', 1)),
+            ('cmovpe mem gpr', ('1*p06', 1)),
+            ('cmovpo gpr gpr', ('1*p06', 1)),
+            ('cmovpo mem gpr', ('1*p06', 1)),
+            ('cmovs gpr gpr', ('1*p06', 1)),
+            ('cmovs mem gpr', ('1*p06', 1)),
+            ('cmovz gpr gpr', ('1*p06', 1)),
+            ('cmovz mem gpr', ('1*p06', 1)),
+        ]
+    ).items()
+)

-    # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
-    ('movdqa xmm xmm', ('', 0)),
-    ('vmovdqa xmm xmm', ('', 0)),
-    ('vmovdqa ymm ymm', ('', 0)),
+skx_mov_instructions = list(
+    OrderedDict(
+        bdw_mov_instructions
+        + [
+            # https://www.felixcloutier.com/x86/movapd
+            # TODO with masking!
+            # TODO the following may eliminate or be bound to 1*p0156:
+            # ('movapd xmm xmm', ('1*p5', 1)),
+            # ('vmovapd xmm xmm', ('1*p5', 1)),
+            # ('vmovapd ymm ymm', ('1*p5', 1)),
+            # https://www.felixcloutier.com/x86/movaps
+            # TODO with masking!
+            # TODO the following may eliminate or be bound to 1*p0156:
+            # ('movaps xmm xmm', ('1*p5', 1)),
+            # ('vmovaps xmm xmm', ('1*p5', 1)),
+            # ('vmovaps ymm ymm', ('1*p5', 1)),
+            # https://www.felixcloutier.com/x86/movbe
+            ('movbe gpr mem', ('1*p15', 4)),
+            ('movbe mem gpr', ('1*p15', 4)),
+            # https://www.felixcloutier.com/x86/movddup
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movntdq
+            ('vmovntdq zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
+            # https://www.felixcloutier.com/x86/movntdqa
+            ('vmovntdqa mem zmm', ('', 0)),
+            # https://www.felixcloutier.com/x86/movntpd
+            ('vmovntpd zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
+            # https://www.felixcloutier.com/x86/movntps
+            ('vmovntps zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
+            # https://www.felixcloutier.com/x86/movq2dq
+            ('movq2dq mm xmm', ('1*p0+1*p015', 1)),
+            # https://www.felixcloutier.com/x86/movsd
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movshdup
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movsldup
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movss
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movupd
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/movups
+            # TODO with masking!
+            # https://www.felixcloutier.com/x86/pmovsx
+            # TODO with masking!
+            ('vpmovsxbw ymm zmm', ('1*p5', 3)),
+            ('vpmovsxbw mem zmm', ('1*p5', 1)),
+        ]
+    ).items()
+)

-    # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
-    ('movdqu xmm xmm', ('', 0)),
-    ('vmovdqu xmm xmm', ('', 0)),
-    ('vmovdqu ymm ymm', ('', 0)),
+csx_mov_instructions = list(OrderedDict(skx_mov_instructions + []).items())  # a list, like skx

-    # https://www.felixcloutier.com/x86/movupd
-    ('movupd xmm xmm', ('', 0)),
-    ('vmovupd xmm xmm', ('', 0)),
-    ('vmovupd ymm ymm', ('', 0)),
-
-    # https://www.felixcloutier.com/x86/movupd
-    ('movups xmm xmm', ('', 0)),
-    ('vmovups xmm xmm', ('', 0)),
-    ('vmovups ymm ymm', ('', 0)),
-]).items())
-
-hsw_mov_instructions = list(OrderedDict(ivb_mov_instructions + [
-    # https://www.felixcloutier.com/x86/mov
-    ('mov imd gpr', ('1*p0156', 1)),
-    ('mov gpr gpr', ('1*p0156', 1)),
-    ('movabs imd gpr', ('1*p0156', 1)),  # AT&T version
-
-    # https://www.felixcloutier.com/x86/movbe
-    ('movbe gpr mem', ('1*p15', 6)),
-    ('movbe mem gpr', ('1*p15', 6)),
-
-    # https://www.felixcloutier.com/x86/movmskpd
-    ('movmskpd xmm gpr', ('1*p0', 3)),
-    ('vmovmskpd xmm gpr', ('1*p0', 3)),
-    ('vmovmskpd ymm gpr', ('1*p0', 3)),
-
-    # https://www.felixcloutier.com/x86/movmskps
-    ('movmskps xmm gpr', ('1*p0', 3)),
-    ('vmovmskps xmm gpr', ('1*p0', 3)),
-    ('vmovmskps ymm gpr', ('1*p0', 3)),
-
-    # https://www.felixcloutier.com/x86/movsx:movsxd
-    ('movsx gpr gpr', ('1*p0156', 1)),
-    ('movsb gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movsw gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movsl gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movsq gpr gpr', ('1*p0156', 1)),  # AT&T version
-
-    # https://www.felixcloutier.com/x86/movzx
-    ('movzx gpr gpr', ('1*p0156', 1)),
-    ('movzb gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movzw gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movzl gpr gpr', ('1*p0156', 1)),  # AT&T version
-    ('movzq gpr gpr', ('1*p0156', 1)),  # AT&T version
-
-    # https://www.felixcloutier.com/x86/cmovcc
-    ('cmova gpr gpr',   ('1*p0156+2*p06', 2)),
-    ('cmova mem gpr',   ('1*p0156+2*p06', 2)),
-    ('cmovae gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovae mem gpr',  ('1*p0156+2*p06', 2)),
-    ('cmovb gpr gpr',   ('1*p0156+2*p06', 2)),
-    ('cmovb mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovbe gpr gpr',  ('1*p0156+2*p06', 2)),
-    ('cmovbe mem gpr',  ('1*p0156+2*p06', 2)),
-    ('cmovc gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovc mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmove gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmove mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovg gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovg mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovge gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovge mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovl gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovl mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovle gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovle mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovna gpr gpr',  ('1*p0156+2*p06', 2)),
-    ('cmovna mem gpr',  ('1*p0156+2*p06', 2)),
-    ('cmovnae gpr gpr', ('1*p0156+1*p06', 2)),
-    ('cmovnae mem gpr', ('1*p0156+1*p06', 2)),
-    ('cmovnb gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnb mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnbe gpr gpr', ('1*p0156+2*p06', 2)),
-    ('cmovnbe mem gpr', ('1*p0156+2*p06', 2)),
-    ('cmovnb gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnb mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnc gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnc mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovne gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovne mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovng gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovng mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnge gpr gpr', ('1*p0156+1*p06', 2)),
-    ('cmovnge mem gpr', ('1*p0156+1*p06', 2)),
-    ('cmovnl gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnl mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovno gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovno mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnp gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnp mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovns gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovns mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnz gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovnz mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovo gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovo mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovp gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovp mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovpe gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovpe mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovpo gpr gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovpo mem gpr',  ('1*p0156+1*p06', 2)),
-    ('cmovs gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovs mem gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovz gpr gpr',   ('1*p0156+1*p06', 2)),
-    ('cmovz mem gpr',   ('1*p0156+1*p06', 2)),
-
-    # https://www.felixcloutier.com/x86/pmovmskb
-    ('pmovmskb mm gpr', ('1*p0', 3)),
-    ('pmovmskb xmm gpr', ('1*p0', 3)),
-    ('vpmovmskb xmm gpr', ('1*p0', 3)),
-    ('vpmovmskb ymm gpr', ('1*p0', 3)),
-
-    # https://www.felixcloutier.com/x86/pmovsx
-    ('pmovsxbw xmm xmm', ('1*p5', 1)),
-    ('pmovsxbw mem xmm', ('1*p5', 1)),
-    ('pmovsxbd xmm xmm', ('1*p5', 1)),
-    ('pmovsxbd mem xmm', ('1*p5', 1)),
-    ('pmovsxbq xmm xmm', ('1*p5', 1)),
-    ('pmovsxbq mem xmm', ('1*p5', 1)),
-    ('vpmovsxbw xmm xmm', ('1*p5', 1)),
-    ('vpmovsxbw mem xmm', ('1*p5', 1)),
-    ('vpmovsxbd xmm xmm', ('1*p5', 1)),
-    ('vpmovsxbd mem xmm', ('1*p5', 1)),
-    ('vpmovsxbq xmm xmm', ('1*p5', 1)),
-    ('vpmovsxbq mem xmm', ('1*p5', 1)),
-    ('vpmovsxbw ymm ymm', ('1*p5', 1)),
-    ('vpmovsxbw mem ymm', ('1*p5', 1)),
-    ('vpmovsxbd ymm ymm', ('1*p5', 1)),
-    ('vpmovsxbd mem ymm', ('1*p5', 1)),
-    ('vpmovsxbq ymm ymm', ('1*p5', 1)),
-    ('vpmovsxbq mem ymm', ('1*p5', 1)),
-
-    # https://www.felixcloutier.com/x86/pmovzx
-    ('pmovzxbw xmm xmm', ('1*p5', 1)),
-    ('pmovzxbw mem xmm', ('1*p5', 1)),
-    ('vpmovzxbw xmm xmm', ('1*p5', 1)),
-    ('vpmovzxbw mem xmm', ('1*p5', 1)),
-    ('vpmovzxbw ymm ymm', ('1*p5', 1)),
-    ('vpmovzxbw mem ymm', ('1*p5', 1)),
-]).items())
-
-bdw_mov_instructions = list(OrderedDict(hsw_mov_instructions + [
-    # https://www.felixcloutier.com/x86/cmovcc
-    ('cmova gpr gpr',   ('2*p06', 1)),
-    ('cmova mem gpr',   ('2*p06', 1)),
-    ('cmovae gpr gpr',  ('1*p06', 1)),
-    ('cmovae mem gpr',  ('2*p06', 1)),
-    ('cmovb gpr gpr',   ('2*p06', 1)),
-    ('cmovb mem gpr',   ('1*p06', 1)),
-    ('cmovbe gpr gpr',  ('2*p06', 1)),
-    ('cmovbe mem gpr',  ('2*p06', 1)),
-    ('cmovc gpr gpr',   ('1*p06', 1)),
-    ('cmovc mem gpr',   ('1*p06', 1)),
-    ('cmove gpr gpr',   ('1*p06', 1)),
-    ('cmove mem gpr',   ('1*p06', 1)),
-    ('cmovg gpr gpr',   ('1*p06', 1)),
-    ('cmovg mem gpr',   ('1*p06', 1)),
-    ('cmovge gpr gpr',  ('1*p06', 1)),
-    ('cmovge mem gpr',  ('1*p06', 1)),
-    ('cmovl gpr gpr',   ('1*p06', 1)),
-    ('cmovl mem gpr',   ('1*p06', 1)),
-    ('cmovle gpr gpr',  ('1*p06', 1)),
-    ('cmovle mem gpr',  ('1*p06', 1)),
-    ('cmovna gpr gpr',  ('2*p06', 1)),
-    ('cmovna mem gpr',  ('2*p06', 1)),
-    ('cmovnae gpr gpr', ('1*p06', 1)),
-    ('cmovnae mem gpr', ('1*p06', 1)),
-    ('cmovnb gpr gpr',  ('1*p06', 1)),
-    ('cmovnb mem gpr',  ('1*p06', 1)),
-    ('cmovnbe gpr gpr', ('2*p06', 1)),
-    ('cmovnbe mem gpr', ('2*p06', 1)),
-    ('cmovnb gpr gpr',  ('1*p06', 1)),
-    ('cmovnb mem gpr',  ('1*p06', 1)),
-    ('cmovnc gpr gpr',  ('1*p06', 1)),
-    ('cmovnc mem gpr',  ('1*p06', 1)),
-    ('cmovne gpr gpr',  ('1*p06', 1)),
-    ('cmovne mem gpr',  ('1*p06', 1)),
-    ('cmovng gpr gpr',  ('1*p06', 1)),
-    ('cmovng mem gpr',  ('1*p06', 1)),
-    ('cmovnge gpr gpr', ('1*p06', 1)),
-    ('cmovnge mem gpr', ('1*p06', 1)),
-    ('cmovnl gpr gpr',  ('1*p06', 1)),
-    ('cmovnl mem gpr',  ('1*p06', 1)),
-    ('cmovno gpr gpr',  ('1*p06', 1)),
-    ('cmovno mem gpr',  ('1*p06', 1)),
-    ('cmovnp gpr gpr',  ('1*p06', 1)),
-    ('cmovnp mem gpr',  ('1*p06', 1)),
-    ('cmovns gpr gpr',  ('1*p06', 1)),
-    ('cmovns mem gpr',  ('1*p06', 1)),
-    ('cmovnz gpr gpr',  ('1*p06', 1)),
-    ('cmovnz mem gpr',  ('1*p06', 1)),
-    ('cmovo gpr gpr',   ('1*p06', 1)),
-    ('cmovo mem gpr',   ('1*p06', 1)),
-    ('cmovp gpr gpr',   ('1*p06', 1)),
-    ('cmovp mem gpr',   ('1*p06', 1)),
-    ('cmovpe gpr gpr',  ('1*p06', 1)),
-    ('cmovpe mem gpr',  ('1*p06', 1)),
-    ('cmovpo gpr gpr',  ('1*p06', 1)),
-    ('cmovpo mem gpr',  ('1*p06', 1)),
-    ('cmovs gpr gpr',   ('1*p06', 1)),
-    ('cmovs mem gpr',   ('1*p06', 1)),
-    ('cmovz gpr gpr',   ('1*p06', 1)),
-    ('cmovz mem gpr',   ('1*p06', 1)),
-]).items())
-
-skx_mov_instructions = list(OrderedDict(bdw_mov_instructions + [
-    # https://www.felixcloutier.com/x86/movapd
-    # TODO with masking!
-    # TODO the following may eliminate or be bound to 1*p0156:
-    # ('movapd xmm xmm', ('1*p5', 1)),
-    # ('vmovapd xmm xmm', ('1*p5', 1)),
-    # ('vmovapd ymm ymm', ('1*p5', 1)),
-
-    # https://www.felixcloutier.com/x86/movaps
-    # TODO with masking!
-    # TODO the following may eliminate or be bound to 1*p0156:
-    # ('movaps xmm xmm', ('1*p5', 1)),
-    # ('vmovaps xmm xmm', ('1*p5', 1)),
-    # ('vmovaps ymm ymm', ('1*p5', 1)),
-
-    # https://www.felixcloutier.com/x86/movbe
-    ('movbe gpr mem', ('1*p15', 4)),
-    ('movbe mem gpr', ('1*p15', 4)),
-
-    # https://www.felixcloutier.com/x86/movddup
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movntdq
-    ('vmovntdq zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
-    # https://www.felixcloutier.com/x86/movntdqa
-    ('vmovntdqa mem zmm', ('', 0)),
-
-    # https://www.felixcloutier.com/x86/movntpd
-    ('vmovntpd zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
-    # https://www.felixcloutier.com/x86/movntps
-    ('vmovntps zmm mem', ('', 0)),  # TODO NT-store: what latency to use?
-
-    # https://www.felixcloutier.com/x86/movq2dq
-    ('movq2dq mm xmm', ('1*p0+1*p015', 1)),
-
-    # https://www.felixcloutier.com/x86/movsd
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movshdup
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movsldup
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movss
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movupd
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/movups
-    # TODO with masking!
-
-    # https://www.felixcloutier.com/x86/pmovsx
-    # TODO with masking!
-    ('vpmovsxbw ymm zmm', ('1*p5', 3)),
-    ('vpmovsxbw mem zmm', ('1*p5', 1)),
-]).items())
-
-csx_mov_instructions = OrderedDict(skx_mov_instructions + [
-
-]).items()

 def get_description(arch, rhs_comment=None):
    descriptions = {
@@ -803,7 +763,7 @@ def get_description(arch, rhs_comment=None):
        'hsw': '\n'.join([p7.process_item(*item) for item in hsw_mov_instructions]),
        'bdw': '\n'.join([p7.process_item(*item) for item in bdw_mov_instructions]),
        'skx': '\n'.join([p7.process_item(*item) for item in skx_mov_instructions]),
-        'csx': '\n'.join([p7.process_item(*item) for item in csx_mov_instructions])
+        'csx': '\n'.join([p7.process_item(*item) for item in csx_mov_instructions]),
    }

    description = descriptions[arch]
@@ -813,20 +773,21 @@ def get_description(arch, rhs_comment=None):

        commented_description = ""
        for l in descriptions[arch].split('\n'):
-            commented_description += ("{:<"+str(max_length)+"}  # {}\n").format(l, rhs_comment)
+            commented_description += ("{:<" + str(max_length) + "}  # {}\n").format(l, rhs_comment)
        description = commented_description

    return description

+
 if __name__ == '__main__':
    import sys
+
    if len(sys.argv) != 2:
        print("Usage: {} (snb|ivb|hsw|bdw|skx|csx)".format(sys.argv[0]))
        sys.exit(0)
-    
+
    try:
        print(get_description(sys.argv[1], rhs_comment=' '.join(sys.argv)))
    except KeyError:
        print("Unknown architecture.")
        sys.exit(1)
-    
--- a/osaca/data/hsw.yml
+++ b/osaca/data/hsw.yml
@@ -1,4 +1,4 @@
-osaca_version: 0.3.2
+osaca_version: 0.3.4
 micro_architecture: Intel Haswell
 arch_code: HSW
 isa: x86
--- a/osaca/data/icl.yml
+++ b/osaca/data/icl.yml
--- a/osaca/data/isa/aarch64.yml
+++ b/osaca/data/isa/aarch64.yml
@@ -1,52 +1,35 @@
-osaca_version: 0.3.0
+osaca_version: 0.3.7
 isa: "AArch64"
 # Contains all operand-irregular instruction forms OSACA supports for AArch64.
 # Operand-regular for a AArch64 instruction form with N operands in the shape of
 #   mnemonic op1 ... opN
 # means that op1 is the only destination operand and op2 to op(N) are source operands.
 instruction_forms:
-    - name: "fmla"
+    - name: fmla
      operands:
        - class: "register"
-          prefix: "v"
-          shape: "s"
+          prefix: "*"
+          shape: "*"
          source: true
          destination: true
        - class: "register"
-          prefix: "v"
-          shape: "s"
+          prefix: "*"
+          shape: "*"
          source: true
          destination: false
        - class: "register"
-          prefix: "v"
-          shape: "s"
+          prefix: "*"
+          shape: "*"
          source: true
          destination: false
-    - name: "fmla"
+    - name: ldp
      operands:
        - class: "register"
-          prefix: "v"
-          shape: "d"
-          source: true
-          destination: true
-        - class: "register"
-          prefix: "v"
-          shape: "d"
-          source: true
-          destination: false
-        - class: "register"
-          prefix: "v"
-          shape: "d"
-          source: true
-          destination: false
-    - name: "ldp"
-      operands:
-        - class: "register"
-          prefix: "d"
+          prefix: "*"
          source: false
          destination: true
        - class: "register"
-          prefix: "d"
+          prefix: "*"
          source: false
          destination: true
        - class: "memory"
@@ -54,18 +37,14 @@ instruction_forms:
          offset: "*"
          index: "*"
          scale: "*"
-          pre-indexed: false
-          post-indexed: false
+          pre-indexed: "*"
+          post-indexed: "*"
          source: true
          destination: false
-    - name: "ldp"
+    - name: [ldr, ldur]
      operands:
        - class: "register"
-          prefix: "q"
-          source: false
-          destination: true
-        - class: "register"
-          prefix: "q"
+          prefix: "*"
          source: false
          destination: true
        - class: "memory"
@@ -73,90 +52,18 @@ instruction_forms:
          offset: "*"
          index: "*"
          scale: "*"
-          pre-indexed: false
-          post-indexed: false
+          pre-indexed: "*"
+          post-indexed: "*"
          source: true
          destination: false
-    - name: "ldp"
+    - name: stp
      operands:
        - class: "register"
-          prefix: "q"
-          source: false
-          destination: true
-        - class: "register"
-          prefix: "q"
-          source: false
-          destination: true
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: true
-          post-indexed: false
-          source: true
-          destination: true
-    - name: "ldp"
-      operands:
-        - class: "register"
-          prefix: "q"
-          source: false
-          destination: true
-        - class: "register"
-          prefix: "q"
-          source: false
-          destination: true
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: false
-          post-indexed: true
-          source: true
-          destination: true
-    - name: "stp"
-      operands:
-        - class: "register"
-          prefix: "d"
+          prefix: "*"
          source: true
          destination: false
        - class: "register"
-          prefix: "d"
-          source: true
-          destination: false
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: false
-          post-indexed: false
-          source: false
-          destination: true   
-    - name: "stp"
-      operands:
-        - class: "register"
-          prefix: "q"
-          source: true
-          destination: false
-        - class: "register"
-          prefix: "q"
-          source: true
-          destination: false
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: false
-          post-indexed: false
-          source: false
-          destination: true
-    - name: "str"
-      operands:
-        - class: "register"
-          prefix: "x"
+          prefix: "*"
          source: true
          destination: false
        - class: "memory"
@@ -168,10 +75,10 @@ instruction_forms:
          post-indexed: "*"
          source: false
          destination: true
-    - name: "str"
+    - name: [str, stur]
      operands:
        - class: "register"
-          prefix: "d"
+          prefix: "*"
          source: true
          destination: false
        - class: "memory"
@@ -183,48 +90,73 @@ instruction_forms:
          post-indexed: "*"
          source: false
          destination: true
-    - name: "str"
+    - name: cmp
      operands:
        - class: "register"
-          prefix: "q"
+          prefix: "*"
          source: true
          destination: false
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: "*"
-          post-indexed: "*"
-          source: false
-          destination: true
-    - name: "stur"
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+    - name: cmp
      operands:
        - class: "register"
-          prefix: "q"
+          prefix: "*"
          source: true
          destination: false
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: "*"
-          post-indexed: "*"
-          source: false
-          destination: true
-    - name: "stur"
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+    - name: cmn
      operands:
        - class: "register"
-          prefix: "d"
+          prefix: "*"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+    - name: cmn
+      operands:
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+    - name: fcmp
+      operands:
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+    - name: fcmp
+      operands:
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+        - class: "immediate"
+          imd: "double"
+          source: true
+          destination: false
+    - name: fcmp
+      operands:
+        - class: "register"
+          prefix: "*"
+          source: true
+          destination: false
+        - class: "immediate"
+          imd: "float"
          source: true
          destination: false
-        - class: "memory"
-          base: "*"
-          offset: "*"
-          index: "*"
-          scale: "*"
-          pre-indexed: "*"
-          post-indexed: "*"
-          source: false
-          destination: true
--- a/osaca/data/isa/x86.yml
+++ b/osaca/data/isa/x86.yml
--- a/osaca/data/ivb.yml
+++ b/osaca/data/ivb.yml
--- a/osaca/data/model_importer.py
+++ b/osaca/data/model_importer.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
-import os.path
 import argparse
+import os.path
 import sys
 import xml.etree.ElementTree as ET
 from distutils.version import StrictVersion
@@ -8,8 +8,23 @@ from distutils.version import StrictVersion
 from osaca.parser import get_parser
 from osaca.semantics import MachineModel

-intel_archs = ['CON', 'WOL', 'NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 
-               'CNL', 'ICL']
+intel_archs = [
+    'CON',
+    'WOL',
+    'NHM',
+    'WSM',
+    'SNB',
+    'IVB',
+    'HSW',
+    'BDW',
+    'SKL',
+    'SKX',
+    'KBL',
+    'CFL',
+    'CNL',
+    'ICL',
+]
+amd_archs = ['ZEN1', 'ZEN+', 'ZEN2']


 def port_pressure_from_tag_attributes(attrib):
@@ -19,6 +34,7 @@ def port_pressure_from_tag_attributes(attrib):
    for p in attrib['ports'].split('+'):
        cycles, ports = p.split('*')
        ports = ports.lstrip('p')
+        ports = ports.lstrip('FP')
        port_occupation.append([int(cycles), ports])

    # Also consider div on DIV pipeline
@@ -88,10 +104,10 @@ def extract_paramters(instruction_tag, parser, isa):
    return parameters


-def extract_model(tree, arch):
+def extract_model(tree, arch, skip_mem=True):
    try:
        isa = MachineModel.get_isa_for_arch(arch)
-    except:
+    except Exception:
        print("Skipping...", file=sys.stderr)
        return None
    mm = MachineModel(isa=isa)
@@ -101,6 +117,7 @@ def extract_model(tree, arch):
        ignore = False

        mnemonic = instruction_tag.attrib['asm']
+        iform = instruction_tag.attrib['iform']
        # skip any mnemonic which contain spaces (e.g., "REX CRC32")
        if ' ' in mnemonic:
            continue
@@ -118,6 +135,26 @@ def extract_model(tree, arch):
        arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
        if arch_tag is None:
            continue
+        # skip any instructions without port utilization
+        if not any(['ports' in x.attrib for x in arch_tag.findall('measurement')]):
+            print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
+            continue
+        # skip if computed and measured TP don't match
+        if not [x.attrib['TP_ports'] == x.attrib['TP'] for x in arch_tag.findall('measurement')][
+            0
+        ]:
+            print(
+                "Calculated TP from port utilization doesn't match TP, skip: ",
+                iform,
+                file=sys.stderr,
+            )
+            continue
+        # skip if instruction contains memory operand
+        if skip_mem and any(
+            [x.attrib['type'] == 'mem' for x in instruction_tag.findall('operand')]
+        ):
+            print("Contains memory operand, skip: ", iform, file=sys.stderr)
+            continue
        # We collect all measurement and IACA information and compare them later
        for measurement_tag in arch_tag.iter('measurement'):
            if 'TP_ports' in measurement_tag.attrib:
@@ -143,10 +180,14 @@ def extract_model(tree, arch):
                    if 'max_cycles' in l_tag.attrib
                ]
            if latencies[1:] != latencies[:-1]:
-                print("Contradicting latencies found, using first:", mnemonic, latencies,
-                      file=sys.stderr)
+                print(
+                    "Contradicting latencies found, using smallest:",
+                    iform,
+                    latencies,
+                    file=sys.stderr,
+                )
            if latencies:
-                latency = latencies[0]
+                latency = min(latencies)
        if ignore:
            continue

@@ -160,16 +201,14 @@ def extract_model(tree, arch):
        # Check if all are equal
        if port_pressure:
            if port_pressure[1:] != port_pressure[:-1]:
-                print(
-                    "Contradicting port occupancies, using latest IACA:",
-                    mnemonic, file=sys.stderr)
+                print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr)
            port_pressure = port_pressure[-1]
        else:
            # print("No data available for this architecture:", mnemonic, file=sys.stderr)
            continue
-        
+
        # Adding Intel's 2D and 3D pipelines on Intel µarchs, without Ice Lake:
-        if arch.upper() in intel_archs and not arch.upper() in ['ICL']:  
+        if arch.upper() in intel_archs and not arch.upper() in ['ICL']:
            if any([p['class'] == 'memory' for p in parameters]):
                # We have a memory parameter, if ports 2 & 3 are present, also add 2D & 3D
                # TODO remove port7 on 'hsw' onward and split entries depending on addressing mode
@@ -180,10 +219,16 @@ def extract_model(tree, arch):
                        port_23 = True
                    if '4' in pp[1]:
                        port_4 = True
-                # Add (1, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
+                # Add (X, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
+                # X = 2 on SNB and IVB IFF used in combination with ymm register, otherwise X = 1
+                if arch.upper() in ['SNB', 'IVB'] and \
+                    any([p['class'] == 'register' and p['name'] == 'ymm' for p in parameters]):
+                    data_port_throughput = 2
+                else:
+                    data_port_throughput = 1
                if port_23 and not port_4:
-                    port_pressure.append((1, ['2D', '3D']))
-        
+                    port_pressure.append((data_port_throughput, ['2D', '3D']))
+
        # Add missing ports:
        for ports in [pp[1] for pp in port_pressure]:
            for p in ports:
@@ -201,7 +246,7 @@ def rhs_comment(uncommented_string, comment):

    commented_string = ""
    for l in uncommented_string.split('\n'):
-        commented_string += ("{:<"+str(max_length)+"}  # {}\n").format(l, comment)
+        commented_string += ("{:<" + str(max_length) + "}  # {}\n").format(l, comment)
    return commented_string


@@ -218,21 +263,33 @@ def main():
        help='architecture to extract, use IACA abbreviations (e.g., SNB). '
        'if not given, all will be extracted and saved to file in CWD.',
    )
+    parser.add_argument(
+        '--mem',
+        dest='skip_mem',
+        action='store_false',
+        help='add instruction forms including memory addressing operands, which are '
+        'skipped by default'
+    )
    args = parser.parse_args()
    basename = os.path.basename(__file__)

    tree = ET.parse(args.xml)
-    print('Available architectures:', ', '.join(architectures(tree)))
+    print('# Available architectures:', ', '.join(architectures(tree)))
    if args.arch:
-        model = extract_model(tree, args.arch)
+        print('# Chosen architecture: {}'.format(args.arch))
+        model = extract_model(tree, args.arch, args.skip_mem)
        if model is not None:
-            print(rhs_comment(model.dump(), basename+" "+sys.argv[0]))
+            print(
+                rhs_comment(
+                    model.dump(), "uops.info import"
+                )
+            )
    else:
        for arch in architectures(tree):
            print(arch, end='')
-            model = extract_model(tree, arch.lower())
+            model = extract_model(tree, arch.lower(), args.skip_mem)
            if model:
-                model_string = rhs_comment(model.dump(), basename+" "+arch)
+                model_string = rhs_comment(model.dump(), basename + " " + arch)

                with open('{}.yml'.format(arch.lower()), 'w') as f:
                    f.write(model_string)
--- a/osaca/data/n1.yml
+++ b/osaca/data/n1.yml
@@ -0,0 +1,771 @@
+osaca_version: 0.3.4
+micro_architecture: Arm Neoverse N1 
+arch_code: n1
+isa: AArch64
+ROB_size: 128                   # wikichip
+retired_uOps_per_cycle: 8       # wikichip
+scheduler_size: 120             # wikichip
+hidden_loads: false
+load_latency: {w: 4.0, x: 4.0, b: 4.0, h: 4.0, s: 4.0, d: 5.0, q: 6.0, v: 5.0, z: 4.0}
+load_throughput:
+- {base: x, index: ~, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure:   [[1, '67']]}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:  [[1, '67'], [1, '123']]}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:   [[1, '67'], [1, '123']]}
+- {base: x, index: ~, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:  [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:    [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: false, post-indexed: false, port_pressure:   [[1, '67']]}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:     [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: ~, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:    [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: true, port_pressure:  [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: false, post-indexed: false, port_pressure: [[1, '67']]}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: true, port_pressure:   [[1, '67'], [1, '123']]}
+- {base: x, index: x, offset: imd, scale: 1, pre-indexed: true, post-indexed: false, port_pressure:  [[1, '67'], [1, '123']]}
+load_throughput_default: [[1, '67']]
+store_throughput: []
+store_throughput_default: [[1, '56'], [1, '67']]
+ports: ['0', '1', '2', '3', '4', '4DV', '5', '6', '7']
+port_model_scheme: |
+  +----------------------------------------------------------------------------+
+  |                                120 entries                                 |
+  +----------------------------------------------------------------------------+
+    0 |BR   1 |IS0  2 |IS1  3 |IM0     4 |FP0     5 |FP1     6 |LDST   7 |LDST
+      \/      \/      \/      \/         \/         \/         \/        \/
+   +------+ +-----+ +-----+ +-----+  +--------+ +--------+  +-------+ +-------+ 
+   |Branch| | INT | | INT | | INT |  | FP ALU | | FP ALU |  |  AGU  | |  AGU  |
+   +------+ | ALU | | ALU | | ALU |  +--------+ +--------+  +-------+ +-------+
+            +-----+ +-----+ +-----+  +--------+ +--------+  +-------+ +-------+
+                    +-----+ +-----+  | FP MUL | | FP MUL |  |LD DATA| |LD DATA|
+                    | ST  | | INT |  +--------+ +--------+  +-------+ +-------+
+                    | INT | | MUL |  +--------+ +---------+
+                    +-----+ +-----+  | FP DIV | |SIMD SHFT|
+                            +-----+  +--------+ +---------+
+                            | INT |  +--------+ +--------+
+                            | DIV |  |  FMA   | |  FMA   |
+                            +-----+  +--------+ +--------+
+                            +-----+  +--------+ +--------+
+                            |SHIFT|  | ST SIMD| | ST SIMD|
+                            +-----+  |  DATA  | |  DATA  |
+                            +-----+  +--------+ +--------+
+                            | ST  |
+                            | INT |
+                            +-----+
+instruction_forms:
+- name: add
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '123']]
+- name: add
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '123']]
+- name: adds
+  operands:
+  - class: register
+    prefix: x 
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '132']]
+- name: b.ne
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '0']]
+- name: b.gt
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '0']]
+- name: bne
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '0']]
+- name: cmp
+  operands:
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '123']]
+- name: cmp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.3333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '123']]
+- name: dup
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: d
+    width: '*'
+  - class: register
+    prefix: d
+    width: '*'
+  - class: register
+    prefix: d
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fdiv
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: 128
+  - class: register
+    prefix: v
+    shape: s
+    width: 128
+  - class: register
+    prefix: v
+    shape: s
+    width: 128
+  throughput: 6.0
+  latency: 8.0  # 	1*p4+6*p4DV
+  port_pressure: [[1, '4'], [6, [4DV]]]
+- name: fdiv
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: 128
+  - class: register
+    prefix: v
+    shape: d
+    width: 128
+  - class: register
+    prefix: v
+    shape: d
+    width: 128
+  throughput: 10.0
+  latency: 12.0  # 	1*p4+10*p4DV
+  port_pressure: [[1, '4'], [10, [4DV]]]  # 1 cycle on port 4, not 4 cycles on branch port 0
+- name: fmla
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fmla
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fmov
+  operands:
+  - {class: register, prefix: s}
+  - {class: immediate, imd: double}
+  latency: ~  # 	1*p45
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fmul
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  throughput: 0.5
+  latency: 3.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fmul
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 0.5
+  latency: 3.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fmul
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  throughput: 0.5
+  latency: 3.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: frecpe
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  throughput: 2.0
+  latency: 4.0  # 	2*p4
+  port_pressure: [[2, '4']]
+- name: frecpe
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 1.0
+  latency: 3.0  # 	1*p4
+  port_pressure: [[1, '4']]
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  - class: register
+    prefix: v
+    shape: s
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  - class: register
+    prefix: v
+    shape: d
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 5.0  # 	2*p67, from n1 opt guide
+  port_pressure: [[2, '67']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 5.0  # 	2*p67+1*p123, from n1 opt guide
+  port_pressure: [[2, '67'], [1, '123']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 7.0  # 	2*p67, from n1 opt guide
+  port_pressure: [[2, '67']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: ~
+    index: ~
+    scale: 1
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 7.0  # 	2*p67+1*p123, from n1 opt guide
+  port_pressure: [[2, '67'], [1, '123']]  # loads issue on the LDST ports 6/7; port 5 is FP1
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 7.0  # 	2*p67
+  port_pressure: [[2, '67']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: true
+    post-indexed: false
+  throughput: 1.0
+  latency: 7.0  # 	2*p67+1*p123
+  port_pressure: [[2, '67'], [1, '123']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 5.0  # 	2*p67+1*p123
+  port_pressure: [[2, '67'], [1, '123']]
+- name: ldur    # JL: assumed from n1 opt guide
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 6.0  # 	1*p67
+  port_pressure: [[1, '67']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 6.0  # 	1*p67
+  port_pressure: [[1, '67']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 5.0  # 	1*p67
+  port_pressure: [[1, '67']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: imd
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 5.0  # 	1*p67
+  port_pressure: [[1, '67']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  throughput: 0.5
+  latency: 5.0  # 	1*p67
+  port_pressure: [[1, '67']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
+- name: mov
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.25
+  latency: 1.0  # 	1*p3456
+  port_pressure: [[1, '3456']]
+- name: mov
+  operands:
+  - class: register
+    prefix: v
+    shape: b
+    width: '*'
+  - class: register
+    prefix: v
+    shape: b
+    width: '*'
+  throughput: 0.5
+  latency: 2.0  # 	1*p45
+  port_pressure: [[1, '45']]
+- name: stp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p45+1*p67
+  port_pressure: [[2, '45'], [1, '67']]
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 0  # 	2*p45+2*p67+1*123
+  port_pressure: [[2, '45'], [2, '67'], [1, '123']]
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p45+2*p67
+  port_pressure: [[2, '45'], [2, '67']]
+- name: stur # JL: assumed from n1 opt guide
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 0.5
+  latency: 0  # 	1*p67+1*p23
+  port_pressure: [[1, '67'], [1, '23']]  # address generation on LDST ports 6/7 (was '56'); NOTE(review): other d-register stores use p45 for data - confirm p23
+- name: stur # JL: assumed from n1 opt guide
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p67+1*p45
+  port_pressure: [[2, '67'], [1, '45']]
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 0.5
+  latency: 0  # 	1*p67+1*p23
+  port_pressure: [[1, '67'], [1, '23']]  # address generation on LDST ports 6/7, integer store data on 2/3
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 0.5
+  latency: 0  # 	1*p67+1*p45
+  port_pressure: [[1, '67'], [1, '45']]
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: true
+  throughput: 0.5
+  latency: 0  # 	1*p67+1*p45+1*p123
+  port_pressure: [[1, '67'], [1, '45'], [1, '123']]
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p67+1*p45
+  port_pressure: [[2, '67'], [1, '45']]  # two cycles spread over ports 6/7 give the 1.0 throughput above
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 0  # 	1*p67+1*p45+1*123
+  port_pressure: [[1, '67'], [1, '45'], [1, '123']]
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: true
+  throughput: 1.0
+  latency: 0  # 	1*p67+1*p23+1*p123
+  port_pressure: [[1, '67'], [1, '23'], [1, '123']]
+- name: sub
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p123
+  port_pressure: [[1, '123']]
--- a/osaca/data/skx.yml
+++ b/osaca/data/skx.yml
--- a/osaca/data/snb.yml
+++ b/osaca/data/snb.yml
--- a/osaca/data/tx2.yml
+++ b/osaca/data/tx2.yml
@@ -1,4 +1,4 @@
-osaca_version: 0.3.2
+osaca_version: 0.3.4
 micro_architecture: Thunder X2
 arch_code: tx2
 isa: AArch64
@@ -80,24 +80,114 @@ instruction_forms:
  throughput: 0.33333333
  latency: 1.0  # 	1*p012
  port_pressure: [[1, '012']]
+- name: and
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: and
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: and
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: and
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: mul
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 1.0
+  latency: 4.0  # 	1*p1
+  port_pressure: [[1, '1']]
+- name: mul
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  throughput: 1.0
+  latency: 4.0  # 	1*p1
+  port_pressure: [[1, '1']]
 - name: b.ne
  operands:
  - class: identifier
-  throughput: 0.0
+  throughput: 1.0
  latency: 0.0
-  port_pressure: []
+  port_pressure: [[1, '2']]
+- name: b.lt
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '2']]
+- name: b.hs
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '2']]
+- name: b.eq
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '2']]
+- name: b
+  operands:
+  - class: identifier
+  throughput: 1.0
+  latency: 0.0
+  port_pressure: [[1, '2']]
 - name: b.gt
  operands:
  - class: identifier
-  throughput: 0.0
+  throughput: 1.0
  latency: 0.0
-  port_pressure: []
+  port_pressure: [[1, '2']]
 - name: bne
  operands:
  - class: identifier
-  throughput: 0.0
+  throughput: 1.0
  latency: 0.0
-  port_pressure: []
+  port_pressure: [[1, '2']]
 - name: cmp
  operands:
  - class: register
@@ -107,6 +197,15 @@ instruction_forms:
  throughput: 0.33333333
  latency: 1.0  # 	1*p012
  port_pressure: [[1, '012']]
+- name: cmp
+  operands:
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
 - name: cmp
  operands:
  - class: register
@@ -126,6 +225,17 @@ instruction_forms:
  throughput: 0.5
  latency: 5.0  # 	1*p01
  port_pressure: [[1, '01']]
+- name: dup
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 5.0  # 	1*p01
+  port_pressure: [[1, '01']]
 - name: fadd
  operands:
  - class: register
@@ -267,6 +377,34 @@ instruction_forms:
  throughput: 0.5
  latency: 6.0  # 	1*p01
  port_pressure: [[1, '01']]
+- name: frecpe
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  throughput: 0.5
+  latency: 5.0  # 	1*p01
+  port_pressure: [[1, '01']]
+- name: frecpe
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  throughput: 0.5
+  latency: 5.0  # 	1*p01
+  port_pressure: [[1, '01']]
 - name: fsub
  operands:
  - class: register
@@ -295,6 +433,28 @@ instruction_forms:
  throughput: 0.5
  latency: 6.0  # 	1*p01
  port_pressure: [[1, '01']]
+- name: lsl
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: lsl
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
 - name: ldp
  operands:
  - class: register
@@ -375,6 +535,22 @@ instruction_forms:
  throughput: 1.0
  latency: 4.0  # 	2*p34
  port_pressure: [[2.0, '34']]
+- name: ldp
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 4.0  # 	2*p34
+  port_pressure: [[2.0, '34']]
 - name: ldp
  operands:
  - class: register
@@ -477,6 +653,15 @@ instruction_forms:
  throughput: 0.5
  latency: 4.0  # 	1*p34
  port_pressure: [[1.0, '34']]
+- name: ldr
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
 - name: ldr
  operands:
  - class: register
@@ -504,15 +689,42 @@ instruction_forms:
  throughput: 0.0
  latency: 0.0
  port_pressure: []
+- name: mov
+  operands:
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: mov
+  operands:
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: mov
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  throughput: 0.333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
 - name: mov
  operands:
  - class: register
    prefix: x
  - class: register
    prefix: x
-  throughput: 0.5
-  latency: 1.0  # 	1*p01
-  port_pressure: [[1, '01']]
+  throughput: 0.333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
 - name: mov
  operands:
  - class: register
@@ -540,6 +752,43 @@ instruction_forms:
  throughput: ~
  latency: ~
  port_pressure: []
+- name: ret
+  operands: []
+  throughput: 0.5
+  latency: ~  # 	1*p34
+  port_pressure: [[1, '34']]
+- name: stp
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p34+1*p5
+  port_pressure: [[2, '34'], [1, '5']]
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	2*p34+1*p5
+  port_pressure: [[2, '34'], [1, '5']]
 - name: stp
  operands:
  - class: register
@@ -616,6 +865,20 @@ instruction_forms:
  throughput: 1.0
  latency: 4.0  # 	1*p34+1*p5
  port_pressure: [[1.0, '34'], [1.0, '5']]
+- name: str
+  operands:
+  - class: register
+    prefix: w
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    pre-indexed: false
+    post-indexed: false
+  throughput: 1.0
+  latency: 0  # 	1*p34+1*p5
+  port_pressure: [[1.0, '34'], [1.0, '5']]
 - name: str
  operands:
  - class: register
@@ -700,6 +963,39 @@ instruction_forms:
  throughput: 1.0
  latency: 0  # 	1*p34+1*p5
  port_pressure: [[1.0, '34'], [1.0, '5'], [1, '012']]
+- name: subs
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: subs
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: sub
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: immediate
+    imd: int
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
 - name: sub
  operands:
  - class: register
@@ -711,3 +1007,25 @@ instruction_forms:
  throughput: 0.33333333
  latency: 1.0  # 	1*p012
  port_pressure: [[1, '012']]
+- name: sub
+  operands:
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  - class: register
+    prefix: w
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
+- name: sub
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  throughput: 0.33333333
+  latency: 1.0  # 	1*p012
+  port_pressure: [[1, '012']]
--- a/osaca/data/zen1.yml
+++ b/osaca/data/zen1.yml
@@ -1,4 +1,4 @@
-osaca_version: 0.3.2
+osaca_version: 0.3.4
 micro_architecture: AMD Zen (family 17h)
 arch_code: ZEN1
 isa: x86
--- a/osaca/data/zen2.yml
+++ b/osaca/data/zen2.yml
@@ -1,4 +1,4 @@
-osaca_version: 0.3.2
+osaca_version: 0.3.4
 micro_architecture: AMD Zen2
 arch_code: ZEN2
 isa: x86
@@ -725,6 +725,39 @@ instruction_forms:
  throughput: 1.0
  latency: 0  # 	1*p89+1*p10D
  port_pressure: [[1, '89'], [1, [10D]]]
+- name: vmovdqu
+  operands:
+  - class: memory
+    base: gpr
+    offset: "*"
+    index: ~
+    scale: 1
+  - class: register
+    name: "*"
+  throughput: 0.5
+  latency: 4.0  # 	1*p8910+1*p8D9D
+  port_pressure: [[1, ['8','9','10']], [1, [8D,9D]]]
+- name: vmovdqu
+  operands:
+  - class: memory
+    base: gpr
+    offset: "*"
+    index: gpr
+    scale: "*"
+  - class: register
+    name: "*"
+  throughput: 0.5
+  latency: 4.0  # 	1*p8910+1*p8D9D
+  port_pressure: [[1, ['8','9']], [1, [8D,9D]]]
+- name: vmovdqu
+  operands:
+  - class: register
+    name: "*"
+  - class: register
+    name: "*"
+  throughput: 0.0
+  latency: 0.0
+  port_pressure: []
 - name: add
  operands:
  - class: immediate
@@ -1081,6 +1114,16 @@ instruction_forms:
  latency: 3.0  # 	1*p01
  port_pressure: [[1, '01']]
  uops: 1
+- name: [shl, shr]
+  operands:
+  - class: immediate
+    imd: int
+  - class: register
+    name: gpr
+  throughput: 0.25
+  latency: 1.0  # 	1*p4567
+  port_pressure: [[1, '4567']]
+  uops: 1
 - name: UNPCKHPS                        # model_importer.py ./model_importer.py
  operands:                             # model_importer.py ./model_importer.py
  - class: register                     # model_importer.py ./model_importer.py
--- a/osaca/db_interface.py
+++ b/osaca/db_interface.py
@@ -274,10 +274,19 @@ def _create_db_operand_x86(operand):


 def _scrape_from_felixcloutier(mnemonic):
-    """Scrape src/dst information from felixcloutier website and return infromation for user."""
-    from bs4 import BeautifulSoup
+    """Scrape src/dst information from felixcloutier website and return information for user."""
    import requests

+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        print(
+            'Module BeautifulSoup not installed. Fetching instruction form information '
+            'online requires BeautifulSoup.\nUse \'pip install bs4\' for installation.',
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
    index = 'https://www.felixcloutier.com/x86/index.html'
    base_url = 'https://www.felixcloutier.com/x86/'
    url = base_url + mnemonic.lower()
@@ -287,12 +296,15 @@ def _scrape_from_felixcloutier(mnemonic):

    # GET website
    r = requests.get(url=url)
-    # Parse result
-    soup = BeautifulSoup(r.text, 'html.parser')
    if r.status_code == 200:
        # Found result
-        table = soup.find('h2', attrs={'id': 'instruction-operand-encoding'}).findNextSibling()
-        operands = _get_src_dst_from_table(table)
+        operand_enc = BeautifulSoup(r.text, 'html.parser').find(
+            'h2', attrs={'id': 'instruction-operand-encoding'}
+        )
+        if operand_enc:
+            # operand encoding found, otherwise, no need to mark as suspicious
+            table = operand_enc.findNextSibling()
+            operands = _get_src_dst_from_table(table)
    elif r.status_code == 404:
        # Check for alternative href
        index = BeautifulSoup(requests.get(url=index).text, 'html.parser')
@@ -300,12 +312,15 @@ def _scrape_from_felixcloutier(mnemonic):
        if len(alternatives) > 0:
            # alternative(s) found, take first one
            url = base_url + alternatives[0].attrs['href'][2:]
-            table = (
-                BeautifulSoup(requests.get(url=url).text, 'html.parser')
-                .find('h2', attrs={'id': 'instruction-operand-encoding'})
-                .findNextSibling()
+            operand_enc = BeautifulSoup(requests.get(url=url).text, 'html.parser').find(
+                'h2', attrs={'id': 'instruction-operand-encoding'}
            )
-            operands = _get_src_dst_from_table(table)
+            if operand_enc:
+                # operand encoding found, otherwise, no need to mark as suspicious
+                table = (
+                    operand_enc.findNextSibling()
+                )
+                operands = _get_src_dst_from_table(table)
    if operands:
        # Found src/dst assignment for NUM_OPERANDS
        if not any(['r' in x and 'w' in x for x in operands]):
@@ -313,9 +328,8 @@ def _scrape_from_felixcloutier(mnemonic):
    return (suspicious, ' '.join(operands))


-def _get_src_dst_from_table(table):
+def _get_src_dst_from_table(table, num_operands=2):
    """Prettify bs4 table object to string for user"""
-    NUM_OPERANDS = 2
    # Parse table
    header = [''.join(x.string.lower().split()) for x in table.find('tr').findAll('td')]
    data = table.findAll('tr')[1:]
@@ -327,10 +341,10 @@ def _get_src_dst_from_table(table):
                data_dict[i][header[j]] = col.string
    # Get only the instruction forms with 2 operands
    num_ops = [_get_number_of_operands(row) for _, row in data_dict.items()]
-    if NUM_OPERANDS in num_ops:
-        row = data_dict[num_ops.index(NUM_OPERANDS)]
+    if num_operands in num_ops:
+        row = data_dict[num_ops.index(num_operands)]
        reads_writes = []
-        for i in range(1, NUM_OPERANDS + 1):
+        for i in range(1, num_operands + 1):
            m = re.search(r'(\([^\(\)]+\))', row['operand{}'.format(i)])
            if not m:
                # no parentheses (probably immediate operand), assume READ
@@ -369,6 +383,7 @@ def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
    missing_port_pressure = []
    suspicious_instructions = []
    duplicate_instr_arch = []
+    duplicate_strings = []

    for instr_form in arch_mm['instruction_forms']:
        # check value in DB entry
@@ -388,6 +403,7 @@ def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
        # instr forms with less than 3 operands might need an ISA DB entry due to src_reg operands
        if (
            len(instr_form['operands']) < 3
+            and len(instr_form['operands']) > 1
            and 'mov' not in instr_form['name'].lower()
            and not instr_form['name'].lower().startswith('j')
            and instr_form not in suspicious_instructions
@@ -406,9 +422,10 @@ def _check_sanity_arch_db(arch_mm, isa_mm, internet_check=True):
            duplicate_instr_arch.append(instr_form)
    # every entry exists twice --> uniquify
    tmp_list = []
-    for i in range(0, len(duplicate_instr_arch)):
+    for _ in range(0, len(duplicate_instr_arch)):
        tmp = duplicate_instr_arch.pop()
-        if tmp not in duplicate_instr_arch:
+        if _get_full_instruction_name(tmp).lower() not in duplicate_strings:
+            duplicate_strings.append(_get_full_instruction_name(tmp).lower())
            tmp_list.append(tmp)
    duplicate_instr_arch = tmp_list
    return (
--- a/osaca/frontend.py
+++ b/osaca/frontend.py
@@ -76,7 +76,7 @@ class Frontend(object):
                self._get_flag_symbols(instruction_form['flags'])
                if instruction_form['instruction'] is not None
                else ' ',
-                instruction_form['line'].strip(),
+                instruction_form['line'].strip().replace('\t', ' '),
            )
            line = line if show_lineno else col_sep + col_sep.join(line.split(col_sep)[1:])
            if show_cmnts is False and self._is_comment(instruction_form):
@@ -138,13 +138,13 @@ class Frontend(object):
                separator,
                sum([instr_form['latency_lcd'] for instr_form in dep_dict[dep]['dependencies']]),
                separator,
-                dep_dict[dep]['root']['line'],
+                dep_dict[dep]['root']['line'].strip(),
                separator,
                [node['line_number'] for node in dep_dict[dep]['dependencies']],
            )
        return s

-    def full_analysis(self, kernel, kernel_dg: KernelDG, ignore_unknown=False, verbose=False):
+    def full_analysis(self, kernel, kernel_dg: KernelDG, ignore_unknown=False, arch_warning=False, length_warning=False, verbose=False):
        """
        Build the full analysis report including header, the symbol map, the combined TP/CP/LCD
        view and the list based LCD view.
@@ -156,11 +156,16 @@ class Frontend(object):
        :param ignore_unknown: flag for ignore warning if performance data is missing, defaults to
            `False`
        :type ignore_unknown: boolean, optional
+        :param arch_warning: flag for additional user warning to specify micro-arch
+        :type arch_warning: boolean, optional
+        :param length_warning: flag for additional user warning to specify kernel length with --lines
+        :type length_warning: boolean, optional
        :param verbose: flag for verbosity level, defaults to False
        :type verbose: boolean, optional
        """
        return (
            self._header_report()
+            + self._user_warnings(arch_warning, length_warning)
            + self._symbol_map()
            + self.combined_view(
                kernel,
@@ -246,7 +251,7 @@ class Frontend(object):
                self._get_flag_symbols(instruction_form['flags'])
                if instruction_form['instruction'] is not None
                else ' ',
-                instruction_form['line'].strip(),
+                instruction_form['line'].strip().replace('\t', ' '),
            )
        s += '\n'
        # check for unknown instructions and throw warning if called without --ignore-unknown
@@ -279,12 +284,33 @@ class Frontend(object):
            '------------------\n'
            '                     No final analysis is given. If you want to ignore this\n'
            '                     warning and run the analysis anyway, start osaca with\n'
-            '                                       --ignore_unknown flag.\n'
+            '                                       --ignore-unknown flag.\n'
            '--------------------------------------------------------------------------------'
            '----------------{}\n'
        ).format(amount, '-' * len(str(amount)))
        return s

+    def _user_warnings(self, arch_warning, length_warning):
+        """Return the user warnings selected by the two flags, followed by a newline."""
+        # printed when the analysis fell back to a default micro-architecture
+        arch_text = (
+            'WARNING: No micro-architecture was specified and a default uarch was used.\n'
+            '         Specify the uarch with --arch. See --help for more information.\n'
+        )
+        # printed for large kernels; --lines is the option that restricts the analysis
+        length_text = (
+            'WARNING: You are analyzing a large amount of instruction forms. Analysis '
+            'across loops/block boundaries often do not make much sense.\n'
+            '         Specify the kernel length with --lines. See --help for more '
+            'information.\n'
+            '         If this is intentional, you can safely ignore this message.\n'
+        )
+
+        # the trailing newline is appended even when no warning is selected
+        selected = [arch_text if arch_warning else '', length_text if length_warning else '']
+        return ''.join(selected) + '\n'
+
+
    def _get_separator_list(self, separator, separator_2=' '):
        """Creates column view for seperators in the TP/combined view."""
        separator_list = []
@@ -319,7 +345,12 @@ class Frontend(object):
                continue
            left_len = len(str(float(ports[i])).split('.')[0])
            substr = '{:' + str(left_len) + '.' + str(max(port_len[i] - left_len - 1, 0)) + 'f}'
-            string_result += substr.format(ports[i]) + ' {} '.format(separator[i])
+            substr = substr.format(ports[i])
+            string_result += (
+                substr + ' {} '.format(separator[i])
+                if '.' in substr
+                else '{:.1f}{} '.format(ports[i], separator[i])
+            )
        return string_result[:-1]

    def _get_node_by_lineno(self, lineno, kernel):
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -5,19 +5,33 @@ import io
 import os
 import re
 import sys
+import traceback

 from osaca.db_interface import import_benchmark_output, sanity_check
 from osaca.frontend import Frontend
-from osaca.parser import BaseParser, ParserAArch64v81, ParserX86ATT
+from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
 from osaca.semantics import (INSTR_FLAGS, ArchSemantics, KernelDG,
                             MachineModel, reduce_to_section)

-MODULE_DATA_DIR = os.path.join(
-    os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
-)
-LOCAL_OSACA_DIR = os.path.join(os.path.expanduser('~') + '/.osaca/')
-DATA_DIR = os.path.join(LOCAL_OSACA_DIR, 'data/')
-SUPPORTED_ARCHS = ['SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'CSX', 'ZEN1', 'ZEN2', 'TX2']
+
+SUPPORTED_ARCHS = [
+    'SNB',
+    'IVB',
+    'HSW',
+    'BDW',
+    'SKX',
+    'CSX',
+    'ICL',
+    'ZEN1',
+    'ZEN2',
+    'TX2',
+    'N1',
+    'A64FX',
+]
+DEFAULT_ARCHS = {
+    'aarch64': 'A64FX',
+    'x86': 'SKX',
+}


 # Stolen from pip
@@ -71,7 +85,8 @@ def create_parser(parser=None):
    parser.add_argument(
        '--arch',
        type=str,
-        help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ZEN1, ZEN2, TX2).',
+        help='Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, '
+        'A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.',
    )
    parser.add_argument(
        '--fixed',
@@ -79,6 +94,13 @@ def create_parser(parser=None):
        help='Run the throughput analysis with fixed probabilities for all suitable ports per '
        'instruction. Otherwise, OSACA will print the optimal port utilization for the kernel.',
    )
+    parser.add_argument(
+        '--lines',
+        type=str,
+        help='Define lines that should be included in the analysis. This option overwrites any'
+        ' range defined by markers in the assembly. Add either single lines or ranges defined by'
+        ' "-" or ":", each entry separated by commas, e.g.: --lines 1,2,8-18,20:24',
+    )
    parser.add_argument(
        '--db-check',
        dest='check_db',
@@ -128,6 +150,12 @@ def create_parser(parser=None):
    parser.add_argument(
        '--verbose', '-v', action='count', default=0, help='Increases verbosity level.'
    )
+    parser.add_argument(
+        '--out', '-o',
+        default=sys.stdout,
+        type=argparse.FileType('w'),
+        help='Write analysis to this file (default to stdout).'
+    )
    parser.add_argument(
        'file', type=argparse.FileType('r'), help='Path to object (ASM or instruction file).'
    )
@@ -144,7 +172,12 @@ def check_arguments(args, parser):
    """
    supported_import_files = ['ibench', 'asmbench']

-    if 'arch' in args and (args.arch is None or args.arch.upper() not in SUPPORTED_ARCHS):
+    if args.arch is None and (args.check_db or 'import_data' in args):
+        parser.error(
+            'DB check and data import cannot work with a default microarchitecture. '
+            'Please see --help for all valid architecture codes.'
+        )
+    elif args.arch is not None and args.arch.upper() not in SUPPORTED_ARCHS:
        parser.error(
            'Microarchitecture not supported. Please see --help for all valid architecture codes.'
        )
@@ -188,9 +221,9 @@ def insert_byte_marker(args):
        from kerncraft.incore_model import asm_instrumentation
    except ImportError:
        print(
-            "Module kerncraft not installed. Use 'pip install --user "
-            "kerncraft' for installation.\nFor more information see "
-            "https://github.com/RRZE-HPC/kerncraft",
+            'Module kerncraft not installed. Use \'pip install --user '
+            'kerncraft\' for installation.\nFor more information see '
+            'https://github.com/RRZE-HPC/kerncraft',
            file=sys.stderr,
        )
        sys.exit(1)
@@ -221,19 +254,41 @@ def inspect(args, output_file=sys.stdout):
    :param output_file: Define the stream for output, defaults to :class:`sys.stdout`
    :type output_file: stream, optional
    """
-    arch = args.arch
+    # Read file
+    code = args.file.read()
+
+    # Detect ISA if necessary
+    arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
+    print_arch_warning = False if args.arch else True
    isa = MachineModel.get_isa_for_arch(arch)
    verbose = args.verbose
    ignore_unknown = args.ignore_unknown

-    # Read file
-    code = args.file.read()
    # Parse file
    parser = get_asm_parser(arch)
-    parsed_code = parser.parse_file(code)
+    try:
+        parsed_code = parser.parse_file(code)
+    except:
+        # probably the wrong parser based on heuristic
+        if args.arch is None:
+            # change ISA and try again
+            arch = DEFAULT_ARCHS['x86'] if BaseParser.detect_ISA(code) == 'aarch64' else DEFAULT_ARCHS['aarch64']
+            isa = MachineModel.get_isa_for_arch(arch)
+            parser = get_asm_parser(arch)
+            parsed_code = parser.parse_file(code)
+        else:
+            traceback.print_exc(file=sys.stderr)
+            sys.exit(1)

-    # Reduce to marked kernel and add semantics
-    kernel = reduce_to_section(parsed_code, isa)
+    # Reduce to marked kernel or chosen section and add semantics
+    if args.lines:
+        line_range = get_line_range(args.lines)
+        kernel = [line for line in parsed_code if line['line_number'] in line_range]
+        print_length_warning = False
+    else:
+        kernel = reduce_to_section(parsed_code, isa)
+        # Print warning if kernel has no markers and is larger than threshold (100)
+        print_length_warning = True if len(kernel) == len(parsed_code) and len(kernel) > 100 else False
    machine_model = MachineModel(arch=arch)
    semantics = ArchSemantics(machine_model)
    semantics.add_semantics(kernel)
@@ -249,7 +304,12 @@ def inspect(args, output_file=sys.stdout):
    frontend = Frontend(args.file.name, arch=arch)
    print(
        frontend.full_analysis(
-            kernel, kernel_graph, ignore_unknown=ignore_unknown, verbose=verbose
+            kernel,
+            kernel_graph,
+            ignore_unknown=ignore_unknown,
+            arch_warning=print_arch_warning,
+            length_warning=print_length_warning,
+            verbose=verbose
        ),
        file=output_file,
    )
@@ -292,7 +352,7 @@ def get_asm_parser(arch) -> BaseParser:
    if isa == 'x86':
        return ParserX86ATT()
    elif isa == 'aarch64':
-        return ParserAArch64v81()
+        return ParserAArch64()


 def get_unmatched_instruction_ratio(kernel):
@@ -306,13 +366,26 @@ def get_unmatched_instruction_ratio(kernel):
            unmatched_counter += 1
    return unmatched_counter / len(kernel)

+def get_line_range(line_str):
+    """Expand a ``--lines`` value such as ``1,2,8-18,20:24`` into a list of line numbers."""
+    lines_int = []
+    # ':' and '-' both denote an inclusive range, so normalise to '-' before splitting
+    for entry in line_str.replace(':', '-').split(','):
+        if not entry.strip():  # tolerate empty entries, e.g. a trailing comma
+            continue
+        if '-' in entry:
+            start, end = entry.split('-')[:2]
+            lines_int += range(int(start), int(end) + 1)
+        else:
+            lines_int.append(int(entry))
+    return lines_int

 def main():
    """Initialize and run command line interface."""
    parser = create_parser()
    args = parser.parse_args()
    check_arguments(args, parser)
-    run(args)
+    run(args, output_file=args.out)


 if __name__ == '__main__':
--- a/osaca/parser/__init__.py
+++ b/osaca/parser/__init__.py
@@ -6,14 +6,14 @@ Only the parser below will be exported, so please add new parsers to __all__.
 from .attr_dict import AttrDict
 from .base_parser import BaseParser
 from .parser_x86att import ParserX86ATT
-from .parser_AArch64v81 import ParserAArch64v81
+from .parser_AArch64 import ParserAArch64

-__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64v81', 'get_parser']
+__all__ = ['AttrDict', 'BaseParser', 'ParserX86ATT', 'ParserAArch64', 'get_parser']

 def get_parser(isa):
    if isa.lower() == 'x86':
        return ParserX86ATT()
    elif isa.lower() == 'aarch64':
-        return ParserAArch64v81()
+        return ParserAArch64()
    else:
        raise ValueError("Unknown ISA {!r}.".format(isa))
--- a/osaca/parser/base_parser.py
+++ b/osaca/parser/base_parser.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 """Parser superclass of specific parsers."""
-
+import operator
+import re

 class BaseParser(object):
    # Identifiers for operand types
@@ -8,14 +9,35 @@ class BaseParser(object):
    DIRECTIVE_ID = 'directive'
    IMMEDIATE_ID = 'immediate'
    LABEL_ID = 'label'
+    IDENTIFIER_ID = 'identifier'
    MEMORY_ID = 'memory'
    REGISTER_ID = 'register'
    SEGMENT_EXT_ID = 'segment_extension'
    INSTRUCTION_ID = 'instruction'
    OPERANDS_ID = 'operands'
+    _parser_constructed = False

    def __init__(self):
-        self.construct_parser()
+        if not self._parser_constructed:
+            self.construct_parser()
+            self._parser_constructed = True
+
+    @staticmethod
+    def detect_ISA(file_content):
+        """Guess the ISA code ('x86' or 'aarch64') of assembly text from its register names."""
+        # x86 AT&T: %[xyz]mm<n> vector registers and the %[er]{a,b,c,d}x GPRs. The GPR names
+        # end right after the 'x', so anchor with \b instead of demanding a trailing digit.
+        heuristics_x86ATT = [r'%[xyz]mm[0-9]', r'%[er][abcd]x\b']
+        # AArch64: v/z vector registers (lane count optional, e.g. SVE "z0.d") and x/w GPRs;
+        # the leading \b keeps hex immediates such as "0x10" from counting as register x1.
+        heuristics_aarch64 = [r'\b[vz][0-9]{1,2}\.[0-9]{0,2}[bhsd]', r'\b[wx][0-9]']
+        matches = {
+            'x86': sum(len(re.findall(h, file_content)) for h in heuristics_x86ATT),
+            'aarch64': sum(len(re.findall(h, file_content)) for h in heuristics_aarch64),
+        }
+
+        # the ISA with the most hits wins; max() keeps the first maximum it sees on a tie
+        return max(matches.items(), key=operator.itemgetter(1))[0]

    def parse_file(self, file_content, start_line=0):
        """
--- a/osaca/parser/parser_AArch64v81.py
+++ b/osaca/parser/parser_AArch64v81.py
@@ -6,7 +6,15 @@ import pyparsing as pp
 from osaca.parser import AttrDict, BaseParser


-class ParserAArch64v81(BaseParser):
+class ParserAArch64(BaseParser):
+    _instance = None
+
+    # Singleton pattern, as this is created very many times
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ParserAArch64, cls).__new__(cls)
+        return cls._instance
+
    def __init__(self):
        super().__init__()
        self.isa = 'aarch64'
@@ -19,22 +27,23 @@ class ParserAArch64v81(BaseParser):
            pp.ZeroOrMore(pp.Word(pp.printables))
        ).setResultsName(self.COMMENT_ID)
        # Define ARM assembly identifier
+        decimal_number = pp.Combine(
+            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
+        ).setResultsName('value')
+        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
        relocation = pp.Combine(pp.Literal(':') + pp.Word(pp.alphanums + '_') + pp.Literal(':'))
        first = pp.Word(pp.alphas + '_.', exact=1)
        rest = pp.Word(pp.alphanums + '_.')
        identifier = pp.Group(
            pp.Optional(relocation).setResultsName('relocation')
            + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
-        ).setResultsName('identifier')
+            + pp.Optional(pp.Suppress(pp.Literal('+')) + (hex_number | decimal_number).setResultsName('offset'))
+        ).setResultsName(self.IDENTIFIER_ID)
        # Label
        self.label = pp.Group(
            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
        ).setResultsName(self.LABEL_ID)
        # Directive
-        decimal_number = pp.Combine(
-            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
-        ).setResultsName('value')
-        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
        directive_option = pp.Combine(
            pp.Word(pp.alphas + '#@.%', exact=1)
            + pp.Optional(pp.Word(pp.printables + ' ', excludeChars=','))
@@ -46,7 +55,7 @@ class ParserAArch64v81(BaseParser):
        self.directive = pp.Group(
            pp.Literal('.')
            + pp.Word(pp.alphanums + '_').setResultsName('name')
-            + commaSeparatedList.setResultsName('parameters')
+            + (pp.OneOrMore(directive_parameter) ^ commaSeparatedList).setResultsName('parameters')
            + pp.Optional(self.comment)
        ).setResultsName(self.DIRECTIVE_ID)
        # LLVM-MCA markers
@@ -91,31 +100,49 @@ class ParserAArch64v81(BaseParser):
            ^ pp.CaselessLiteral('ror')
            ^ pp.CaselessLiteral('sxtw')
            ^ pp.CaselessLiteral('uxtw')
+            ^ pp.CaselessLiteral('mul vl')
        )
        arith_immediate = pp.Group(
            immediate.setResultsName('base_immediate')
            + pp.Suppress(pp.Literal(','))
            + shift_op.setResultsName('shift_op')
-            + immediate.setResultsName('shift')
+            + pp.Optional(immediate).setResultsName('shift')
        ).setResultsName(self.IMMEDIATE_ID)
        # Register:
-        # scalar: [XWBHSDQ][0-9]{1,2}  |   vector: V[0-9]{1,2}\.[12468]{1,2}[BHSD]()?
-        # define SP and ZR register aliases as regex, due to pyparsing does not support
+        # scalar: [XWBHSDQ][0-9]{1,2}  |   vector: [VZ][0-9]{1,2}(\.[12468]{1,2}[BHSD])?
+        #  | predicate: P[0-9]{1,2}(/[ZM])?
+        # ignore vector len control ZCR_EL[123] for now
+        # define SP, ZR register aliases as regex, due to pyparsing does not support
        # proper lookahead
        alias_r31_sp = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))')
        alias_r31_zr = pp.Regex('(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))')
-        scalar = pp.Word(pp.alphas, exact=1).setResultsName('prefix') + pp.Word(
+        scalar = pp.Word('xwbhsdqXWBHSDQ', exact=1).setResultsName('prefix') + pp.Word(
            pp.nums
        ).setResultsName('name')
        index = pp.Literal('[') + pp.Word(pp.nums).setResultsName('index') + pp.Literal(']')
        vector = (
-            pp.CaselessLiteral('v').setResultsName('prefix')
+            pp.oneOf('v z', caseless=True).setResultsName('prefix')
            + pp.Word(pp.nums).setResultsName('name')
            + pp.Literal('.')
            + pp.Optional(pp.Word('12468')).setResultsName('lanes')
            + pp.Word(pp.alphas, exact=1).setResultsName('shape')
            + pp.Optional(index)
        )
+        predicate = (
+            pp.CaselessLiteral('p').setResultsName('prefix')
+            + pp.Word(pp.nums).setResultsName('name')
+            + pp.Optional(
+                (
+                    pp.Suppress(pp.Literal('/'))
+                    + pp.oneOf('z m', caseless=True).setResultsName('predication')
+                )
+                | (
+                    pp.Literal('.')
+                    + pp.Optional(pp.Word('12468')).setResultsName('lanes')
+                    + pp.Word(pp.alphas, exact=1).setResultsName('shape')
+                )
+            )
+        )
        self.list_element = vector ^ scalar
        register_list = (
            pp.Literal('{')
@@ -129,7 +156,8 @@ class ParserAArch64v81(BaseParser):
            + pp.Optional(index)
        )
        register = pp.Group(
-            (alias_r31_sp | alias_r31_zr | vector | scalar | register_list)
+            (alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
+            #(alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
            + pp.Optional(
                pp.Suppress(pp.Literal(','))
                + shift_op.setResultsName('shift_op')
@@ -144,7 +172,7 @@ class ParserAArch64v81(BaseParser):
            pp.Literal('[')
            + pp.Optional(register.setResultsName('base'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
-            + pp.Optional(register_index ^ immediate.setResultsName('offset'))
+            + pp.Optional(register_index ^ (immediate ^ arith_immediate).setResultsName('offset'))
            + pp.Literal(']')
            + pp.Optional(
                pp.Literal('!').setResultsName('pre_indexed')
@@ -177,6 +205,11 @@ class ParserAArch64v81(BaseParser):
            + pp.Optional(self.comment)
        )

+        # for testing
+        self.predicate = predicate
+        self.vector = vector
+        self.register = register
+
    def parse_line(self, line, line_number=None):
        """
        Parse line and return instruction form.
@@ -193,7 +226,7 @@ class ParserAArch64v81(BaseParser):
                self.DIRECTIVE_ID: None,
                self.COMMENT_ID: None,
                self.LABEL_ID: None,
-                'line': line.strip(),
+                'line': line,
                'line_number': line_number,
            }
        )
@@ -317,14 +350,18 @@ class ParserAArch64v81(BaseParser):
            return self.process_immediate(operand[self.IMMEDIATE_ID])
        if self.LABEL_ID in operand:
            return self.process_label(operand[self.LABEL_ID])
+        if self.IDENTIFIER_ID in operand:
+            return self.process_identifier(operand[self.IDENTIFIER_ID])
        return operand

    def process_memory_address(self, memory_address):
        """Post-process memory address operand"""
        # Remove unnecessarily created dictionary entries during parsing
-        offset = None if 'offset' not in memory_address else memory_address['offset']
-        base = None if 'base' not in memory_address else memory_address['base']
-        index = None if 'index' not in memory_address else memory_address['index']
+        offset = memory_address.get('offset', None)
+        if isinstance(offset, list) and len(offset) == 1:
+            offset = offset[0]
+        base = memory_address.get('base', None)
+        index = memory_address.get('index', None)
        scale = 1
        if base is not None and 'name' in base and base['name'] == 'sp':
            base['prefix'] = 'x'
@@ -351,18 +388,20 @@ class ParserAArch64v81(BaseParser):
    def process_register_list(self, register_list):
        """Post-process register lists (e.g., {r0,r3,r5}) and register ranges (e.g., {r0-r7})"""
        # Remove unnecessarily created dictionary entries during parsing
-        vlist = []
+        rlist = []
        dict_name = ''
        if 'list' in register_list:
            dict_name = 'list'
        if 'range' in register_list:
            dict_name = 'range'
-        for v in register_list[dict_name]:
-            vlist.append(
-                AttrDict.convert_dict(self.list_element.parseString(v, parseAll=True).asDict())
+        for r in register_list[dict_name]:
+            rlist.append(
+                AttrDict.convert_dict(self.list_element.parseString(r, parseAll=True).asDict())
            )
-        index = None if 'index' not in register_list else register_list['index']
-        new_dict = AttrDict({dict_name: vlist, 'index': index})
+        index = register_list.get('index', None)
+        new_dict = AttrDict({dict_name: rlist, 'index': index})
+        if len(new_dict[dict_name]) == 1:
+            return AttrDict({self.REGISTER_ID: new_dict[dict_name][0]})
        return AttrDict({self.REGISTER_ID: new_dict})

    def process_immediate(self, immediate):
@@ -375,7 +414,9 @@ class ParserAArch64v81(BaseParser):
            # normal integer value, nothing to do
            return AttrDict({self.IMMEDIATE_ID: immediate})
        if 'base_immediate' in immediate:
-            # arithmetic immediate, nothing to do
+            # arithmetic immediate, add calculated value as value
+            immediate['shift'] = immediate['shift'][0]
+            immediate['value'] = int(immediate['base_immediate']['value']) << int(immediate['shift']['value'])
            return AttrDict({self.IMMEDIATE_ID: immediate})
        if 'float' in immediate:
            dict_name = 'float'
@@ -396,6 +437,13 @@ class ParserAArch64v81(BaseParser):
        label['name'] = label['name']['name']
        return AttrDict({self.LABEL_ID: label})

+    def process_identifier(self, identifier):
+        """Wrap a parsed identifier operand, dropping a leftover 'value' entry."""
+        # a symbol+offset operand can carry a 'value' key next to its name;
+        # remove it in place if present, otherwise leave the operand untouched
+        identifier.pop('value', None)
+        return AttrDict({self.IDENTIFIER_ID: identifier})
+
    def get_full_reg_name(self, register):
        """Return one register name string including all attributes"""
        if 'lanes' in register:
@@ -440,7 +488,7 @@ class ParserAArch64v81(BaseParser):

    def is_vector_register(self, register):
        """Check if register is a vector register"""
-        if register['prefix'] in 'bhsdqv':
+        if register['prefix'] in 'bhsdqvz':
            return True
        return False

@@ -455,7 +503,7 @@ class ParserAArch64v81(BaseParser):
    def is_reg_dependend_of(self, reg_a, reg_b):
        """Check if ``reg_a`` is dependent on ``reg_b``"""
        prefixes_gpr = 'wx'
-        prefixes_vec = 'bhsdqv'
+        prefixes_vec = 'bhsdqvz'
        if reg_a['name'] == reg_b['name']:
            if reg_a['prefix'].lower() in prefixes_gpr and reg_b['prefix'].lower() in prefixes_gpr:
                return True
--- a/osaca/parser/parser_x86att.py
+++ b/osaca/parser/parser_x86att.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3

 import string
+import re

 import pyparsing as pp

@@ -8,6 +9,14 @@ from osaca.parser import AttrDict, BaseParser


 class ParserX86ATT(BaseParser):
+    _instance = None
+
+    # Singleton pattern, as this is created very many times
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ParserX86ATT, cls).__new__(cls)
+        return cls._instance
+
    def __init__(self):
        super().__init__()
        self.isa = 'x86'
@@ -33,8 +42,20 @@ class ParserX86ATT(BaseParser):
            + pp.Optional(relocation).setResultsName('relocation')
        ).setResultsName('identifier')
        # Label
+        rest = pp.Word(pp.alphanums + '$_.+-()')
+        label_identifier = pp.Group(
+            pp.Optional(id_offset).setResultsName('offset')
+            + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
+            + pp.Optional(relocation).setResultsName('relocation')
+        ).setResultsName('identifier')
+        numeric_identifier = pp.Group(
+            pp.Word(pp.nums).setResultsName('name')
+            + pp.Optional(pp.oneOf('b f', caseless=True).setResultsName('suffix'))
+        ).setResultsName('identifier')
        self.label = pp.Group(
-            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
+            (label_identifier | numeric_identifier).setResultsName('name')
+            + pp.Literal(':')
+            + pp.Optional(self.comment)
        ).setResultsName(self.LABEL_ID)
        # Register: pp.Regex('^%[0-9a-zA-Z]+{}{z},?')
        self.register = pp.Group(
@@ -43,7 +64,7 @@ class ParserX86ATT(BaseParser):
            + pp.Optional(pp.Literal('(') + pp.Word(pp.nums) + pp.Literal(')'))
            + pp.Optional(
                pp.Literal('{')
-                + pp.Literal('%')
+                + pp.Optional(pp.Suppress(pp.Literal('%')))
                + pp.Word(pp.alphanums).setResultsName('mask')
                + pp.Literal('}')
                + pp.Optional(
@@ -98,7 +119,7 @@ class ParserX86ATT(BaseParser):
                + pp.Literal(')')
                + pp.Optional(
                    pp.Literal('{')
-                    + pp.Literal('%')
+                    + pp.Optional(pp.Suppress(pp.Literal('%')))
                    + pp.Word(pp.alphanums).setResultsName('mask')
                    + pp.Literal('}')
                )
@@ -108,23 +129,20 @@ class ParserX86ATT(BaseParser):
        ).setResultsName(self.MEMORY_ID)

        # Directive
-        directive_option = pp.Combine(
-            pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
-        )
+        # parameter can be any quoted string or sequence of characters besides '#' (for comments)
+        # or ',' (parameter delimiter)
        directive_parameter = (
            pp.quotedString
-            ^ directive_option
-            ^ identifier
-            ^ hex_number
-            ^ decimal_number
-            ^ self.register
-            ^ pp.Group(pp.Word(pp.alphanums + '_').setResultsName('name'))
+            ^ (
+                pp.Word(pp.printables, excludeChars=',#')
+                + pp.Optional(pp.Suppress(pp.Literal(',')))
+            )
+            ^ pp.Suppress(pp.Literal(','))
        )
-        commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
        self.directive = pp.Group(
            pp.Literal('.')
            + pp.Word(pp.alphanums + '_').setResultsName('name')
-            + commaSeparatedList.setResultsName('parameters')
+            + pp.ZeroOrMore(directive_parameter).setResultsName('parameters')
            + pp.Optional(self.comment)
        ).setResultsName(self.DIRECTIVE_ID)

@@ -134,7 +152,9 @@ class ParserX86ATT(BaseParser):
            pp.alphanums
        ).setResultsName('mnemonic')
        # Combine to instruction form
-        operand_first = pp.Group(self.register ^ immediate ^ memory ^ identifier)
+        operand_first = pp.Group(
+            self.register ^ immediate ^ memory ^ identifier ^ numeric_identifier
+        )
        operand_rest = pp.Group(self.register ^ immediate ^ memory)
        self.instruction_parser = (
            mnemonic
@@ -173,7 +193,7 @@ class ParserX86ATT(BaseParser):
                self.DIRECTIVE_ID: None,
                self.COMMENT_ID: None,
                self.LABEL_ID: None,
-                'line': line.strip(),
+                'line': line,
                'line_number': line_number,
            }
        )
@@ -277,14 +297,24 @@ class ParserX86ATT(BaseParser):
            return self.process_immediate(operand[self.IMMEDIATE_ID])
        if self.LABEL_ID in operand:
            return self.process_label(operand[self.LABEL_ID])
+        if self.DIRECTIVE_ID in operand:
+            return self.process_directive(operand[self.DIRECTIVE_ID])
        return operand

+    def process_directive(self, directive):
+        directive_new = {'name': directive['name'], 'parameters': []}
+        if 'parameters' in directive:
+            directive_new['parameters'] = directive['parameters']
+        if 'comment' in directive:
+            directive_new['comment'] = directive['comment']
+        return AttrDict({self.DIRECTIVE_ID: directive_new})
+
    def process_memory_address(self, memory_address):
        """Post-process memory address operand"""
        # Remove unecessarily created dictionary entries during memory address parsing
-        offset = None if 'offset' not in memory_address else memory_address['offset']
-        base = None if 'base' not in memory_address else memory_address['base']
-        index = None if 'index' not in memory_address else memory_address['index']
+        offset = memory_address.get('offset', None)
+        base = memory_address.get('base', None)
+        index = memory_address.get('index', None)
        scale = 1 if 'scale' not in memory_address else int(memory_address['scale'])
        if isinstance(offset, str) and base is None and index is None:
            offset = {'value': offset}
@@ -297,7 +327,7 @@ class ParserX86ATT(BaseParser):
    def process_label(self, label):
        """Post-process label asm line"""
        # remove duplicated 'name' level due to identifier
-        label['name'] = label['name']['name']
+        label['name'] = label['name'][0]['name']
        return AttrDict({self.LABEL_ID: label})

    def process_immediate(self, immediate):
@@ -333,45 +363,44 @@ class ParserX86ATT(BaseParser):

    def is_reg_dependend_of(self, reg_a, reg_b):
        """Check if ``reg_a`` is dependent on ``reg_b``"""
+        # Normalize name
+        reg_a_name = reg_a['name'].upper()
+        reg_b_name = reg_b['name'].upper()
+
        # Check if they are the same registers
-        if reg_a.name == reg_b.name:
+        if reg_a_name == reg_b_name:
            return True
        # Check vector registers first
        if self.is_vector_register(reg_a):
            if self.is_vector_register(reg_b):
-                if reg_a.name[1:] == reg_b.name[1:]:
+                if reg_a_name[1:] == reg_b_name[1:]:
                    # Registers in the same vector space
                    return True
            return False
        # Check basic GPRs
-        a_dep = ['RAX', 'EAX', 'AX', 'AH', 'AL']
-        b_dep = ['RBX', 'EBX', 'BX', 'BH', 'BL']
-        c_dep = ['RCX', 'ECX', 'CX', 'CH', 'CL']
-        d_dep = ['RDX', 'EDX', 'DX', 'DH', 'DL']
-        sp_dep = ['RSP', 'ESP', 'SP', 'SPL']
-        src_dep = ['RSI', 'ESI', 'SI', 'SIL']
-        dst_dep = ['RDI', 'EDI', 'DI', 'DIL']
-        basic_gprs = [a_dep, b_dep, c_dep, d_dep, sp_dep, src_dep, dst_dep]
+        gpr_groups = {
+            'A': ['RAX', 'EAX', 'AX', 'AH', 'AL'],
+            'B': ['RBX', 'EBX', 'BX', 'BH', 'BL'],
+            'C': ['RCX', 'ECX', 'CX', 'CH', 'CL'],
+            'D': ['RDX', 'EDX', 'DX', 'DH', 'DL'],
+            'SP': ['RSP', 'ESP', 'SP', 'SPL'],
+            'SRC': ['RSI', 'ESI', 'SI', 'SIL'],
+            'DST': ['RDI', 'EDI', 'DI', 'DIL']
+        }
        if self.is_basic_gpr(reg_a):
            if self.is_basic_gpr(reg_b):
-                for dep_group in basic_gprs:
-                    if reg_a['name'].upper() in dep_group:
-                        if reg_b['name'].upper() in dep_group:
+                for dep_group in gpr_groups.values():
+                    if reg_a_name in dep_group:
+                        if reg_b_name in dep_group:
                            return True
            return False
+
        # Check other GPRs
-        gpr_parser = (
-            pp.CaselessLiteral('R')
-            + pp.Word(pp.nums).setResultsName('id')
-            + pp.Optional(pp.Word('dwbDWB', exact=1))
-        )
-        try:
-            id_a = gpr_parser.parseString(reg_a['name'], parseAll=True).asDict()['id']
-            id_b = gpr_parser.parseString(reg_b['name'], parseAll=True).asDict()['id']
-            if id_a == id_b:
-                return True
-        except pp.ParseException:
-            return False
+        ma = re.match(r'R([0-9]+)[DWB]?', reg_a_name)
+        mb = re.match(r'R([0-9]+)[DWB]?', reg_b_name)
+        if ma and mb and ma.group(1) == mb.group(1):
+            return True
+
        # No dependencies
        return False

@@ -385,19 +414,11 @@ class ParserX86ATT(BaseParser):
        """Check if register is a general purpose register"""
        if register is None:
            return False
-        gpr_parser = (
-            pp.CaselessLiteral('R')
-            + pp.Word(pp.nums).setResultsName('id')
-            + pp.Optional(pp.Word('dwbDWB', exact=1))
-        )
+
        if self.is_basic_gpr(register):
            return True
-        else:
-            try:
-                gpr_parser.parseString(register['name'], parseAll=True)
-                return True
-            except pp.ParseException:
-                return False
+
+        return re.match(r'R([0-9]+)[DWB]?', register['name'], re.IGNORECASE)

    def is_vector_register(self, register):
        """Check if register is a vector register"""
--- a/osaca/semantics/arch_semantics.py
+++ b/osaca/semantics/arch_semantics.py
@@ -53,9 +53,18 @@ class ArchSemantics(ISASemantics):
                )
                if len(set(port_sums)) > 1:
                    # balance ports
-                    for _ in range(cycles * 100):
-                        instr_ports[port_sums.index(max(port_sums))] -= INC
-                        instr_ports[port_sums.index(min(port_sums))] += INC
+                    # init list for keeping track of the current change
+                    differences = [cycles / len(ports)  for p in ports]
+                    for _ in range(int(cycles * (1 / INC))):
+                        if len(instr_ports) == 1:
+                            # no balancing possible anymore
+                            break
+                        max_port_idx = port_sums.index(max(port_sums))
+                        min_port_idx = port_sums.index(min(port_sums))
+                        instr_ports[max_port_idx] -= INC
+                        instr_ports[min_port_idx] += INC
+                        differences[max_port_idx] -= INC
+                        differences[min_port_idx] += INC
                        # instr_ports = [round(p, 2) for p in instr_ports]
                        self._itemsetter(*indices)(instruction_form['port_pressure'], *instr_ports)
                        # check if min port is zero
@@ -63,7 +72,12 @@ class ArchSemantics(ISASemantics):
                            # if port_pressure is not exactly 0.00, add the residual to
                            # the former port
                            if min(instr_ports) != 0.0:
-                                instr_ports[port_sums.index(min(port_sums))] += min(instr_ports)
+                                min_port_idx = port_sums.index(min(port_sums))
+                                instr_ports[min_port_idx] += min(instr_ports)
+                                differences[min_port_idx] += min(instr_ports)
+                                # we don't need to decrease difference for other port, just
+                                # delete it
+                                del differences[instr_ports.index(min(instr_ports))]
                                self._itemsetter(*indices)(
                                    instruction_form['port_pressure'], *instr_ports
                                )
@@ -80,6 +94,17 @@ class ArchSemantics(ISASemantics):
                            instr_ports = self._to_list(
                                itemgetter(*indices)(instruction_form['port_pressure'])
                            )
+                        # never remove more than the fixed utilization per uop and port, i.e., 
+                        # cycles/len(ports)
+                        if round(min(differences), 2) <= 0:
+                            # don't worry if port_pressure isn't exactly 0 and just
+                            # remove from further balancing by deleting index since
+                            # pressure is not 0
+                            del indices[differences.index(min(differences))]
+                            instr_ports = self._to_list(
+                                 itemgetter(*indices)(instruction_form['port_pressure'])
+                            )
+                            del differences[differences.index(min(differences))]
                        port_sums = self._to_list(
                            itemgetter(*indices)(self.get_throughput_sum(kernel))
                        )
@@ -373,9 +398,7 @@ class ArchSemantics(ISASemantics):

            def g(obj, value):
                obj[item] = value
-
        else:
-
            def g(obj, *values):
                for item, value in zip(items, values):
                    obj[item] = value
@@ -391,9 +414,11 @@ class ArchSemantics(ISASemantics):
    @staticmethod
    def get_throughput_sum(kernel):
        """Get the overall throughput sum separated by port of all instructions of a kernel."""
-        tp_sum = reduce(
-            (lambda x, y: [sum(z) for z in zip(x, y)]),
-            [instr['port_pressure'] for instr in kernel],
-        )
-        tp_sum = [round(x, 2) for x in tp_sum]
+        # ignoring all lines with throughput == 0.0, because there won't be anything to sum up
+        # typically comment, label and non-instruction lines
+        port_pressures = [instr['port_pressure'] for instr in kernel if instr['throughput'] != 0.0]
+        # Essentially summing up each column of port_pressures, where each column is one port
+        # and each row is one line of the kernel
+        # round is necessary to ensure termination of ArchSemantics.assign_optimal_throughput
+        tp_sum = [round(sum(col), 2) for col in zip(*port_pressures)]
        return tp_sum
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -1,12 +1,14 @@
 #!/usr/bin/env python3

-import base64
 import os
 import pickle
 import re
 import string
 from copy import deepcopy
 from itertools import product
+import hashlib
+from pathlib import Path
+from collections import defaultdict

 import ruamel.yaml
 from ruamel.yaml.compat import StringIO
@@ -17,6 +19,7 @@ from osaca.parser import ParserX86ATT

 class MachineModel(object):
    WILDCARD = '*'
+    INTERNAL_VERSION = 1  # increase whenever self._data format changes to invalidate cache!

    def __init__(self, arch=None, path_to_yaml=None, isa=None, lazy=False):
        if not arch and not path_to_yaml:
@@ -39,7 +42,7 @@ class MachineModel(object):
                'load_throughput_default': [],
                'ports': [],
                'port_model_scheme': None,
-                'instruction_forms': [],
+                'instruction_forms': []
            }
        else:
            if arch and path_to_yaml:
@@ -49,7 +52,7 @@ class MachineModel(object):
            yaml = self._create_yaml_object()
            if arch:
                self._arch = arch.lower()
-                self._path = utils.find_file(self._arch + '.yml')
+                self._path = utils.find_datafile(self._arch + '.yml')
            # check if file is cached
            cached = self._get_cached(self._path) if not lazy else False
            if cached:
@@ -59,8 +62,6 @@ class MachineModel(object):
                with open(self._path, 'r') as f:
                    if not lazy:
                        self._data = yaml.load(f)
-                        # cache file for next call
-                        self._write_in_cache(self._path, self._data)
                    else:
                        file_content = ''
                        line = f.readline()
@@ -69,21 +70,26 @@ class MachineModel(object):
                            line = f.readline()
                        self._data = yaml.load(file_content)
                        self._data['instruction_forms'] = []
-            # separate multi-alias instruction forms
-            for entry in [
-                x for x in self._data['instruction_forms'] if isinstance(x['name'], list)
-            ]:
-                for name in entry['name']:
-                    new_entry = {'name': name}
-                    for k in [x for x in entry.keys() if x != 'name']:
-                        new_entry[k] = entry[k]
-                    self._data['instruction_forms'].append(new_entry)
-                # remove old entry
-                self._data['instruction_forms'].remove(entry)
-            # For use with dict instead of list as DB
-            # self._data['instruction_dict'] = (
-            #     self._convert_to_dict(self._data['instruction_forms'])
-            # )
+                # separate multi-alias instruction forms
+                for entry in [x for x in self._data['instruction_forms']
+                              if isinstance(x['name'], list)]:
+                    for name in entry['name']:
+                        new_entry = {'name': name}
+                        for k in [x for x in entry.keys() if x != 'name']:
+                            new_entry[k] = entry[k]
+                        self._data['instruction_forms'].append(new_entry)
+                    # remove old entry
+                    self._data['instruction_forms'].remove(entry)
+                # Normalize instruction_form names (to UPPERCASE) and build dict for faster access:
+                self._data['instruction_forms_dict'] = defaultdict(list)
+                for iform in self._data['instruction_forms']:
+                    iform['name'] = iform['name'].upper()
+                    self._data['instruction_forms_dict'][iform['name']].append(iform)
+                self._data['internal_version'] = self.INTERNAL_VERSION
+
+                if not lazy:
+                    # cache internal representation for future use
+                    self._write_in_cache(self._path)

    def __getitem__(self, key):
        """Return configuration entry."""
@@ -98,36 +104,21 @@ class MachineModel(object):
    def get_instruction(self, name, operands):
        """Find and return instruction data from name and operands."""
        # For use with dict instead of list as DB
-        # return self.get_instruction_from_dict(name, operands)
        if name is None:
            return None
+        name_matched_iforms = self._data['instruction_forms_dict'].get(name.upper(), [])
        try:
            return next(
                instruction_form
-                for instruction_form in self._data['instruction_forms']
-                if instruction_form['name'].upper() == name.upper()
-                and self._match_operands(
+                for instruction_form in name_matched_iforms if self._match_operands(
                    instruction_form['operands'] if 'operands' in instruction_form else [],
-                    operands,
-                )
-            )
+                    operands))
        except StopIteration:
            return None
        except TypeError as e:
            print('\nname: {}\noperands: {}'.format(name, operands))
            raise TypeError from e

-    def get_instruction_from_dict(self, name, operands):
-        """Find and return instruction data from name and operands stored in dictionary."""
-        if name is None:
-            return None
-        try:
-            # Check if key is in dict
-            instruction_form = self._data['instruction_dict'][self._get_key(name, operands)]
-            return instruction_form
-        except KeyError:
-            return None
-
    def average_port_pressure(self, port_pressure):
        """Construct average port pressure list from instruction data."""
        port_list = self._data['ports']
@@ -234,13 +225,15 @@ class MachineModel(object):
                for y in list(filter(lambda x: True if x != 'class' else False, op))
            ]
            operands.append('{}({})'.format(op['class'], ','.join(op_attrs)))
-        return '{}  {}'.format(instruction_form['name'], ','.join(operands))
+        return '{}  {}'.format(instruction_form['name'].lower(), ','.join(operands))

    @staticmethod
    def get_isa_for_arch(arch):
        """Return ISA for given micro-arch ``arch``."""
        arch_dict = {
+            'a64fx': 'aarch64',
            'tx2': 'aarch64',
+            'n1': 'aarch64',
            'zen1': 'x86',
            'zen+': 'x86',
            'zen2': 'x86',
@@ -292,7 +285,8 @@ class MachineModel(object):
            {
                k: v
                for k, v in self._data.items()
-                if k not in ['instruction_forms', 'load_throughput']
+                if k not in ['instruction_forms', 'instruction_forms_dict', 'load_throughput',
+                             'internal_version']
            },
            stream,
        )
@@ -312,35 +306,54 @@ class MachineModel(object):
        :type filepath: str
        :returns: cached DB if existing, `False` otherwise
        """
-        hashname = self._get_hashname(filepath)
-        cachepath = utils.exists_cached_file(hashname + '.pickle')
-        if cachepath:
-            # Check if modification date of DB is older than cached version
-            if os.path.getmtime(filepath) < os.path.getmtime(cachepath):
-                # load cached version
-                cached_db = pickle.load(open(cachepath, 'rb'))
-                return cached_db
-            else:
-                # DB newer than cached version --> delete cached file and return False
-                os.remove(cachepath)
+        p = Path(filepath)
+        hexhash = hashlib.sha256(p.read_bytes()).hexdigest()
+
+        # 1. companion cachefile: same location, with '.<name>_<sha256hash>.pickle'
+        companion_cachefile = p.with_name('.' + p.stem + '_' + hexhash).with_suffix('.pickle')
+        if companion_cachefile.exists():
+            # companion file (must be up-to-date, due to equal hash)
+            with companion_cachefile.open('rb') as f:
+                data = pickle.load(f)
+            if data.get('internal_version') == self.INTERNAL_VERSION:
+                return data
+
+        # 2. home cachefile: ~/.osaca/cache/<name>_<sha256hash>.pickle
+        home_cachefile = (Path(utils.CACHE_DIR) / (p.stem + '_' + hexhash)).with_suffix('.pickle')
+        if home_cachefile.exists():
+            # home file (must be up-to-date, due to equal hash)
+            with home_cachefile.open('rb') as f:
+                data = pickle.load(f)
+            if data.get('internal_version') == self.INTERNAL_VERSION:
+                return data
        return False

-    def _write_in_cache(self, filepath, data):
+    def _write_in_cache(self, filepath):
        """
        Write machine model to cache

        :param filepath: path to store DB
        :type filepath: str
-        :param data: :class:`MachineModel` to store
-        :type data: :class:`dict`
        """
-        hashname = self._get_hashname(filepath)
-        filepath = os.path.join(utils.CACHE_DIR, hashname + '.pickle')
-        pickle.dump(data, open(filepath, 'wb'))
+        p = Path(filepath)
+        hexhash = hashlib.sha256(p.read_bytes()).hexdigest()
+        # 1. companion cachefile: same location, with '.<name>_<sha256hash>.pickle'
+        companion_cachefile = p.with_name('.' + p.stem + '_' + hexhash).with_suffix('.pickle')
+        if os.access(str(companion_cachefile.parent), os.W_OK):
+            with companion_cachefile.open('wb') as f:
+                pickle.dump(self._data, f)
+                return

-    def _get_hashname(self, name):
-        """Returns unique hashname for machine model"""
-        return base64.b64encode(name.encode()).decode()
+        # 2. home cachefile: ~/.osaca/cache/<name>_<sha256hash>.pickle
+        cache_dir = Path(utils.CACHE_DIR)
+        try:
+            os.makedirs(cache_dir, exist_ok=True)
+        except OSError:
+            return
+        home_cachefile = (cache_dir / (p.stem + '_' + hexhash)).with_suffix('.pickle')
+        if os.access(str(home_cachefile.parent), os.W_OK):
+            with home_cachefile.open('wb') as f:
+                pickle.dump(self._data, f)

    def _get_key(self, name, operands):
        """Get unique instruction form key for dict DB."""
@@ -350,18 +363,6 @@ class MachineModel(object):
        key_string += '_'.join([self._get_operand_hash(op) for op in operands])
        return key_string

-    def _convert_to_dict(self, instruction_forms):
-        """Convert list DB to dict DB"""
-        instruction_dict = {}
-        for instruction_form in instruction_forms:
-            instruction_dict[
-                self._get_key(
-                    instruction_form['name'],
-                    instruction_form['operands'] if 'operands' in instruction_form else None,
-                )
-            ] = instruction_form
-        return instruction_dict
-
    def _get_operand_hash(self, operand):
        """Get unique key for operand for dict DB"""
        operand_string = ''
@@ -396,7 +397,7 @@ class MachineModel(object):
                operand_string += 'p' if operand['post-indexed'] else ''
        return operand_string

-    def _create_db_operand_aarch64(operand):
+    def _create_db_operand_aarch64(self, operand):
        """Create instruction form operand for DB out of operand string."""
        if operand == 'i':
            return {'class': 'immediate', 'imd': 'int'}
@@ -417,7 +418,7 @@ class MachineModel(object):
        else:
            raise ValueError('Parameter {} is not a valid operand code'.format(operand))

-    def _create_db_operand_x86(operand):
+    def _create_db_operand_x86(self, operand):
        """Create instruction form operand for DB out of operand string."""
        if operand == 'r':
            return {'class': 'register', 'name': 'gpr'}
@@ -490,6 +491,7 @@ class MachineModel(object):
        if 'class' in operand:
            # compare two DB entries
            return self._compare_db_entries(i_operand, operand)
+        # TODO support class wildcards
        # register
        if 'register' in operand:
            if i_operand['class'] != 'register':
@@ -501,12 +503,14 @@ class MachineModel(object):
                return False
            return self._is_AArch64_mem_type(i_operand, operand['memory'])
        # immediate
+        # TODO support wildcards
        if 'value' in operand or ('immediate' in operand and 'value' in operand['immediate']):
            return i_operand['class'] == 'immediate' and i_operand['imd'] == 'int'
        if 'float' in operand or ('immediate' in operand and 'float' in operand['immediate']):
            return i_operand['class'] == 'immediate' and i_operand['imd'] == 'float'
        if 'double' in operand or ('immediate' in operand and 'double' in operand['immediate']):
            return i_operand['class'] == 'immediate' and i_operand['imd'] == 'double'
+        # identifier
        if 'identifier' in operand or (
            'immediate' in operand and 'identifier' in operand['immediate']
        ):
@@ -526,7 +530,7 @@ class MachineModel(object):
        if 'register' in operand:
            if i_operand['class'] != 'register':
                return False
-            return self._is_x86_reg_type(i_operand['name'], operand['register'])
+            return self._is_x86_reg_type(i_operand, operand['register'], consider_masking=True)
        # memory
        if 'memory' in operand:
            if i_operand['class'] != 'memory':
@@ -546,7 +550,9 @@ class MachineModel(object):
        )
        for key in operand_attributes:
            try:
-                if operand_1[key] != operand_2[key] and not any([x == self.WILDCARD for x in [operand_1[key], operand_2[key]]]):
+                if operand_1[key] != operand_2[key] and not any(
+                    [x == self.WILDCARD for x in [operand_1[key], operand_2[key]]]
+                ):
                    return False
            except KeyError:
                return False
@@ -573,8 +579,13 @@ class MachineModel(object):
            return False
        return True

-    def _is_x86_reg_type(self, i_reg_name, reg):
+    def _is_x86_reg_type(self, i_reg, reg, consider_masking=False):
        """Check if register type match."""
+        i_reg_name = i_reg['name'] if i_reg and 'name' in i_reg else i_reg
+        if reg is None:
+            if i_reg is None:
+                return True
+            return False
        # check for wildcards
        if i_reg_name == self.WILDCARD or reg['name'] == self.WILDCARD:
            return True
@@ -582,6 +593,33 @@ class MachineModel(object):
        parser_x86 = ParserX86ATT()
        if parser_x86.is_vector_register(reg):
            if reg['name'].rstrip(string.digits).lower() == i_reg_name:
+                # Consider masking and zeroing for AVX512
+                if consider_masking:
+                    mask_ok = zero_ok = True
+                    if 'mask' in reg or 'mask' in i_reg:
+                        # one instruction is missing the masking while the other has it
+                        mask_ok = False
+                        # check for wildcard
+                        if (
+                            (
+                                'mask' in reg
+                                and reg['mask'].rstrip(string.digits).lower() == i_reg.get('mask')
+                            )
+                            or reg.get('mask') == self.WILDCARD
+                            or i_reg.get('mask') == self.WILDCARD
+                        ):
+                            mask_ok = True
+                        if bool('zeroing' in reg) ^ bool('zeroing' in i_reg):
+                            # one instruction is missing zeroing while the other has it
+                            zero_ok = False
+                            # check for wildcard
+                            if (
+                                i_reg.get('zeroing') == self.WILDCARD
+                                or reg.get('zeroing') == self.WILDCARD
+                            ):
+                                zero_ok = True
+                        if not mask_ok or not zero_ok:
+                            return False
                return True
        else:
            if i_reg_name == 'gpr':
--- a/osaca/semantics/isa_semantics.py
+++ b/osaca/semantics/isa_semantics.py
@@ -2,7 +2,7 @@
 from itertools import chain

 from osaca import utils
-from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
+from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT

 from .hw_model import MachineModel

@@ -26,12 +26,12 @@ class ISASemantics(object):

    def __init__(self, isa, path_to_yaml=None):
        self._isa = isa.lower()
-        path = utils.find_file('isa/' + self._isa + '.yml') if not path_to_yaml else path_to_yaml
+        path = path_to_yaml or utils.find_datafile('isa/' + self._isa + '.yml')
        self._isa_model = MachineModel(path_to_yaml=path)
        if self._isa == 'x86':
            self._parser = ParserX86ATT()
        elif self._isa == 'aarch64':
-            self._parser = ParserAArch64v81()
+            self._parser = ParserAArch64()

    def process(self, instruction_forms):
        """Process a list of instruction forms."""
@@ -52,7 +52,6 @@ class ISASemantics(object):
            return
        # check if instruction form is in ISA yaml, otherwise apply standard operand assignment
        # (one dest, others source)
-        # import pdb; pdb.set_trace()
        isa_data = self._isa_model.get_instruction(
            instruction_form['instruction'], instruction_form['operands']
        )
@@ -103,14 +102,14 @@ class ISASemantics(object):
                if ('post_indexed' in operand['memory'] and operand['memory']['post_indexed']) or (
                    'pre_indexed' in operand['memory'] and operand['memory']['pre_indexed']
                ):
-                    op_dict['source'].remove(operand)
-                    op_dict['src_dst'].append(operand)
+                    op_dict['src_dst'].append(AttrDict.convert_dict(
+                        {'register': operand['memory']['base']}))
            for operand in [op for op in op_dict['destination'] if 'memory' in op]:
                if ('post_indexed' in operand['memory'] and operand['memory']['post_indexed']) or (
                    'pre_indexed' in operand['memory'] and operand['memory']['pre_indexed']
                ):
-                    op_dict['destination'].remove(operand)
-                    op_dict['src_dst'].append(operand)
+                    op_dict['src_dst'].append(AttrDict.convert_dict(
+                        {'register': operand['memory']['base']}))
        # store operand list in dict and reassign operand key/value pair
        instruction_form['semantic_operands'] = AttrDict.convert_dict(op_dict)
        # assign LD/ST flags
--- a/osaca/semantics/marker_utils.py
+++ b/osaca/semantics/marker_utils.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 from collections import OrderedDict

-from osaca.parser import ParserAArch64v81, ParserX86ATT, get_parser
+from osaca.parser import ParserAArch64, ParserX86ATT, get_parser

 COMMENT_MARKER = {'start': 'OSACA-BEGIN', 'end': 'OSACA-END'}

@@ -22,9 +22,9 @@ def reduce_to_section(kernel, isa):
    else:
        raise ValueError('ISA not supported.')
    if start == -1:
-        raise LookupError('Could not find START MARKER. Make sure it is inserted!')
+        start = 0
    if end == -1:
-        raise LookupError('Could not find END MARKER. Make sure it is inserted!')
+        end = len(kernel)
    return kernel[start:end]


@@ -38,7 +38,7 @@ def find_marked_kernel_AArch64(lines):
    nop_bytes = ['213', '3', '32', '31']
    return find_marked_section(
        lines,
-        ParserAArch64v81(),
+        ParserAArch64(),
        ['mov'],
        'x1',
        [111, 222],
@@ -277,6 +277,11 @@ def find_basic_loop_bodies(lines):
            current_block.append(line)
            # Find end of block by searching for references to valid jump labels
            if line['instruction'] and line['operands']:
+                # Ignore `b.none` instructions (relevant for ARM SVE code)
+                # This branch instruction is often present _within_ inner loop blocks, but usually
+                # does not terminate them
+                if line['instruction'] == 'b.none':
+                    continue
                for operand in [o for o in line['operands'] if 'identifier' in o]:
                    if operand['identifier']['name'] in valid_jump_labels:
                        if operand['identifier']['name'] == label:
--- a/osaca/utils.py
+++ b/osaca/utils.py
@@ -1,28 +1,14 @@
 #!/usr/bin/env python3
 import os.path

+DATA_DIRS = [os.path.expanduser('~/.osaca/data'), os.path.join(os.path.dirname(__file__), 'data')]
 CACHE_DIR = os.path.expanduser('~/.osaca/cache')


-def find_file(name):
+def find_datafile(name):
    """Check for existence of name in user or package data folders and return path."""
-    search_paths = [os.path.expanduser('~/.osaca/data'),
-                    os.path.join(os.path.dirname(__file__), 'data')]
-    for dir in search_paths:
+    for dir in DATA_DIRS:
        path = os.path.join(dir, name)
        if os.path.exists(path):
            return path
-    raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, search_paths))
-
-
-def exists_cached_file(name):
-    """Check for existence of file in cache dir. Returns path if it exists and False otherwise."""
-    if not os.path.exists(CACHE_DIR):
-        os.makedirs(CACHE_DIR)
-        return False
-    search_paths = [CACHE_DIR]
-    for dir in search_paths:
-        path = os.path.join(dir, name)
-        if os.path.exists(path):
-            return path
-    return False
+    raise FileNotFoundError("Could not find {!r} in {!r}.".format(name, DATA_DIRS))
--- a/setup.py
+++ b/setup.py
@@ -2,11 +2,14 @@

 # Always prefer setuptools over distutils
 from setuptools import setup, find_packages
+from setuptools.command.install import install as _install
+from setuptools.command.sdist import sdist as _sdist
 # To use a consistent encoding
 from codecs import open
 import os
 import io
 import re
+import sys

 here = os.path.abspath(os.path.dirname(__file__))

@@ -27,6 +30,27 @@ def find_version(*file_paths):
    raise RuntimeError("Unable to find version string.")


+def _run_build_cache(dir):
+    from subprocess import check_call
+    # This is run inside the install staging directory (that had no .pyc files)
+    # We don't want to generate any.
+    # https://github.com/eliben/pycparser/pull/135
+    check_call([sys.executable, '-B', '_build_cache.py'],
+               cwd=os.path.join(dir, 'osaca', 'data'))
+
+
+class install(_install):
+    def run(self):
+        _install.run(self)
+        self.execute(_run_build_cache, (self.install_lib,), msg="Build ISA and architecture cache")
+
+
+class sdist(_sdist):
+    def make_release_tree(self, basedir, files):
+        _sdist.make_release_tree(self, basedir, files)
+        self.execute(_run_build_cache, (basedir,), msg="Build ISA and architecture cache")
+
+
 # Get the long description from the README file
 with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()
@@ -59,7 +83,7 @@ setup(
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
-        'Development Status :: 3 - Alpha',
+        'Development Status :: 4 - Beta',

        # Indicate who your project is intended for
        'Intended Audience :: Developers',
@@ -76,6 +100,9 @@ setup(
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
    ],

    # What doesd your project relate to?
@@ -91,8 +118,8 @@ setup(
    # https://packaging.python.org/en/latest/requirements.html
    install_requires=[
        'networkx',
-        'pyparsing',
-        'ruamel.yaml',
+        'pyparsing>=2.3.1',
+        'ruamel.yaml>=0.15.71',
    ],
    python_requires='>=3.5',

@@ -124,4 +151,7 @@ setup(
            'osaca=osaca.osaca:main',
        ],
    },
+
+    # Overwriting install and sdist to enforce cache distribution with package
+    cmdclass={'install': install, 'sdist': sdist},
 )
--- a/tests/all_tests.py
+++ b/tests/all_tests.py
@@ -8,7 +8,7 @@ suite = unittest.TestLoader().loadTestsFromNames(
    [
        'test_base_parser',
        'test_parser_x86att',
-        'test_parser_AArch64v81',
+        'test_parser_AArch64',
        'test_marker_utils',
        'test_semantics',
        'test_frontend',
--- a/tests/test_base_parser.py
+++ b/tests/test_base_parser.py
@@ -18,6 +18,12 @@ class TestBaseParser(unittest.TestCase):
            pass
        with open(self._find_file('triad_x86_iaca.s')) as f:
            self.triad_code = f.read()
+        with open(self._find_file('triad_arm_iaca.s')) as f:
+            self.triad_code_arm = f.read()
+        with open(self._find_file('kernel_x86.s')) as f:
+            self.x86_code = f.read()
+        with open(self._find_file('kernel_aarch64.s')) as f:
+            self.aarch64_code = f.read()

    ##################
    # Test
@@ -59,6 +65,12 @@ class TestBaseParser(unittest.TestCase):
        with self.assertRaises(NotImplementedError):
            self.parser.normalize_imd(imd_hex_1)

+    def test_detect_ISA(self):
+        self.assertEqual(BaseParser.detect_ISA(self.triad_code), 'x86')
+        self.assertEqual(BaseParser.detect_ISA(self.triad_code_arm), 'aarch64')
+        self.assertEqual(BaseParser.detect_ISA(self.x86_code), 'x86')
+        self.assertEqual(BaseParser.detect_ISA(self.aarch64_code), 'aarch64')
+
    ##################
    # Helper functions
    ##################
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -11,7 +11,7 @@ from shutil import copyfile
 from unittest.mock import patch

 import osaca.osaca as osaca
-from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.parser import ParserAArch64, ParserX86ATT
 from osaca.semantics import MachineModel


@@ -71,7 +71,7 @@ class TestCLI(unittest.TestCase):

    def test_get_parser(self):
        self.assertTrue(isinstance(osaca.get_asm_parser('csx'), ParserX86ATT))
-        self.assertTrue(isinstance(osaca.get_asm_parser('tx2'), ParserAArch64v81))
+        self.assertTrue(isinstance(osaca.get_asm_parser('tx2'), ParserAArch64))
        with self.assertRaises(ValueError):
            osaca.get_asm_parser('UNKNOWN')

@@ -153,6 +153,64 @@ class TestCLI(unittest.TestCase):
                output = StringIO()
                osaca.run(args, output_file=output)

+    def test_without_arch(self):
+        # Run test kernels without --arch flag
+        parser = osaca.create_parser()
+        # x86
+        kernel_x86 = 'kernel_x86.s'
+        args = parser.parse_args([self._find_test_file(kernel_x86)])
+        output = StringIO()
+        osaca.run(args, output_file=output)
+        # AArch64
+        kernel_aarch64 = 'kernel_aarch64.s'
+        args = parser.parse_args([self._find_test_file(kernel_aarch64)])
+        osaca.run(args, output_file=output)
+    
+    def test_user_warnings(self):
+        parser = osaca.create_parser()
+        kernel = 'triad_x86_unmarked.s'
+        args = parser.parse_args(
+            ['--arch', 'csx', '--ignore-unknown', self._find_test_file(kernel)]
+        )
+        output = StringIO()
+        osaca.run(args, output_file=output)
+        # WARNING for length
+        self.assertTrue(output.getvalue().count('WARNING') == 1)
+        args = parser.parse_args(
+            ['--lines', '100-199', '--ignore-unknown', self._find_test_file(kernel)]
+        )
+        output = StringIO()
+        osaca.run(args, output_file=output)
+        # WARNING for arch
+        self.assertTrue(output.getvalue().count('WARNING') == 1)
+
+
+    def test_lines_arg(self):
+        # Run tests with --lines option
+        parser = osaca.create_parser()
+        kernel_x86 = 'triad_x86_iaca.s'
+        args_base = parser.parse_args(
+            ['--arch', 'csx', self._find_test_file(kernel_x86)]
+        )
+        output_base = StringIO()
+        osaca.run(args_base, output_file=output_base)
+        output_base = output_base.getvalue().split('\n')[8:]
+        args = []
+        args.append(parser.parse_args(
+            ['--lines', '146-154', '--arch', 'csx', self._find_test_file(kernel_x86)]
+        ))
+        args.append(parser.parse_args(
+            ['--lines', '146:154', '--arch', 'csx', self._find_test_file(kernel_x86)]
+        ))
+        args.append(parser.parse_args(
+            ['--lines', '146,147:148,149-154', '--arch', 'csx', self._find_test_file(kernel_x86)]
+        ))
+        for a in args:
+            with self.subTest(params=a):
+                output = StringIO()
+                osaca.run(a, output_file=output)
+                self.assertEqual(output.getvalue().split('\n')[8:], output_base)
+
    ##################
    # Helper functions
    ##################
--- a/tests/test_db_interface.py
+++ b/tests/test_db_interface.py
@@ -124,6 +124,18 @@ class TestDBInterface(unittest.TestCase):
        with self.assertRaises(AssertionError):
            dbi.import_benchmark_output('csx', 'ibench', 'invalid_file')

+    def test_online_scraping(self):
+        # addpd -- suspicious instruction, normal URL
+        instr_1 = ['addpd', (True, '(r) (r,w)')]
+        self.assertEqual(dbi._scrape_from_felixcloutier(instr_1[0]), instr_1[1])
+        # movapd -- not suspicious instruction
+        instr_2 = ['movapd', (False, '(r) (w)')]
+        self.assertEqual(dbi._scrape_from_felixcloutier(instr_2[0]), instr_2[1])
+        # vfmadd132pd -- only in combined view with 213/231.
+        # No 2-operand version, therefore, empty string
+        instr_3 = ['vfmadd132pd', (True, '')]
+        self.assertEqual(dbi._scrape_from_felixcloutier(instr_3[0]), instr_3[1])
+
    ##################
    # Helper functions
    ##################
--- a/tests/test_files/triad_x86_unmarked.s
+++ b/tests/test_files/triad_x86_unmarked.s
@@ -0,0 +1,345 @@
+	.file	"triad.c"
+	.section	.rodata.str1.8,"aMS",@progbits,1
+	.align 8
+.LC9:
+	.string	"%12.1f | %9.8f | %9.3f | %7.1f | %7.1f | %7d | %4d \n"
+	.text
+	.p2align 4,,15
+	.globl	triad
+	.type	triad, @function
+triad:
+.LFB24:
+	.cfi_startproc
+	pushq	%r13
+	.cfi_def_cfa_offset 16
+	.cfi_offset 13, -16
+	movslq	%edi, %rax
+	movl	$64, %edi
+	leaq	16(%rsp), %r13
+	.cfi_def_cfa 13, 0
+	andq	$-32, %rsp
+	pushq	-8(%r13)
+	pushq	%rbp
+	.cfi_escape 0x10,0x6,0x2,0x76,0
+	movq	%rsp, %rbp
+	pushq	%r15
+	.cfi_escape 0x10,0xf,0x2,0x76,0x78
+	leaq	0(,%rax,8), %r15
+	pushq	%r14
+	movq	%r15, %rsi
+	pushq	%r13
+	.cfi_escape 0xf,0x3,0x76,0x68,0x6
+	.cfi_escape 0x10,0xe,0x2,0x76,0x70
+	pushq	%r12
+	pushq	%rbx
+	.cfi_escape 0x10,0xc,0x2,0x76,0x60
+	.cfi_escape 0x10,0x3,0x2,0x76,0x58
+	movq	%rax, %rbx
+	subq	$72, %rsp
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r14
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r12
+	call	aligned_alloc
+	movq	%r15, %rsi
+	movl	$64, %edi
+	movq	%rax, %r13
+	call	aligned_alloc
+	movq	%rax, %r15
+	leal	-1(%rbx), %eax
+	movl	%eax, -96(%rbp)
+	testl	%ebx, %ebx
+	jle	.L2
+	cmpl	$2, %eax
+	jbe	.L14
+	movl	%ebx, %esi
+	vmovapd	.LC0(%rip), %ymm0
+	xorl	%eax, %eax
+	xorl	%ecx, %ecx
+	shrl	$2, %esi
+	.p2align 4,,10
+	.p2align 3
+.L4:
+	addl	$1, %ecx
+	vmovapd	%ymm0, (%r15,%rax)
+	vmovapd	%ymm0, 0(%r13,%rax)
+	vmovapd	%ymm0, (%r12,%rax)
+	vmovapd	%ymm0, (%r14,%rax)
+	addq	$32, %rax
+	cmpl	%ecx, %esi
+	ja	.L4
+	movl	%ebx, %eax
+	andl	$-4, %eax
+	cmpl	%eax, %ebx
+	je	.L26
+	vzeroupper
+.L3:
+	vmovsd	.LC1(%rip), %xmm0
+	movslq	%eax, %rcx
+	vmovsd	%xmm0, (%r15,%rcx,8)
+	vmovsd	%xmm0, 0(%r13,%rcx,8)
+	vmovsd	%xmm0, (%r12,%rcx,8)
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	leal	1(%rax), %ecx
+	cmpl	%ecx, %ebx
+	jle	.L2
+	movslq	%ecx, %rcx
+	addl	$2, %eax
+	vmovsd	%xmm0, (%r15,%rcx,8)
+	vmovsd	%xmm0, 0(%r13,%rcx,8)
+	vmovsd	%xmm0, (%r12,%rcx,8)
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	cmpl	%eax, %ebx
+	jle	.L2
+	cltq
+	vmovsd	%xmm0, (%r15,%rax,8)
+	vmovsd	%xmm0, 0(%r13,%rax,8)
+	vmovsd	%xmm0, (%r12,%rax,8)
+	vmovsd	%xmm0, (%r14,%rax,8)
+.L2:
+	movl	%ebx, %eax
+	movl	$1, -84(%rbp)
+	movl	%ebx, %r10d
+	andl	$-4, %eax
+	shrl	$2, %r10d
+	movl	%eax, -100(%rbp)
+	.p2align 4,,10
+	.p2align 3
+.L13:
+	leaq	-56(%rbp), %rsi
+	leaq	-72(%rbp), %rdi
+	movl	%r10d, -88(%rbp)
+	call	timing
+	movl	-88(%rbp), %r10d
+	xorl	%r11d, %r11d
+	.p2align 4,,10
+	.p2align 3
+.L12:
+	vmovsd	(%r14), %xmm0
+	vxorpd	%xmm7, %xmm7, %xmm7
+	vucomisd	%xmm7, %xmm0
+	jbe	.L6
+	movq	%r14, %rdi
+	movl	%r11d, -92(%rbp)
+	movl	%r10d, -88(%rbp)
+	vzeroupper
+	call	dummy
+	movl	-92(%rbp), %r11d
+	movl	-88(%rbp), %r10d
+.L6:
+	testl	%ebx, %ebx
+	jle	.L8
+	cmpl	$2, -96(%rbp)
+	jbe	.L15
+	xorl	%eax, %eax
+	xorl	%ecx, %ecx
+	.p2align 4,,10
+	.p2align 3
+.L10:
+	vmovapd	(%r15,%rax), %ymm0
+	vmovapd	(%r12,%rax), %ymm3
+	addl	$1, %ecx
+	vfmadd132pd	0(%r13,%rax), %ymm3, %ymm0
+	vmovapd	%ymm0, (%r14,%rax)
+	addq	$32, %rax
+	cmpl	%ecx, %r10d
+	ja	.L10
+	movl	-100(%rbp), %eax
+	cmpl	%ebx, %eax
+	je	.L8
+.L9:
+	movslq	%eax, %rcx
+	vmovsd	0(%r13,%rcx,8), %xmm0
+	vmovsd	(%r12,%rcx,8), %xmm5
+	vfmadd132sd	(%r15,%rcx,8), %xmm5, %xmm0
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	leal	1(%rax), %ecx
+	cmpl	%ebx, %ecx
+	jge	.L8
+	movslq	%ecx, %rcx
+	addl	$2, %eax
+	vmovsd	0(%r13,%rcx,8), %xmm0
+	vmovsd	(%r12,%rcx,8), %xmm6
+	vfmadd132sd	(%r15,%rcx,8), %xmm6, %xmm0
+	vmovsd	%xmm0, (%r14,%rcx,8)
+	cmpl	%eax, %ebx
+	jle	.L8
+	cltq
+	vmovsd	(%r15,%rax,8), %xmm0
+	vmovsd	(%r12,%rax,8), %xmm4
+	vfmadd132sd	0(%r13,%rax,8), %xmm4, %xmm0
+	vmovsd	%xmm0, (%r14,%rax,8)
+.L8:
+	addl	$1, %r11d
+	cmpl	-84(%rbp), %r11d
+	jne	.L12
+	leaq	-56(%rbp), %rsi
+	leaq	-64(%rbp), %rdi
+	movl	%r11d, -84(%rbp)
+	movl	%r10d, -88(%rbp)
+	vzeroupper
+	call	timing
+	vmovsd	-64(%rbp), %xmm1
+	vsubsd	-72(%rbp), %xmm1, %xmm1
+	vmovsd	.LC3(%rip), %xmm2
+	movl	-84(%rbp), %r11d
+	movl	-88(%rbp), %r10d
+	vucomisd	%xmm1, %xmm2
+	leal	(%r11,%r11), %eax
+	movl	%eax, -84(%rbp)
+	ja	.L13
+	movl	%eax, %esi
+	vxorpd	%xmm6, %xmm6, %xmm6
+	vxorpd	%xmm0, %xmm0, %xmm0
+	movl	%ebx, %edx
+	sarl	%esi
+	vcvtsi2sd	%ebx, %xmm0, %xmm0
+	movl	$.LC9, %edi
+	movl	$5, %eax
+	vcvtsi2sd	%esi, %xmm6, %xmm6
+	vmulsd	.LC5(%rip), %xmm6, %xmm2
+	vmovsd	.LC4(%rip), %xmm5
+	vmovsd	.LC6(%rip), %xmm7
+	vmulsd	%xmm0, %xmm6, %xmm4
+	vmulsd	%xmm0, %xmm2, %xmm2
+	vdivsd	%xmm1, %xmm4, %xmm4
+	vdivsd	%xmm1, %xmm2, %xmm2
+	vdivsd	%xmm5, %xmm4, %xmm4
+	vmulsd	%xmm7, %xmm2, %xmm3
+	vaddsd	%xmm0, %xmm0, %xmm2
+	vmulsd	.LC8(%rip), %xmm0, %xmm0
+	vmulsd	%xmm6, %xmm2, %xmm2
+	vmulsd	.LC7(%rip), %xmm2, %xmm2
+	vmulsd	%xmm7, %xmm3, %xmm3
+	vdivsd	%xmm5, %xmm0, %xmm0
+	vdivsd	%xmm5, %xmm4, %xmm4
+	vdivsd	%xmm1, %xmm2, %xmm2
+	call	printf
+	movq	%r14, %rdi
+	call	free
+	movq	%r12, %rdi
+	call	free
+	movq	%r13, %rdi
+	call	free
+	addq	$72, %rsp
+	movq	%r15, %rdi
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	.cfi_remember_state
+	.cfi_def_cfa 13, 0
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	leaq	-16(%r13), %rsp
+	.cfi_def_cfa 7, 16
+	popq	%r13
+	.cfi_def_cfa_offset 8
+	jmp	free
+	.p2align 4,,10
+	.p2align 3
+.L15:
+	.cfi_restore_state
+	xorl	%eax, %eax
+	jmp	.L9
+.L26:
+	vzeroupper
+	jmp	.L2
+.L14:
+	xorl	%eax, %eax
+	jmp	.L3
+	.cfi_endproc
+.LFE24:
+	.size	triad, .-triad
+	.section	.rodata.str1.8
+	.align 8
+.LC10:
+	.string	"TRIAD a[i] = b[i]+c[i]*d[i], 32 byte/it, 2 Flop/it"
+	.align 8
+.LC11:
+	.string	"Size (KByte) |   runtime  |  MFlop/s  |  MB/s   |  MLUP/s | repeat | size"
+	.section	.text.startup,"ax",@progbits
+	.p2align 4,,15
+	.globl	main
+	.type	main, @function
+main:
+.LFB25:
+	.cfi_startproc
+	pushq	%rbx
+	.cfi_def_cfa_offset 16
+	.cfi_offset 3, -16
+	movl	$.LC10, %edi
+	movl	$20, %ebx
+	call	puts
+	movl	$.LC11, %edi
+	call	puts
+	.p2align 4,,10
+	.p2align 3
+.L28:
+	vxorpd	%xmm1, %xmm1, %xmm1
+	movq	.LC12(%rip), %rax
+	vcvtsi2sd	%ebx, %xmm1, %xmm1
+	addl	$1, %ebx
+	vmovq	%rax, %xmm0
+	call	pow
+	vcvttsd2si	%xmm0, %edi
+	call	triad
+	cmpl	$36, %ebx
+	jne	.L28
+	xorl	%eax, %eax
+	popq	%rbx
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+.LFE25:
+	.size	main, .-main
+	.section	.rodata.cst32,"aM",@progbits,32
+	.align 32
+.LC0:
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.long	1907715710
+	.long	1048610426
+	.section	.rodata.cst8,"aM",@progbits,8
+	.align 8
+.LC1:
+	.long	1907715710
+	.long	1048610426
+	.align 8
+.LC3:
+	.long	2576980378
+	.long	1070176665
+	.align 8
+.LC4:
+	.long	0
+	.long	1083129856
+	.align 8
+.LC5:
+	.long	0
+	.long	1077936128
+	.align 8
+.LC6:
+	.long	0
+	.long	1062207488
+	.align 8
+.LC7:
+	.long	2696277389
+	.long	1051772663
+	.align 8
+.LC8:
+	.long	0
+	.long	1075838976
+	.align 8
+.LC12:
+	.long	3435973837
+	.long	1073007820
+	.ident	"GCC: (GNU) 7.2.0"
+	.section	.note.GNU-stack,"",@progbits
--- a/tests/test_frontend.py
+++ b/tests/test_frontend.py
@@ -7,7 +7,7 @@ import os
 import unittest

 from osaca.frontend import Frontend
-from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.parser import ParserAArch64, ParserX86ATT
 from osaca.semantics import ArchSemantics, KernelDG, MachineModel


@@ -20,7 +20,7 @@ class TestFrontend(unittest.TestCase):
    def setUpClass(self):
        # set up parser and kernels
        self.parser_x86 = ParserX86ATT()
-        self.parser_AArch64 = ParserAArch64v81()
+        self.parser_AArch64 = ParserAArch64()
        with open(self._find_file('kernel_x86.s')) as f:
            code_x86 = f.read()
        with open(self._find_file('kernel_aarch64.s')) as f:
@@ -33,7 +33,7 @@ class TestFrontend(unittest.TestCase):
            path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
        )
        self.machine_model_tx2 = MachineModel(
-            path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml')
+            arch='tx2'
        )
        self.semantics_csx = ArchSemantics(
            self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/x86.yml')
--- a/tests/test_kerncraftAPI.py
+++ b/tests/test_kerncraftAPI.py
@@ -9,7 +9,7 @@ import unittest
 from collections import OrderedDict

 from osaca.api import KerncraftAPI
-from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.parser import ParserAArch64, ParserX86ATT


 class TestKerncraftAPI(unittest.TestCase):
@@ -17,7 +17,7 @@ class TestKerncraftAPI(unittest.TestCase):
    def setUpClass(self):
        # set up parser and kernels
        self.parser_x86 = ParserX86ATT()
-        self.parser_AArch64 = ParserAArch64v81()
+        self.parser_AArch64 = ParserAArch64()
        with open(self._find_file('triad_x86_iaca.s')) as f:
            self.code_x86 = f.read()
        with open(self._find_file('triad_arm_iaca.s')) as f:
@@ -63,7 +63,7 @@ class TestKerncraftAPI(unittest.TestCase):
                ('0DV', 0.0),
                ('1', 34.0),
                ('1DV', 0.0),
-                ('2', 2.0),
+                ('2', 3.0),
                ('3', 64.0),
                ('4', 64.0),
                ('5', 32.0),
--- a/tests/test_marker_utils.py
+++ b/tests/test_marker_utils.py
@@ -8,13 +8,13 @@ from collections import OrderedDict

 from osaca.semantics import reduce_to_section, find_basic_blocks, find_jump_labels, \
    find_basic_loop_bodies
-from osaca.parser import ParserAArch64v81, ParserX86ATT
+from osaca.parser import ParserAArch64, ParserX86ATT


 class TestMarkerUtils(unittest.TestCase):
    @classmethod
    def setUpClass(self):
-        self.parser_AArch = ParserAArch64v81()
+        self.parser_AArch = ParserAArch64()
        self.parser_x86 = ParserX86ATT()
        with open(self._find_file('triad_arm_iaca.s')) as f:
            triad_code_arm = f.read()
@@ -178,120 +178,115 @@ class TestMarkerUtils(unittest.TestCase):

    def test_marker_special_cases_AArch(self):
        bytes_line = '.byte     213,3,32,31\n'
-        mov_start = 'mov      x1, #111\n'
-        mov_end = 'mov      x1, #222\n'
-        prologue = 'dup v0.2d, x14\n' + '    neg x9, x9\n' + '    .p2align    6\n'
+        start_marker = 'mov      x1, #111\n' + bytes_line
+        end_marker = 'mov      x1, #222\n' + bytes_line
+        prologue = (
+            'dup v0.2d, x14\n'
+            'neg x9, x9\n'
+            '.p2align    6\n')
        kernel = (
            '.LBB0_28:\n'
            + 'fmul    v7.2d, v7.2d, v19.2d\n'
            + 'stp q0, q1, [x10, #-32]\n'
-            + 'b.ne    .LBB0_28\n'
-        )
-        epilogue = '.LBB0_29:   //   Parent Loop BB0_20 Depth=1\n' + 'bl    dummy\n'
-        kernel_length = len(list(filter(None, kernel.split('\n'))))
+            + 'b.ne    .LBB0_28\n')
+        epilogue = (
+            '.LBB0_29:   //   Parent Loop BB0_20 Depth=1\n'
+            'bl    dummy\n')

-        # marker directly at the beginning
-        code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
-        beginning_parsed = self.parser_AArch.parse_file(code_beginning)
-        test_kernel = reduce_to_section(beginning_parsed, 'AArch64')
-        self.assertEqual(len(test_kernel), kernel_length)
-        kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
-        parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
-        self.assertEqual(test_kernel, parsed_kernel)
+        samples = [
+            # (test name,
+            #  ignored prologue, section to be extracted, ignored epilogue)
+            ("markers",
+             prologue + start_marker, kernel, end_marker + epilogue),
+            ("marker at file start",
+             start_marker, kernel, end_marker + epilogue),
+            ("no start marker",
+             '', prologue + kernel, end_marker + epilogue),
+            ("marker at file end",
+             prologue + start_marker, kernel, end_marker),
+            ("no end marker",
+             prologue + start_marker, kernel + epilogue, ''),
+            ("empty kernel",
+             prologue + start_marker, '', end_marker + epilogue),
+        ]

-        # marker at the end
-        code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
-        end_parsed = self.parser_AArch.parse_file(code_end)
-        test_kernel = reduce_to_section(end_parsed, 'AArch64')
-        self.assertEqual(len(test_kernel), kernel_length)
-        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
-        parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
-        self.assertEqual(test_kernel, parsed_kernel)
-
-        # no kernel
-        code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
-        empty_parsed = self.parser_AArch.parse_file(code_empty)
-        test_kernel = reduce_to_section(empty_parsed, 'AArch64')
-        self.assertEqual(len(test_kernel), 0)
-        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
-        self.assertEqual(test_kernel, [])
-
-        # no start marker
-        code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
-        no_start_parsed = self.parser_AArch.parse_file(code_no_start)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_start_parsed, 'AArch64')
-
-        # no end marker
-        code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
-        no_end_parsed = self.parser_AArch.parse_file(code_no_end)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_end_parsed, 'AArch64')
-
-        # no marker at all
-        code_no_marker = prologue + kernel + epilogue
-        no_marker_parsed = self.parser_AArch.parse_file(code_no_marker)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_marker_parsed, 'AArch64')
+        for test_name, pro, kernel, epi in samples:
+            code = pro + kernel + epi
+            parsed = self.parser_AArch.parse_file(code)
+            test_kernel = reduce_to_section(parsed, 'AArch64')
+            if kernel:
+                kernel_length = len(kernel.strip().split('\n'))
+            else:
+                kernel_length = 0
+            self.assertEqual(
+                len(test_kernel), kernel_length,
+                msg="Invalid exctracted kernel length on {!r} sample".format(test_name))
+            if pro:
+                kernel_start = len((pro).strip().split('\n'))
+            else:
+                kernel_start = 0
+            parsed_kernel = self.parser_AArch.parse_file(kernel, start_line=kernel_start)
+            self.assertEqual(
+                test_kernel, parsed_kernel,
+                msg="Invalid exctracted kernel on {!r}".format(test_name))

    def test_marker_special_cases_x86(self):
-        bytes_line = '.byte     100\n.byte     103\n.byte     144\n'
-        mov_start = 'movl     $111, %ebx\n'
-        mov_end = 'movl     $222, %ebx\n'
-        prologue = 'movl    -88(%rbp), %r10d\n' + 'xorl    %r11d, %r11d\n' + '.p2align 4,,10\n'
+        bytes_line = (
+            '.byte     100\n'
+            '.byte     103\n'
+            '.byte     144\n')
+        start_marker = 'movl     $111, %ebx\n' + bytes_line
+        end_marker = 'movl     $222, %ebx\n' + bytes_line
+        prologue = (
+            'movl    -88(%rbp), %r10d\n'
+            'xorl    %r11d, %r11d\n'
+            '.p2align 4,,10\n')
        kernel = (
            '.L3: #L3\n'
-            + 'vmovsd  .LC1(%rip), %xmm0\n'
-            + 'vmovsd  %xmm0, (%r15,%rcx,8)\n'
-            + 'cmpl    %ecx, %ebx\n'
-            + 'jle .L3\n'
-        )
-        epilogue = 'leaq    -56(%rbp), %rsi\n' + 'movl    %r10d, -88(%rbp)\n' + 'call    timing\n'
-        kernel_length = len(list(filter(None, kernel.split('\n'))))
+            'vmovsd  .LC1(%rip), %xmm0\n'
+            'vmovsd  %xmm0, (%r15,%rcx,8)\n'
+            'cmpl    %ecx, %ebx\n'
+            'jle .L3\n')
+        epilogue = (
+            'leaq    -56(%rbp), %rsi\n'
+            'movl    %r10d, -88(%rbp)\n'
+            'call    timing\n')
+        samples = [
+            # (test name,
+            #  ignored prologue, section to be extracted, ignored epilogue)
+            ("markers",
+             prologue + start_marker, kernel, end_marker + epilogue),
+            ("marker at file start",
+             start_marker, kernel, end_marker + epilogue),
+            ("no start marker",
+             '', prologue + kernel, end_marker + epilogue),
+            ("marker at file end",
+             prologue + start_marker, kernel, end_marker),
+            ("no end marker",
+             prologue + start_marker, kernel + epilogue, ''),
+            ("empty kernel",
+             prologue + start_marker, '', end_marker + epilogue),
+        ]

-        # marker directly at the beginning
-        code_beginning = mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
-        beginning_parsed = self.parser_x86.parse_file(code_beginning)
-        test_kernel = reduce_to_section(beginning_parsed, 'x86')
-        self.assertEqual(len(test_kernel), kernel_length)
-        kernel_start = len(list(filter(None, (mov_start + bytes_line).split('\n'))))
-        parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
-        self.assertEqual(test_kernel, parsed_kernel)
-
-        # marker at the end
-        code_end = prologue + mov_start + bytes_line + kernel + mov_end + bytes_line + epilogue
-        end_parsed = self.parser_x86.parse_file(code_end)
-        test_kernel = reduce_to_section(end_parsed, 'x86')
-        self.assertEqual(len(test_kernel), kernel_length)
-        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
-        parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
-        self.assertEqual(test_kernel, parsed_kernel)
-
-        # no kernel
-        code_empty = prologue + mov_start + bytes_line + mov_end + bytes_line + epilogue
-        empty_parsed = self.parser_x86.parse_file(code_empty)
-        test_kernel = reduce_to_section(empty_parsed, 'x86')
-        self.assertEqual(len(test_kernel), 0)
-        kernel_start = len(list(filter(None, (prologue + mov_start + bytes_line).split('\n'))))
-        self.assertEqual(test_kernel, [])
-
-        # no start marker
-        code_no_start = prologue + bytes_line + kernel + mov_end + bytes_line + epilogue
-        no_start_parsed = self.parser_x86.parse_file(code_no_start)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_start_parsed, 'x86')
-
-        # no end marker
-        code_no_end = prologue + mov_start + bytes_line + kernel + mov_end + epilogue
-        no_end_parsed = self.parser_x86.parse_file(code_no_end)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_end_parsed, 'x86')
-
-        # no marker at all
-        code_no_marker = prologue + kernel + epilogue
-        no_marker_parsed = self.parser_x86.parse_file(code_no_marker)
-        with self.assertRaises(LookupError):
-            reduce_to_section(no_marker_parsed, 'x86')
+        for test_name, pro, kernel, epi in samples:
+            code = pro + kernel + epi
+            parsed = self.parser_x86.parse_file(code)
+            test_kernel = reduce_to_section(parsed, 'x86')
+            if kernel:
+                kernel_length = len(kernel.strip().split('\n'))
+            else:
+                kernel_length = 0
+            self.assertEqual(
+                len(test_kernel), kernel_length,
+                msg="Invalid exctracted kernel length on {!r} sample".format(test_name))
+            if pro:
+                kernel_start = len((pro).strip().split('\n'))
+            else:
+                kernel_start = 0
+            parsed_kernel = self.parser_x86.parse_file(kernel, start_line=kernel_start)
+            self.assertEqual(
+                test_kernel, parsed_kernel,
+                msg="Invalid exctracted kernel on {!r}".format(test_name))

    def test_find_jump_labels(self):
        self.assertEqual(find_jump_labels(self.parsed_x86),
--- a/tests/test_parser_AArch64v81.py
+++ b/tests/test_parser_AArch64v81.py
@@ -8,13 +8,13 @@ import unittest

 from pyparsing import ParseException

-from osaca.parser import AttrDict, ParserAArch64v81
+from osaca.parser import AttrDict, ParserAArch64


-class TestParserAArch64v81(unittest.TestCase):
+class TestParserAArch64(unittest.TestCase):
    @classmethod
    def setUpClass(self):
-        self.parser = ParserAArch64v81()
+        self.parser = ParserAArch64()
        with open(self._find_file('triad_arm_iaca.s')) as f:
            self.triad_code = f.read()

@@ -146,8 +146,8 @@ class TestParserAArch64v81(unittest.TestCase):
    def test_parse_line(self):
        line_comment = '// -- Begin  main'
        line_label = '.LBB0_1:              // =>This Inner Loop Header: Depth=1'
-        line_directive = '\t.cfi_def_cfa w29, -16'
-        line_instruction = '\tldr s0, [x11, w10, sxtw #2]\t\t// = <<2'
+        line_directive = '.cfi_def_cfa w29, -16'
+        line_instruction = 'ldr s0, [x11, w10, sxtw #2]    // = <<2'
        line_prefetch = 'prfm    pldl1keep, [x26, #2048] //HPL'
        line_preindexed = 'stp x29, x30, [sp, #-16]!'
        line_postindexed = 'ldp q2, q3, [x11], #64'
@@ -201,7 +201,7 @@ class TestParserAArch64v81(unittest.TestCase):
            'directive': None,
            'comment': '= <<2',
            'label': None,
-            'line': 'ldr s0, [x11, w10, sxtw #2]\t\t// = <<2',
+            'line': 'ldr s0, [x11, w10, sxtw #2]    // = <<2',
            'line_number': 4,
        }
        instruction_form_5 = {
@@ -309,23 +309,23 @@ class TestParserAArch64v81(unittest.TestCase):
        self.assertEqual(self.parser.normalize_imd(identifier), identifier)

    def test_multiple_regs(self):
-        instr_range = 'PUSH {r5-r7}'
+        instr_range = 'PUSH {x5-x7}'
        reg_range = AttrDict({
            'register': {
                'range': [
-                    {'prefix': 'r', 'name': '5'},
-                    {'prefix': 'r', 'name': '7'}
+                    {'prefix': 'x', 'name': '5'},
+                    {'prefix': 'x', 'name': '7'}
                ],
                'index': None
            }
        })
-        instr_list = 'POP {r5, r7, r9}'
+        instr_list = 'POP {x5, x7, x9}'
        reg_list = AttrDict({
            'register': {
                'list': [
-                    {'prefix': 'r', 'name': '5'},
-                    {'prefix': 'r', 'name': '7'},
-                    {'prefix': 'r', 'name': '9'}
+                    {'prefix': 'x', 'name': '5'},
+                    {'prefix': 'x', 'name': '7'},
+                    {'prefix': 'x', 'name': '9'}
                ],
                'index': None
            }
@@ -411,5 +411,5 @@ class TestParserAArch64v81(unittest.TestCase):


 if __name__ == '__main__':
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64v81)
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestParserAArch64)
    unittest.TextTestRunner(verbosity=2).run(suite)
--- a/tests/test_parser_x86att.py
+++ b/tests/test_parser_x86att.py
@@ -45,20 +45,31 @@ class TestParserX86ATT(unittest.TestCase):
        self.assertEqual(len(self._get_directive(self.parser, '\t.text').parameters), 0)
        self.assertEqual(self._get_directive(self.parser, '\t.align\t16,0x90').name, 'align')
        self.assertEqual(len(self._get_directive(self.parser, '\t.align\t16,0x90').parameters), 2)
+        self.assertEqual(len(self._get_directive(self.parser, '.text').parameters), 0)
+        self.assertEqual(
+            len(self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters), 2
+        )
+        self.assertEqual(
+            self._get_directive(self.parser, '.file\t1 "path/to/file.c"').parameters[1],
+            '"path/to/file.c"',
+        )
        self.assertEqual(
            self._get_directive(self.parser, '\t.set\tL$set$0,LECIE1-LSCIE1').parameters,
-            [{'name': 'L$set$0'}, {'name': 'LECIE1-LSCIE1'}])
+            ['L$set$0', 'LECIE1-LSCIE1'],
+        )
        self.assertEqual(
            self._get_directive(
-                self.parser, 
-                '\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support'
-                ).parameters,
-            [{'name': v} for v in 
-             ['__TEXT', '__eh_frame', 'coalesced', 'no_toc+strip_static_syms+live_support']])
+                self.parser,
+                '\t.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support',
+            ).parameters,
+            ['__TEXT', '__eh_frame', 'coalesced', 'no_toc+strip_static_syms+live_support'],
+        )
        self.assertEqual(
            self._get_directive(
-                self.parser, '\t.section\t__TEXT,__literal16,16byte_literals').parameters,
-            [{'name': v} for v in ['__TEXT', '__literal16', '16byte_literals']])
+                self.parser, '\t.section\t__TEXT,__literal16,16byte_literals'
+            ).parameters,
+            ['__TEXT', '__literal16', '16byte_literals'],
+        )
        self.assertEqual(
            self._get_directive(self.parser, '\t.align\t16,0x90').parameters[1], '0x90'
        )
@@ -145,8 +156,8 @@ class TestParserX86ATT(unittest.TestCase):
    def test_parse_line(self):
        line_comment = '# -- Begin  main'
        line_label = '..B1.7:                         # Preds ..B1.6'
-        line_directive = '\t\t.quad   .2.3_2__kmpc_loc_pack.2 #qed'
-        line_instruction = '\t\tlea       2(%rax,%rax), %ecx #12.9'
+        line_directive = '.quad   .2.3_2__kmpc_loc_pack.2 #qed'
+        line_instruction = 'lea       2(%rax,%rax), %ecx #12.9'

        instruction_form_1 = {
            'instruction': None,
@@ -169,7 +180,7 @@ class TestParserX86ATT(unittest.TestCase):
        instruction_form_3 = {
            'instruction': None,
            'operands': [],
-            'directive': {'name': 'quad', 'parameters': [{'name': '.2.3_2__kmpc_loc_pack.2'}]},
+            'directive': {'name': 'quad', 'parameters': ['.2.3_2__kmpc_loc_pack.2']},
            'comment': 'qed',
            'label': None,
            'line': '.quad   .2.3_2__kmpc_loc_pack.2 #qed',
@@ -186,9 +197,7 @@ class TestParserX86ATT(unittest.TestCase):
                        'scale': 1,
                    }
                },
-                {
-                    'register': {'name': 'ecx'}
-                }
+                {'register': {'name': 'ecx'}},
            ],
            'directive': None,
            'comment': '12.9',
--- a/tests/test_semantics.py
+++ b/tests/test_semantics.py
@@ -11,7 +11,7 @@ from subprocess import call
 import networkx as nx

 from osaca.osaca import get_unmatched_instruction_ratio
-from osaca.parser import AttrDict, ParserAArch64v81, ParserX86ATT
+from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
 from osaca.semantics import (INSTR_FLAGS, ArchSemantics, KernelDG,
                             MachineModel, reduce_to_section)

@@ -20,48 +20,43 @@ class TestSemanticTools(unittest.TestCase):
    MODULE_DATA_DIR = os.path.join(
        os.path.dirname(os.path.split(os.path.abspath(__file__))[0]), 'osaca/data/'
    )
-    USER_DATA_DIR = os.path.join(os.path.expanduser('~'), '.osaca/')

    @classmethod
-    def setUpClass(self):
-        # copy db files in user directory
-        if not os.path.isdir(os.path.join(self.USER_DATA_DIR, 'data')):
-            os.makedirs(os.path.join(self.USER_DATA_DIR, 'data'))
-            call(['cp', '-r', self.MODULE_DATA_DIR, self.USER_DATA_DIR])
+    def setUpClass(cls):
        # set up parser and kernels
-        self.parser_x86 = ParserX86ATT()
-        self.parser_AArch64 = ParserAArch64v81()
-        with open(self._find_file('kernel_x86.s')) as f:
-            self.code_x86 = f.read()
-        with open(self._find_file('kernel_aarch64.s')) as f:
-            self.code_AArch64 = f.read()
-        self.kernel_x86 = reduce_to_section(self.parser_x86.parse_file(self.code_x86), 'x86')
-        self.kernel_AArch64 = reduce_to_section(
-            self.parser_AArch64.parse_file(self.code_AArch64), 'aarch64'
+        cls.parser_x86 = ParserX86ATT()
+        cls.parser_AArch64 = ParserAArch64()
+        with open(cls._find_file('kernel_x86.s')) as f:
+            cls.code_x86 = f.read()
+        with open(cls._find_file('kernel_aarch64.s')) as f:
+            cls.code_AArch64 = f.read()
+        cls.kernel_x86 = reduce_to_section(cls.parser_x86.parse_file(cls.code_x86), 'x86')
+        cls.kernel_AArch64 = reduce_to_section(
+            cls.parser_AArch64.parse_file(cls.code_AArch64), 'aarch64'
        )

        # set up machine models
-        self.machine_model_csx = MachineModel(
-            path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'csx.yml')
+        cls.machine_model_csx = MachineModel(
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'csx.yml')
        )
-        self.machine_model_tx2 = MachineModel(
-            path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'tx2.yml')
+        cls.machine_model_tx2 = MachineModel(
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'tx2.yml')
        )
-        self.semantics_csx = ArchSemantics(
-            self.machine_model_csx, path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/x86.yml')
+        cls.semantics_csx = ArchSemantics(
+            cls.machine_model_csx, path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/x86.yml')
        )
-        self.semantics_tx2 = ArchSemantics(
-            self.machine_model_tx2,
-            path_to_yaml=os.path.join(self.MODULE_DATA_DIR, 'isa/aarch64.yml'),
+        cls.semantics_tx2 = ArchSemantics(
+            cls.machine_model_tx2,
+            path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, 'isa/aarch64.yml'),
        )
-        self.machine_model_zen = MachineModel(arch='zen1')
+        cls.machine_model_zen = MachineModel(arch='zen1')

-        for i in range(len(self.kernel_x86)):
-            self.semantics_csx.assign_src_dst(self.kernel_x86[i])
-            self.semantics_csx.assign_tp_lt(self.kernel_x86[i])
-        for i in range(len(self.kernel_AArch64)):
-            self.semantics_tx2.assign_src_dst(self.kernel_AArch64[i])
-            self.semantics_tx2.assign_tp_lt(self.kernel_AArch64[i])
+        for i in range(len(cls.kernel_x86)):
+            cls.semantics_csx.assign_src_dst(cls.kernel_x86[i])
+            cls.semantics_csx.assign_tp_lt(cls.kernel_x86[i])
+        for i in range(len(cls.kernel_AArch64)):
+            cls.semantics_tx2.assign_src_dst(cls.kernel_AArch64[i])
+            cls.semantics_tx2.assign_tp_lt(cls.kernel_AArch64[i])

    ###########
    # Tests
@@ -88,28 +83,21 @@ class TestSemanticTools(unittest.TestCase):
        self.assertIsNone(test_mm_x86.get_instruction(None, []))
        self.assertIsNone(test_mm_arm.get_instruction(None, []))

-        # test dict DB creation
-        test_mm_x86._data['instruction_dict'] = test_mm_x86._convert_to_dict(
-            test_mm_x86._data['instruction_forms']
-        )
-        test_mm_arm._data['instruction_dict'] = test_mm_arm._convert_to_dict(
-            test_mm_arm._data['instruction_forms']
-        )
-        # test get_instruction from dict DB
-        self.assertIsNone(test_mm_x86.get_instruction_from_dict(None, []))
-        self.assertIsNone(test_mm_arm.get_instruction_from_dict(None, []))
-        self.assertIsNone(test_mm_x86.get_instruction_from_dict('NOT_IN_DB', []))
-        self.assertIsNone(test_mm_arm.get_instruction_from_dict('NOT_IN_DB', []))
+        # test get_instruction from DB
+        self.assertIsNone(test_mm_x86.get_instruction(None, []))
+        self.assertIsNone(test_mm_arm.get_instruction(None, []))
+        self.assertIsNone(test_mm_x86.get_instruction('NOT_IN_DB', []))
+        self.assertIsNone(test_mm_arm.get_instruction('NOT_IN_DB', []))
        name_x86_1 = 'vaddpd'
        operands_x86_1 = [
            {'class': 'register', 'name': 'xmm'},
            {'class': 'register', 'name': 'xmm'},
            {'class': 'register', 'name': 'xmm'},
        ]
-        instr_form_x86_1 = test_mm_x86.get_instruction_from_dict(name_x86_1, operands_x86_1)
+        instr_form_x86_1 = test_mm_x86.get_instruction(name_x86_1, operands_x86_1)
        self.assertEqual(instr_form_x86_1, test_mm_x86.get_instruction(name_x86_1, operands_x86_1))
        self.assertEqual(
-            test_mm_x86.get_instruction_from_dict('jg', [{'class': 'identifier'}]),
+            test_mm_x86.get_instruction('jg', [{'class': 'identifier'}]),
            test_mm_x86.get_instruction('jg', [{'class': 'identifier'}]),
        )
        name_arm_1 = 'fadd'
@@ -118,10 +106,10 @@ class TestSemanticTools(unittest.TestCase):
            {'class': 'register', 'prefix': 'v', 'shape': 's'},
            {'class': 'register', 'prefix': 'v', 'shape': 's'},
        ]
-        instr_form_arm_1 = test_mm_arm.get_instruction_from_dict(name_arm_1, operands_arm_1)
+        instr_form_arm_1 = test_mm_arm.get_instruction(name_arm_1, operands_arm_1)
        self.assertEqual(instr_form_arm_1, test_mm_arm.get_instruction(name_arm_1, operands_arm_1))
        self.assertEqual(
-            test_mm_arm.get_instruction_from_dict('b.ne', [{'class': 'identifier'}]),
+            test_mm_arm.get_instruction('b.ne', [{'class': 'identifier'}]),
            test_mm_arm.get_instruction('b.ne', [{'class': 'identifier'}]),
        )

--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py35,py36
+envlist = py35,py36,py37,py38,py39
 [testenv]
 commands=
    python tests/all_tests.py
Author	SHA1	Message	Date
Julian Hammer	4ff8fdc4ab	version bump	2020-11-11 15:14:27 +01:00
JanLJL	c204096d74	fixed typo	2020-11-11 14:11:00 +01:00
JanLJL	dea217c12c	fixed test after changing TP value of instruction	2020-11-11 14:04:07 +01:00
JanLJL	92c162daa2	new instructions	2020-11-11 13:54:23 +01:00
JanLJL	87ea8f0f0a	new instructions	2020-11-11 12:27:49 +01:00
Julian Hammer	cb04efc384	fixed typo	2020-11-10 13:33:24 +01:00
JanLJL	14c0ea6180	bugfixes	2020-11-09 23:29:42 +01:00
Julian Hammer	314ff4cf9d	improved performance of arch_semantics and reg dependency matching	2020-11-09 19:27:47 +01:00
Julian Hammer	f64253b2b9	added dict for instruction lookup	2020-11-09 17:00:46 +01:00
Julian Hammer	979d08358e	singelton for isa parsers	2020-11-09 12:36:14 +01:00
Julian Hammer	a2dd6f752d	added comment	2020-11-09 12:35:13 +01:00
Julian Hammer	2fb36406a7	performance improvement of throughput summation	2020-11-09 12:01:00 +01:00
Julian Hammer	94086033a8	added __main__.py	2020-11-09 08:27:31 +01:00
JanLJL	75edfc808a	version bump	2020-11-06 20:40:13 +01:00
JanLJL	c8c077a834	enhanced length warning	2020-11-06 15:49:13 +01:00
JanLJL	26ee005adc	added missing test file	2020-11-06 15:07:57 +01:00
JanLJL	207c53aaad	minor bugfix in HW model and added user warnings for more insight	2020-11-06 15:06:36 +01:00
JanLJL	fafd7bc526	Merge branch 'master' of https://github.com/RRZE-HPC/OSACA	2020-11-06 12:57:46 +01:00
JanLJL	b986d7eba0	added --lines option	2020-11-06 12:57:41 +01:00
Julian Hammer	6b0adb5d68	improved cache handing (always hashing original file)	2020-11-06 12:27:34 +01:00
JanLJL	f9f382a948	bugfixes	2020-11-06 12:03:54 +01:00
Julian Hammer	c6b58c63ab	Merge branch 'master' of github.com:RRZE-HPC/OSACA	2020-11-03 16:28:28 +01:00
Julian Hammer	78530bfdb0	fail-safed _build_cache.py	2020-11-03 16:28:07 +01:00
JanLJL	5aa0899961	added bdist	2020-11-03 16:10:46 +01:00
JanLJL	7f0abd7d10	version bump	2020-11-02 15:48:19 +01:00
JanLJL	9ba9bab107	try different ISA as fallback when parsing without --arch flag, use SKX as x86 default and enhanced ISA detection heuristic	2020-11-02 15:33:50 +01:00
Julian Hammer	983e66938c	version bump	2020-10-29 13:15:23 +01:00
JanLJL	1c889fa785	Merge branch 'master' of https://github.com/RRZE-HPC/OSACA	2020-10-29 13:00:09 +01:00
JanLJL	022598d94f	autodetect ISA and default uarch for ISA	2020-10-29 13:00:02 +01:00
Julian	1f5c9d1c61	using travis-ci.com badge	2020-10-29 12:45:39 +01:00
JanLJL	30e0ad038d	ignore pickles in data/ and support py3.9	2020-10-29 11:06:20 +01:00
Julian Hammer	decec86e56	fixed py3.5 compatability	2020-10-29 10:59:00 +01:00
JanLJL	9af689b28c	fixed bug in tests and removed unused imports	2020-10-28 19:29:48 +01:00
Julian Hammer	3aea3f2b49	Merge branch 'master' of github.com:RRZE-HPC/OSACA	2020-10-28 17:16:43 +01:00
Julian Hammer	a6cb09cf1f	added cache files to package and building during setup	2020-10-28 17:16:03 +01:00
Julian Hammer	9d2ea8603f	new caching structure with support for distribution	2020-10-28 16:29:55 +01:00
JanLJL	a7918db145	enhanced hanlding for immediates with shifting	2020-10-21 12:14:21 +02:00
Julian Hammer	b5b1a1f2b2	version bump	2020-10-20 14:36:43 +02:00
Julian	dd59af16b2	Merge pull request #51 from RRZE-HPC/A64FX A64FX support and several Arm bugfixes and enhancements including better TP scheduling	2020-10-16 10:44:47 +02:00
JanLJL	d9325724e2	removed duplicate cmp entry	2020-10-16 10:11:51 +02:00
JanLJL	7e7269c2bc	refactored operand checking in post-processing	2020-10-16 10:05:08 +02:00
JanLJL	c64a24ae1b	no \t replacement before any other point than user output	2020-10-16 09:44:18 +02:00
JanLJL	e8b78e4cc6	Merge branch 'master' into A64FX	2020-10-15 22:44:12 +02:00
JanLJL	cd5a706f56	adjusted tests for AArch64	2020-10-15 17:56:08 +02:00
Jan	13426358d0	Merge pull request #50 from RRZE-HPC/fix/increment_handling Fixing Increment Handling	2020-10-15 17:00:11 +02:00
Julian Hammer	c80088b628	Merge branch 'master' into fix/increment_handling	2020-10-15 16:36:29 +02:00
Julian Hammer	748474cd81	added more cmp versions	2020-10-15 16:23:14 +02:00
Julian Hammer	2fec0bf810	Merge branch 'master' into fix/increment_handling	2020-10-15 13:55:34 +02:00
Julian Hammer	711a41d18e	extended and cleaned up marker tests	2020-10-15 13:54:18 +02:00
Julian Hammer	cf4a9cddcb	Merge branch 'master' into fix/increment_handling	2020-10-15 13:17:02 +02:00
Julian Hammer	5a5a1e74f5	added CMP to aarch64 to exclude first op from destinations	2020-10-15 13:15:54 +02:00
Julian Hammer	4865e7ea72	fixed ignoring of last line without end marker	2020-10-15 11:59:51 +02:00
Julian Hammer	d03398ddf9	treating post- and pre-incremeted memory references no longer as src_dst the incremented register is now considered src_dst instead	2020-10-13 19:25:29 +02:00
Julian Hammer	edb8df3205	considering split AVX loads on SNB and IVB	2020-10-13 11:25:13 +02:00
Julian Hammer	489050723c	removed a nother set of no-maker tests	2020-10-13 09:03:13 +02:00
Julian Hammer	0cc0d35ce9	removed maker missing tests	2020-10-12 19:34:04 +02:00
Julian Hammer	7f65bdb022	version bump	2020-10-12 15:39:49 +02:00
Julian Hammer	04360cc897	fixed label identifiers by splitting	2020-10-12 15:39:32 +02:00
Julian Hammer	5e7a12f9bb	paranthesis now suppored in identifier strings	2020-10-12 15:05:52 +02:00
Julian Hammer	1def12ee79	if not markes were found, use whole code	2020-10-12 15:04:55 +02:00
Julian Hammer	7269156854	added `--out` argument	2020-10-12 15:04:18 +02:00
Julian Hammer	d6529ced73	fixed push and added pop	2020-10-12 15:03:03 +02:00
Julian Hammer	eac728dc9f	added tx2 support for `ldp d1, d2, [x3]`	2020-10-07 13:57:57 +02:00
JanLJL	451ba62959	added vector mov	2020-09-23 10:07:43 +02:00
JanLJL	57cf1bfe6f	Merge branch 'master' of github.com:RRZE-HPC/osaca	2020-09-17 22:28:56 +02:00
JanLJL	44b921aa73	added BS4 dependency	2020-09-17 22:27:37 +02:00
JanLJL	accb52ce53	Merge branch 'master' of github.com:RRZE-HPC/osaca	2020-09-17 22:15:20 +02:00
JanLJL	9e78f85475	added instructions	2020-09-17 22:14:14 +02:00
JanLJL	64da89ec3d	enhancecd ARM identifier to support immediate offsets	2020-09-17 22:12:12 +02:00
JanLJL	adeae88665	instr update	2020-09-17 21:21:15 +02:00
JanLJL	1698ed1776	gather enhancement	2020-09-03 13:48:00 +02:00
JanLJL	2ef6051e64	added gather load instruction	2020-09-03 09:30:19 +02:00
Julian Hammer	3308f5d68f	version bump	2020-08-05 10:59:10 +02:00
Julian Hammer	bd61b94669	ignoring b.none branched in basic block detection	2020-08-03 19:23:33 +02:00
JanLJL	0db8b6bcbf	fixed first character match for symbolic identifiers	2020-08-03 18:30:29 +02:00
Jan	40755b2080	Merge pull request #49 from RRZE-HPC/coherent_label_parsing Coherent label parsing	2020-08-03 18:25:20 +02:00
JanLJL	269148c2a1	save b/f in numeric identifier as suffix tag	2020-08-03 18:08:29 +02:00
JanLJL	12a8506530	removed unnecessary code	2020-08-03 17:14:58 +02:00
JanLJL	e715badcf9	detects numeric label as label	2020-08-03 16:59:48 +02:00
Julian Hammer	d6b4355a77	labels may now start with numbers	2020-08-03 15:53:29 +02:00
JanLJL	5361b63b52	version bump	2020-08-03 09:38:50 +02:00
JanLJL	cc39342047	minor enhancement for mask parsing	2020-08-03 09:07:45 +02:00
JanLJL	addcdeda85	added sve instructions	2020-08-03 08:55:37 +02:00
JanLJL	23d36a651b	enhancements for SVE support	2020-08-03 08:54:59 +02:00
JanLJL	b052ab4151	bugfix in OoO scheduling	2020-07-28 14:52:30 +02:00
JanLJL	673da99fba	minor enhancements for scheduling	2020-07-23 15:55:56 +02:00
JanLJL	6c72281d65	prepared for aarch64 8.2 support	2020-07-23 15:54:54 +02:00
JanLJL	5520362e65	adjustments and bugfixes	2020-07-13 18:53:19 +02:00
JanLJL	93060eee43	Merge branch 'master' into A64FX	2020-07-13 14:41:49 +02:00
JanLJL	0e77b7bc9a	enhanced TP scheduling	2020-07-06 18:49:46 +02:00
JanLJL	ce8c3ff9ab	bugfixes for A64FX	2020-07-06 18:48:54 +02:00
Jan	acbde7a19c	Merge pull request #48 from RRZE-HPC/n1 initial implementation of Neoverse N1 support	2020-07-02 09:32:54 +02:00
Cloud User	34e978d2ae	initial implementation of Neoverse N1 support	2020-06-30 20:28:57 +00:00
JanLJL	6294e2e9da	initial commit for trying to support a64fx	2020-06-26 05:20:40 +02:00
JanLJL	6801229275	PEP8 adjustments	2020-06-25 21:56:18 +02:00
JanLJL	d3d1a89600	two new instrs	2020-06-25 21:55:10 +02:00
JanLJL	93c1951097	prettified aarch64 ISA DB	2020-06-25 21:54:52 +02:00
JanLJL	7211dd0799	improvements for uops.info importer script	2020-06-25 21:53:41 +02:00
JanLJL	5258d65c8e	few more instructions	2020-06-24 17:41:30 +02:00
JanLJL	379fe80169	added initial support for Intel Ice Lake (ICL)	2020-06-22 22:15:14 +02:00
JanLJL	94d7d35c0b	more instructions	2020-05-04 18:50:58 +02:00
JanLJL	1009c60d2d	fixed wrong output format for 3-digit TP numbers	2020-04-08 21:28:50 +02:00
JanLJL	229b316b6d	added some instructions	2020-04-08 15:54:31 +02:00
JanLJL	c0753be899	added python 3.7/3.8 to tests	2020-04-02 09:20:08 +02:00
JanLJL	eaa56792ab	added bs4 dependency for Travis	2020-04-02 09:08:08 +02:00
JanLJL	3425fa3024	added tests	2020-04-02 08:57:26 +02:00
JanLJL	38924b6ec1	more instructions	2020-03-30 18:27:33 +02:00
JanLJL	d6ae457de4	removed duplicates in CSX DB	2020-03-30 18:18:35 +02:00
JanLJL	a5c2ab1a4a	bugfix for online check of operands	2020-03-26 11:46:46 +01:00
JanLJL	e4393189dc	minor update	2020-03-26 11:06:11 +01:00
JanLJL	3016fc7c46	added more tests	2020-03-26 10:19:14 +01:00
JanLJL	82f47d217c	Merge branch 'master' of github.com:RRZE-HPC/osaca	2020-03-26 10:03:23 +01:00
JanLJL	1754df42d2	enhanced x86 parser for directives	2020-03-26 10:02:39 +01:00
Julian Hammer	ac1295aac2	flag string in output now in line with required flags	2020-03-24 16:02:40 +01:00
Julian Hammer	9624e6c109	closing cache file after dump	2020-03-24 15:20:49 +01:00
Julian Hammer	2d16037c44	Merge branch 'master' of github.com:RRZE-HPC/OSACA	2020-03-21 17:18:37 +01:00
Julian Hammer	c5801cfe2f	closing cache file	2020-03-21 17:18:04 +01:00
Julian Hammer	3e960dd4ac	closing cache file	2020-03-20 15:02:30 +01:00
JanLJL	680774267d	fixed wrong import of mm registers	2020-03-17 12:56:12 +01:00
JanLJL	1aa710f195	enhanced MachineModel to support mask/zeroing differentiation for instruction forms	2020-03-17 12:55:37 +01:00