Merge pull request #60 from RRZE-HPC/a72

Add support for ARM Cortex-A72
2026-01-05 10:40:06 +01:00 · 2021-10-14 18:10:36 +02:00
parent 8c94378437 c36fab40cb
commit 2995f1873d
7 changed files with 5714 additions and 2 deletions
--- a/README.rst
+++ b/README.rst
@@ -95,7 +95,7 @@ The usage of OSACA can be listed as:
 --arch ARCH
  needs to be replaced with the target architecture abbreviation.
  Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX`` and ``ICL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
-  Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse  and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
+  Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM's Cortex-A72 and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
  If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
 --fixed
  Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
--- a/osaca/data/a72.yml
+++ b/osaca/data/a72.yml
--- a/osaca/data/a72/mapping_pmevo.json
+++ b/osaca/data/a72/mapping_pmevo.json
--- a/osaca/data/a72/template.yml
+++ b/osaca/data/a72/template.yml
@@ -0,0 +1,808 @@
+osaca_version: 0.3.11
+micro_architecture: Cortex A-72
+arch_code: a72
+isa: aarch64
+hidden_loads: false
+ports: ['0', '1', '2', '3', '4', '5', '6', '7']
+port_model_scheme: |
+  +-------------------------------------------------------------------------------------+
+  |                                      scheduler                                      |
+  +-------------------------------------------------------------------------------------+
+    0 |I      1 |L       2 |M     3 |S        4 |F1       5 |I       6 |F0      7 |B
+      \/        \/         \/       \/          \/          \/         \/         \/
+   +-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+   |INT ALU| |  LOAD | |  MUL  | | STORE | |   ASIMD   | |INT ALU| |  ASIMD  | | Branch|
+   +-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+   +-------+           +-------+           +-----------+ +-------+ +---------+
+   |  AGU  |           |  DIV  |           |   FP ALU  | |  AGU  | |ASIMD MUL|
+   +-------+           +-------+           +-----------+ +-------+ +---------+
+                       +-------+           +-----------+           +---------+
+                       | SHIFT |           |   FP MUL  |           |  FP ALU |
+                       +-------+           +-----------+           +---------+
+                       +-------+           +-----------+           +---------+
+                       |  CRC  |           |   FP DIV  |           |  FP MUL |
+                       +-------+           +-----------+           +---------+
+                       +-------+           +-----------+           +---------+
+                       |  USAD |           |  FP SQRT  |           |  FP DIV |
+                       +-------+           +-----------+           +---------+
+                                           +-----------+           +---------+
+                                           |ASIMD SHIFT|           | FP CONV |
+                                           +-----------+           +---------+
+                                                                   +---------+
+                                                                   |  CRYPTO |
+                                                                   +---------+
+# The port pressures do not always correctly match this schema, because most
+# instructions are imported from an experimentally determined mapping, which
+# is not always correct.
+load_latency: {x: 4.0, s: 5.0, d: 5.0, h: 6.0, q: 6.0}
+load_throughput: []
+load_throughput_default: [[1, '1']]
+store_throughput: []
+store_throughput_default: [[2, '3']]
+instruction_forms:
+
+# Branch
+- name: b
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: bne
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: b.ne
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: br
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: ret
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: bl
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '05'], [1, '7']]
+  throughput: 1.0
+- name: blr
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '05'], [1, '7']]
+  throughput: 1.0
+
+# Load GPR
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  # Loads issue on the LOAD port ('1'); the extra integer ALU slot ('0' or '5')
+  # is presumably the base register write-back, as in the post-indexed form.
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+
+# Load FP d
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+
+# Load FP q
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 6.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+
+# Store GPR
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+
+# Store FP d
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+
+# Store FP q
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: true
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [2, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[2, '3'], [2, '05']]
+  throughput: 2.0
+
+# Load unscaled GPR
+- name: ldur
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+
+# Load unscaled FP q
+- name: ldur
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+
+# Store unscaled GPR
+- name: stur
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 1.0
+  port_pressure: [[1, '3']]
+  throughput: 1.0
+
+# Store unscaled FP q
+- name: stur
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 2.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+
+# Load pair GPR
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+
+# Load pair FP q
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[2, '1']]
+  throughput: 2.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[2, '1'], [1, '05']]
+  throughput: 2.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 6.0
+  port_pressure: [[2, '1'], [1, '05']]
+  throughput: 2.0
+
+# Store pair GPR
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 2.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+
+# Store pair FP q
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+
+# Fast-forward (measures 4 cycles, but can be 3)
+# Lower bound is used in order to ensure no over-estimates are possible.
+# Ports do not match documentation, but "fixing" requires also "fixing" almost
+# the entire rest of the model.
+- name: fadd
+  operands:
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fadd
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fsub
+  operands:
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fsub
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+
+# Automatically generated instructions
--- a/osaca/data/pmevo_importer.py
+++ b/osaca/data/pmevo_importer.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import math
+import re
+import sys
+
+from asmbench import bench, op
+from osaca.semantics import MachineModel
+
+
+def build_bench_instruction(name, operands):
+    # Converts an OSACA model instruction to an asmbench one.
+    # Returns `None` in case something went wrong.
+    asmbench_inst = name
+    direction = "dst"
+    separator = " "
+    shift = ""
+    for operand in operands:
+        if operand["class"] == "register" or operand["class"] == "register_shift":
+            if operand["prefix"] == "x":
+                shape = "i64"
+                constraint = "r"
+            elif operand["prefix"] == "s":
+                shape = "float"
+                constraint = "w"
+            elif operand["prefix"] == "d":
+                shape = "double"
+                constraint = "w"
+            elif operand["prefix"] == "v":
+                constraint = "w"
+                if operand["shape"] == "b":
+                    shape = "<16 x i8>"
+                elif operand["shape"] == "h":
+                    shape = "<8 x i16>"
+                elif operand["shape"] == "s":
+                    shape = "<4 x float>"
+                elif operand["shape"] == "d":
+                    shape = "<2 x double>"
+                else:
+                    return None
+            else:
+                return None
+            if operand["class"] == "register_shift":
+                shift = ", {}".format(operand["shift_op"])
+                if operand["shift"] is not None:
+                    shift += " {}".format(operand["shift"])
+        elif operand["class"] == "immediate" or operand["class"] == "immediate_shift":
+            shape = "i32"
+            # Different instructions have different ranges for literaly,
+            # so need to pick something "reasonable" for each.
+            if name in [
+                "cmeq",
+                "cmge",
+                "cmgt",
+                "cmle",
+                "cmlt",
+                "fcmeq",
+                "fcmge",
+                "fcmgt",
+                "fcmle",
+                "fcmlt",
+                "fcmp",
+            ]:
+                constraint = "0"
+            elif name in ["and", "ands", "eor", "eors", "orr", "orrs"]:
+                constraint = "255"
+            elif name in ["bfi", "extr", "sbfiz", "sbfx", "shl", "sshr", "ubfiz", "ubfx", "ushr"]:
+                constraint = "7"
+            else:
+                constraint = "42"
+            if operand["class"] == "immediate_shift":
+                shift = ", {}".format(operand["shift_op"])
+                if operand["shift"] is not None:
+                    shift += " {}".format(operand["shift"])
+        else:
+            return None
+        asmbench_inst += "{}{{{}:{}:{}}}{}".format(separator, direction, shape, constraint, shift)
+        direction = "src"
+        separator = ", "
+    return asmbench_inst
+
+
+def bench_instruction(name, operands):
+    # Converts an OSACA model instruction to an asmbench one and benchmarks it.
+    # Returned tuple may contain a `None` in case something went wrong.
+    asmbench_inst = build_bench_instruction(name, operands)
+    if asmbench_inst is None:
+        return (None, None)
+    return bench.bench_instructions([op.Instruction.from_string(asmbench_inst)])
+
+
+def round_cycles(value):
+    if value < 0.9:
+        # Frequently found, so we might want to include them.
+        # Measurements over-estimate a lot here, hence the high bound.
+        return 0.5
+    else:
+        # Measurements usually over-estimate, so usually round down,
+        # but still allow slightly smaller values.
+        return float(math.floor(value + 0.1))
+
+
+def operand_parse(op, state):
+    # Parses an operand from an PMEvo instruction and emits an OSACA model one.
+    # State object is used to keep track of types for future operands, e.g. literals.
+    # Future invocations may also modify previously returned objects.
+    parameter = {}
+
+    if op.startswith("_((REG:"):
+        parts = op.split(".")
+        register = parts[0][7:-2]
+        read_write, register_type, bits = register.split(":")
+
+        parameter["class"] = "register"
+        if register_type == "G":
+            if bits == "32":
+                parameter["prefix"] = "r"
+            elif bits == "64":
+                parameter["prefix"] = "x"
+            else:
+                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
+        elif register_type == "F":
+            if bits == "32":
+                parameter["prefix"] = "s"
+                state["type"] = "float"
+            elif bits == "64":
+                parameter["prefix"] = "d"
+                state["type"] = "double"
+            elif bits == "128":
+                parameter["prefix"] = "q"
+            elif bits == "VEC":
+                vec_shape = parts[1]
+                parameter["prefix"] = "v"
+                if vec_shape == "16b":
+                    parameter["shape"] = "b"
+                elif vec_shape == "8h":
+                    parameter["shape"] = "h"
+                elif vec_shape == "4s":
+                    parameter["shape"] = "s"
+                    state["type"] = "float"
+                elif vec_shape == "2d":
+                    parameter["shape"] = "d"
+                    state["type"] = "double"
+                else:
+                    raise ValueError("Invalid vector shape {}".format(vec_shape))
+            else:
+                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
+        else:
+            raise ValueError("Unknown register type {}".format(register_type))
+    elif op.startswith("_[((MEM:"):
+        bits = op[8:-2].split(":")[0]
+        if bits == "64":
+            state["memory_base"] = "x"
+        else:
+            raise ValueError("Invalid register bits for MEM {}".format(bits))
+        return None
+    elif op.startswith("_((MIMM:"):
+        bits = op[8:-3].split(":")[0]
+        if bits == "16":
+            parameter["class"] = "memory"
+            parameter["base"] = state["memory_base"]
+            parameter["offset"] = "imd"
+            parameter["index"] = "*"
+            parameter["scale"] = "*"
+            parameter["post-indexed"] = False
+            parameter["pre-indexed"] = False
+        else:
+            raise ValueError("Invalid register bits for MEM {}".format(bits))
+    elif re.fullmatch("_#?-?(0x)?[0-9a-f]+", op):
+        parameter["class"] = "immediate"
+        parameter["imd"] = "int"
+    elif re.fullmatch("_#?-?[0-9]*\\.[0-9]*", op):
+        parameter["class"] = "immediate"
+        parameter["imd"] = state["type"]
+    elif re.fullmatch("_((sxt|uxt)[bhw]|lsl|lsr|asr|rol|ror)(_[0-9]+)?", op):
+        # split = op[1:].split('_')
+        # shift_op = split[0]
+        # shift = None
+        # if len(split) >= 2:
+        #     shift = split[1]
+        # state['previous']['class'] += '_shift'
+        # state['previous']['shift_op'] = shift_op
+        # if shift != None:
+        #     state['previous']['shift'] = shift
+        # return None
+        raise ValueError("Skipping instruction with shift operand: {}".format(op))
+    else:
+        raise ValueError("Unknown operand {}".format(op))
+
+    state["previous"] = parameter
+    return parameter
+
+
+def port_convert(ports):
+    # Try to merge repeated entries together and emit in OSACA's format.
+    # FIXME: This does not handle having more than 10 ports.
+    pressures = []
+    previous = None
+    cycles = 0
+
+    for entry in ports:
+        possible_ports = "".join(entry)
+
+        if possible_ports != previous:
+            if previous is not None:
+                pressures.append([cycles, previous])
+            previous = possible_ports
+            cycles = 0
+
+        cycles += 1
+
+    if previous is not None:
+        pressures.append([cycles, previous])
+
+    return pressures
+
+
+def throughput_guess(ports):
+    # Minimum amount of possible ports per cycle should determine throughput
+    # to some degree of accuracy. (THIS IS *NOT* ALWAYS TRUE!)
+    bottleneck_ports = min(map(lambda it: len(it), ports))
+    return float(len(ports)) / bottleneck_ports
+
+
+def latency_guess(ports):
+    # Each entry in the ports array equates to one cycle on any of the ports.
+    # So this is about as good as it is going to get.
+    return float(len(ports))
+
+
+def extract_model(mapping, arch, template_model, asmbench):
+    try:
+        isa = MachineModel.get_isa_for_arch(arch)
+    except ValueError:
+        print("Skipping...", file=sys.stderr)
+        return None
+    if template_model is None:
+        mm = MachineModel(isa=isa)
+    else:
+        mm = template_model
+
+    for port in mapping["arch"]["ports"]:
+        mm.add_port(port)
+
+    for insn in mapping["arch"]["insns"]:
+        try:
+            ports = mapping["assignment"][insn]
+
+            # Parse instruction
+            insn_split = insn.split("_")
+            name = insn_split[1]
+            insn_parts = list(("_" + "_".join(insn_split[2:])).split(","))
+            operands = []
+            state = {}
+            for operand in insn_parts:
+                parsed = operand_parse(operand, state)
+                if parsed is not None:
+                    operands.append(parsed)
+
+            # Port pressures from mapping
+            port_pressure = port_convert(ports)
+
+            # Initial guessed throughput and latency
+            throughput = throughput_guess(ports)
+            latency = latency_guess(ports)
+
+            # Benchmark with asmbench
+            # print(build_bench_instruction(name, operands))
+            if asmbench:
+                bench_latency, bench_throughput = bench_instruction(name, operands)
+                if bench_throughput is not None:
+                    throughput = round_cycles(bench_throughput)
+                else:
+                    print("Failed to measure throughput for instruction {}.".format(insn))
+                if bench_latency is not None:
+                    latency = round_cycles(bench_latency)
+                else:
+                    print("Failed to measure latency for instruction {}.".format(insn))
+
+            # No u-ops data available
+            uops = None
+
+            # Insert instruction if not already found (can happen with template)
+            if mm.get_instruction(name, operands) is None:
+                mm.set_instruction(name, operands, latency, port_pressure, throughput, uops)
+        except ValueError as e:
+            print("Failed to parse instruction {}: {}.".format(insn, e))
+
+    return mm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("json", help="path of mapping.json")
+    parser.add_argument("yaml", help="path of template.yml", nargs="?")
+    parser.add_argument(
+        "--asmbench", help="Benchmark latency and throughput using asmbench.", action="store_true"
+    )
+    args = parser.parse_args()
+
+    json_file = open(args.json, "r")
+    mapping = json.load(json_file)
+    arch = mapping["arch"]["name"].lower()
+    json_file.close()
+
+    template_model = None
+    if args.yaml is not None:
+        template_model = MachineModel(path_to_yaml=args.yaml)
+
+    if args.asmbench:
+        bench.setup_llvm()
+
+    model = extract_model(mapping, arch, template_model, args.asmbench)
+
+    with open("{}.yml".format(arch.lower()), "w") as f:
+        f.write(model.dump())
+
+
+if __name__ == "__main__":
+    main()
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -32,6 +32,7 @@ SUPPORTED_ARCHS = [
    "TX2",
    "N1",
    "A64FX",
+    "A72",
 ]
 DEFAULT_ARCHS = {
    "aarch64": "A64FX",
@@ -95,7 +96,7 @@ def create_parser(parser=None):
        "--arch",
        type=str,
        help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, "
-        "A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
+        "A64FX, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
    )
    parser.add_argument(
        "--fixed",
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -266,11 +266,13 @@ class MachineModel(object):
        """Return ISA for given micro-arch ``arch``."""
        arch_dict = {
            "a64fx": "aarch64",
+            "a72": "aarch64",
            "tx2": "aarch64",
            "n1": "aarch64",
            "zen1": "x86",
            "zen+": "x86",
            "zen2": "x86",
+            "icl": "x86",
            "con": "x86",  # Intel Conroe
            "wol": "x86",  # Intel Wolfdale
            "snb": "x86",