version bump

black formatting
Merge branch 'master' of github.com:RRZE-HPC/OSACA
2025-12-16 09:00:05 +01:00 · 2021-11-04 14:56:23 +01:00 · 2021-11-04 12:11:15 +01:00 · 2021-11-04 12:09:57 +01:00 · 2021-11-04 12:09:44 +01:00 · 2021-11-04 11:58:40 +01:00
11 changed files with 5838 additions and 15 deletions
--- a/README.rst
+++ b/README.rst
@@ -95,7 +95,7 @@ The usage of OSACA can be listed as:
 --arch ARCH
  needs to be replaced with the target architecture abbreviation.
  Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX`` and ``ICL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
-  Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse  and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
+  Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72 and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
  If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
 --fixed
  Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
--- a/osaca/init.py
+++ b/osaca/init.py
@@ -1,6 +1,6 @@
 """Open Source Architecture Code Analyzer"""
 name = "osaca"
-__version__ = "0.4.6"
+__version__ = "0.4.7"

 # To trigger travis deployment to pypi, do the following:
 # 1. Increment __version___
--- a/osaca/data/a72.yml
+++ b/osaca/data/a72.yml
--- a/osaca/data/a72/mapping_pmevo.json
+++ b/osaca/data/a72/mapping_pmevo.json
--- a/osaca/data/a72/template.yml
+++ b/osaca/data/a72/template.yml
@@ -0,0 +1,808 @@
+osaca_version: 0.3.11
+micro_architecture: Cortex A-72
+arch_code: a72
+isa: aarch64
+hidden_loads: false
+ports: ['0', '1', '2', '3', '4', '5', '6', '7']
+port_model_scheme: |
+  +-------------------------------------------------------------------------------------+
+  |                                      scheduler                                      |
+  +-------------------------------------------------------------------------------------+
+    0 |I      1 |L       2 |M     3 |S        4 |F1       5 |I       6 |F0      7 |B
+      \/        \/         \/       \/          \/          \/         \/         \/
+   +-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+   |INT ALU| |  LOAD | |  MUL  | | STORE | |   ASIMD   | |INT ALU| |  ASIMD  | | Branch|
+   +-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+   +-------+           +-------+           +-----------+ +-------+ +---------+
+   |  AGU  |           |  DIV  |           |   FP ALU  | |  AGU  | |ASIMD MUL|
+   +-------+           +-------+           +-----------+ +-------+ +---------+
+                       +-------+           +-----------+           +---------+
+                       | SHIFT |           |   FP MUL  |           |  FP ALU |
+                       +-------+           +-----------+           +---------+
+                       +-------+           +-----------+           +---------+
+                       |  CRC  |           |   FP DIV  |           |  FP MUL |
+                       +-------+           +-----------+           +---------+
+                       +-------+           +-----------+           +---------+
+                       |  USAD |           |  FP SQRT  |           |  FP DIV |
+                       +-------+           +-----------+           +---------+
+                                           +-----------+           +---------+
+                                           |ASIMD SHIFT|           | FP CONV |
+                                           +-----------+           +---------+
+                                                                   +---------+
+                                                                   |  CRYPTO |
+                                                                   +---------+
+# The port pressures do not always correctly match this schema, because most
+# instructions are imported from an experimentally determined mapping, which
+# is not always correct.
+load_latency: {x: 4.0, s: 5.0, d: 5.0, h: 6.0, q: 6.0}
+load_throughput: []
+load_throughput_default: [[1, '1']]
+store_throughput: []
+store_throughput_default: [[2, '3']]
+instruction_forms:
+
+# Branch
+- name: b
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: bne
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: b.ne
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: br
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: ret
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '7']]
+  throughput: 1.0
+- name: bl
+  operands:
+  - class: identifier
+  latency: 1.0
+  port_pressure: [[1, '05'], [1, '7']]
+  throughput: 1.0
+- name: blr
+  operands:
+  - class: register
+    prefix: x
+  latency: 1.0
+  port_pressure: [[1, '05'], [1, '7']]
+  throughput: 1.0
+
+# Load GPR
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+
+# Load FP d
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+
+# Load FP q
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: true
+    pre-indexed: false
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: true
+  latency: 5.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+- name: ldr
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 6.0
+  port_pressure: [[1, '1'], [2, '05']]
+  throughput: 1.0
+
+# Store GPR
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+
+# Store FP d
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+- name: str
+  operands:
+  - class: register
+    prefix: d
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 1.0
+  port_pressure: [[1, '3'], [1, '05']]
+  throughput: 1.0
+
+# Store FP q
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: 1
+    post-indexed: false
+    pre-indexed: true
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[2, '3'], [2, '05']]
+  throughput: 2.0
+- name: str
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[2, '3'], [2, '05']]
+  throughput: 2.0
+
+# Load unscaled GPR
+- name: ldur
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+
+# Load unscaled FP q
+- name: ldur
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 5.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+
+# Store unscaled GPR
+- name: stur
+  operands:
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 1.0
+  port_pressure: [[1, '3']]
+  throughput: 1.0
+
+# Store unscaled FP q
+- name: stur
+  operands:
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: '*'
+    pre-indexed: '*'
+  latency: 2.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+
+# Load pair GPR
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1']]
+  throughput: 1.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[1, '1'], [1, '05']]
+  throughput: 1.0
+
+# Load pair FP q
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[2, '1']]
+  throughput: 2.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 6.0
+  port_pressure: [[2, '1'], [1, '05']]
+  throughput: 2.0
+- name: ldp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 6.0
+  port_pressure: [[2, '1'], [1, '05']]
+  throughput: 2.0
+
+# Store pair GPR
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 2.0
+  port_pressure: [[2, '3']]
+  throughput: 2.0
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+- name: stp
+  operands:
+  - class: register
+    prefix: x
+  - class: register
+    prefix: x
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 2.0
+  port_pressure: [[2, '3'], [1, '05']]
+  throughput: 2.0
+
+# Store pair FP q
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: true
+    pre-indexed: false
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+- name: stp
+  operands:
+  - class: register
+    prefix: q
+  - class: register
+    prefix: q
+  - class: memory
+    base: x
+    offset: '*'
+    index: '*'
+    scale: '*'
+    post-indexed: false
+    pre-indexed: true
+  latency: 4.0
+  port_pressure: [[4, '3'], [1, '05']]
+  throughput: 4.0
+
+# Fast-forward (measures 4 cycles, but can be 3)
+# Lower bound is used in order to ensure no over-estimates are possible.
+# Ports do not match documentation, but "fixing" requires also "fixing" almost
+# the entire rest of the model.
+- name: fadd
+  operands:
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fadd
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fadd
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fsub
+  operands:
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  - class: register
+    prefix: s
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fsub
+  operands:
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  - class: register
+    prefix: d
+  latency: 3.0
+  port_pressure: [[1, '45']]
+  throughput: 0.5
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  - class: register
+    prefix: v
+    shape: s
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+- name: fsub
+  operands:
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  - class: register
+    prefix: v
+    shape: d
+  latency: 3.0
+  port_pressure: [[1, '5']]
+  throughput: 1.0
+
+# Automatically generated instructions
--- a/osaca/data/isa/x86.yml
+++ b/osaca/data/isa/x86.yml
@@ -212,6 +212,67 @@ instruction_forms:
          name: "xmm"
          source: true
          destination: true
+    - name: not
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: not
+      operands:
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: true
+    - name: or
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: or
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: true
+    - name: or
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: true
+    - name: or
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: true
    - name: and
      operands:
        - class: "immediate"
@@ -4085,6 +4146,55 @@ instruction_forms:
          name: "*"
          source: false
          destination: true
+    - name: xor
+      operands:
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+    - name: xor
+      operands:
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: true
+    - name: xor
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "memory"
+          base: "*"
+          offset: "*"
+          index: "*"
+          scale: "*"
+          source: true
+          destination: true
+    - name: xor
+      operands:
+        - class: "immediate"
+          imd: "int"
+          source: true
+          destination: false
+        - class: "register"
+          name: "gpr"
+          source: true
+          destination: false
    - name: xor
      breaks_dependency_on_equal_operands: true
      operands:
--- a/osaca/data/pmevo_importer.py
+++ b/osaca/data/pmevo_importer.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import math
+import re
+import sys
+
+from asmbench import bench, op
+from osaca.semantics import MachineModel
+
+
+def build_bench_instruction(name, operands):
+    # Converts an OSACA model instruction to an asmbench one.
+    # Returns `None` in case something went wrong.
+    asmbench_inst = name
+    direction = "dst"
+    separator = " "
+    shift = ""
+    for operand in operands:
+        if operand["class"] == "register" or operand["class"] == "register_shift":
+            if operand["prefix"] == "x":
+                shape = "i64"
+                constraint = "r"
+            elif operand["prefix"] == "s":
+                shape = "float"
+                constraint = "w"
+            elif operand["prefix"] == "d":
+                shape = "double"
+                constraint = "w"
+            elif operand["prefix"] == "v":
+                constraint = "w"
+                if operand["shape"] == "b":
+                    shape = "<16 x i8>"
+                elif operand["shape"] == "h":
+                    shape = "<8 x i16>"
+                elif operand["shape"] == "s":
+                    shape = "<4 x float>"
+                elif operand["shape"] == "d":
+                    shape = "<2 x double>"
+                else:
+                    return None
+            else:
+                return None
+            if operand["class"] == "register_shift":
+                shift = ", {}".format(operand["shift_op"])
+                if operand["shift"] is not None:
+                    shift += " {}".format(operand["shift"])
+        elif operand["class"] == "immediate" or operand["class"] == "immediate_shift":
+            shape = "i32"
+            # Different instructions have different ranges for literals,
+            # so we need to pick something "reasonable" for each.
+            if name in [
+                "cmeq",
+                "cmge",
+                "cmgt",
+                "cmle",
+                "cmlt",
+                "fcmeq",
+                "fcmge",
+                "fcmgt",
+                "fcmle",
+                "fcmlt",
+                "fcmp",
+            ]:
+                constraint = "0"
+            elif name in ["and", "ands", "eor", "eors", "orr", "orrs"]:
+                constraint = "255"
+            elif name in ["bfi", "extr", "sbfiz", "sbfx", "shl", "sshr", "ubfiz", "ubfx", "ushr"]:
+                constraint = "7"
+            else:
+                constraint = "42"
+            if operand["class"] == "immediate_shift":
+                shift = ", {}".format(operand["shift_op"])
+                if operand["shift"] is not None:
+                    shift += " {}".format(operand["shift"])
+        else:
+            return None
+        asmbench_inst += "{}{{{}:{}:{}}}{}".format(separator, direction, shape, constraint, shift)
+        direction = "src"
+        separator = ", "
+    return asmbench_inst
+
+
+def bench_instruction(name, operands):
+    # Converts an OSACA model instruction to an asmbench one and benchmarks it.
+    # Returned tuple may contain a `None` in case something went wrong.
+    asmbench_inst = build_bench_instruction(name, operands)
+    if asmbench_inst is None:
+        return (None, None)
+    return bench.bench_instructions([op.Instruction.from_string(asmbench_inst)])
+
+
+def round_cycles(value):
+    if value < 0.9:
+        # Frequently found, so we might want to include them.
+        # Measurements over-estimate a lot here, hence the high bound.
+        return 0.5
+    else:
+        # Measurements usually over-estimate, so usually round down,
+        # but still allow slightly smaller values.
+        return float(math.floor(value + 0.1))
+
+
+def operand_parse(op, state):
+    # Parses an operand from a PMEvo instruction and emits an OSACA model one.
+    # State object is used to keep track of types for future operands, e.g. literals.
+    # Future invocations may also modify previously returned objects.
+    parameter = {}
+
+    if op.startswith("_((REG:"):
+        parts = op.split(".")
+        register = parts[0][7:-2]
+        read_write, register_type, bits = register.split(":")
+
+        parameter["class"] = "register"
+        if register_type == "G":
+            if bits == "32":
+                parameter["prefix"] = "r"
+            elif bits == "64":
+                parameter["prefix"] = "x"
+            else:
+                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
+        elif register_type == "F":
+            if bits == "32":
+                parameter["prefix"] = "s"
+                state["type"] = "float"
+            elif bits == "64":
+                parameter["prefix"] = "d"
+                state["type"] = "double"
+            elif bits == "128":
+                parameter["prefix"] = "q"
+            elif bits == "VEC":
+                vec_shape = parts[1]
+                parameter["prefix"] = "v"
+                if vec_shape == "16b":
+                    parameter["shape"] = "b"
+                elif vec_shape == "8h":
+                    parameter["shape"] = "h"
+                elif vec_shape == "4s":
+                    parameter["shape"] = "s"
+                    state["type"] = "float"
+                elif vec_shape == "2d":
+                    parameter["shape"] = "d"
+                    state["type"] = "double"
+                else:
+                    raise ValueError("Invalid vector shape {}".format(vec_shape))
+            else:
+                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
+        else:
+            raise ValueError("Unknown register type {}".format(register_type))
+    elif op.startswith("_[((MEM:"):
+        bits = op[8:-2].split(":")[0]
+        if bits == "64":
+            state["memory_base"] = "x"
+        else:
+            raise ValueError("Invalid register bits for MEM {}".format(bits))
+        return None
+    elif op.startswith("_((MIMM:"):
+        bits = op[8:-3].split(":")[0]
+        if bits == "16":
+            parameter["class"] = "memory"
+            parameter["base"] = state["memory_base"]
+            parameter["offset"] = "imd"
+            parameter["index"] = "*"
+            parameter["scale"] = "*"
+            parameter["post-indexed"] = False
+            parameter["pre-indexed"] = False
+        else:
+            raise ValueError("Invalid register bits for MEM {}".format(bits))
+    elif re.fullmatch("_#?-?(0x)?[0-9a-f]+", op):
+        parameter["class"] = "immediate"
+        parameter["imd"] = "int"
+    elif re.fullmatch("_#?-?[0-9]*\\.[0-9]*", op):
+        parameter["class"] = "immediate"
+        parameter["imd"] = state["type"]
+    elif re.fullmatch("_((sxt|uxt)[bhw]|lsl|lsr|asr|rol|ror)(_[0-9]+)?", op):
+        # split = op[1:].split('_')
+        # shift_op = split[0]
+        # shift = None
+        # if len(split) >= 2:
+        #     shift = split[1]
+        # state['previous']['class'] += '_shift'
+        # state['previous']['shift_op'] = shift_op
+        # if shift != None:
+        #     state['previous']['shift'] = shift
+        # return None
+        raise ValueError("Skipping instruction with shift operand: {}".format(op))
+    else:
+        raise ValueError("Unknown operand {}".format(op))
+
+    state["previous"] = parameter
+    return parameter
+
+
+def port_convert(ports):
+    # Try to merge repeated entries together and emit in OSACA's format.
+    # FIXME: This does not handle having more than 10 ports.
+    pressures = []
+    previous = None
+    cycles = 0
+
+    for entry in ports:
+        possible_ports = "".join(entry)
+
+        if possible_ports != previous:
+            if previous is not None:
+                pressures.append([cycles, previous])
+            previous = possible_ports
+            cycles = 0
+
+        cycles += 1
+
+    if previous is not None:
+        pressures.append([cycles, previous])
+
+    return pressures
+
+
+def throughput_guess(ports):
+    # Minimum amount of possible ports per cycle should determine throughput
+    # to some degree of accuracy. (THIS IS *NOT* ALWAYS TRUE!)
+    bottleneck_ports = min(map(lambda it: len(it), ports))
+    return float(len(ports)) / bottleneck_ports
+
+
+def latency_guess(ports):
+    # Each entry in the ports array equates to one cycle on any of the ports.
+    # So this is about as good as it is going to get.
+    return float(len(ports))
+
+
+def extract_model(mapping, arch, template_model, asmbench):
+    try:
+        isa = MachineModel.get_isa_for_arch(arch)
+    except ValueError:
+        print("Skipping...", file=sys.stderr)
+        return None
+    if template_model is None:
+        mm = MachineModel(isa=isa)
+    else:
+        mm = template_model
+
+    for port in mapping["arch"]["ports"]:
+        mm.add_port(port)
+
+    for insn in mapping["arch"]["insns"]:
+        try:
+            ports = mapping["assignment"][insn]
+
+            # Parse instruction
+            insn_split = insn.split("_")
+            name = insn_split[1]
+            insn_parts = list(("_" + "_".join(insn_split[2:])).split(","))
+            operands = []
+            state = {}
+            for operand in insn_parts:
+                parsed = operand_parse(operand, state)
+                if parsed is not None:
+                    operands.append(parsed)
+
+            # Port pressures from mapping
+            port_pressure = port_convert(ports)
+
+            # Initial guessed throughput and latency
+            throughput = throughput_guess(ports)
+            latency = latency_guess(ports)
+
+            # Benchmark with asmbench
+            # print(build_bench_instruction(name, operands))
+            if asmbench:
+                bench_latency, bench_throughput = bench_instruction(name, operands)
+                if bench_throughput is not None:
+                    throughput = round_cycles(bench_throughput)
+                else:
+                    print("Failed to measure throughput for instruction {}.".format(insn))
+                if bench_latency is not None:
+                    latency = round_cycles(bench_latency)
+                else:
+                    print("Failed to measure latency for instruction {}.".format(insn))
+
+            # No u-ops data available
+            uops = None
+
+            # Insert instruction if not already found (can happen with template)
+            if mm.get_instruction(name, operands) is None:
+                mm.set_instruction(name, operands, latency, port_pressure, throughput, uops)
+        except ValueError as e:
+            print("Failed to parse instruction {}: {}.".format(insn, e))
+
+    return mm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("json", help="path of mapping.json")
+    parser.add_argument("yaml", help="path of template.yml", nargs="?")
+    parser.add_argument(
+        "--asmbench", help="Benchmark latency and throughput using asmbench.", action="store_true"
+    )
+    args = parser.parse_args()
+
+    json_file = open(args.json, "r")
+    mapping = json.load(json_file)
+    arch = mapping["arch"]["name"].lower()
+    json_file.close()
+
+    template_model = None
+    if args.yaml is not None:
+        template_model = MachineModel(path_to_yaml=args.yaml)
+
+    if args.asmbench:
+        bench.setup_llvm()
+
+    model = extract_model(mapping, arch, template_model, args.asmbench)
+
+    with open("{}.yml".format(arch.lower()), "w") as f:
+        f.write(model.dump())
+
+
+if __name__ == "__main__":
+    main()
--- a/osaca/frontend.py
+++ b/osaca/frontend.py
@@ -234,7 +234,7 @@ class Frontend(object):
        separator += "--" + len(str(kernel[-1]["line_number"])) * "-"
        col_sep = "|"
        # for LCD/CP column
-        separator += "-" * (2 * 6 + len(col_sep)) + "-" * len(col_sep)
+        separator += "-" * (2 * 6 + len(col_sep)) + "-" * len(col_sep) + "--"
        sep_list = self._get_separator_list(col_sep)
        headline = "Port pressure in cycles"
        headline_str = "{{:^{}}}".format(len(separator))
@@ -249,17 +249,14 @@ class Frontend(object):
                instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"]
            }

-        s += headline_str.format(headline) + "\n"
-        s += (
-            (
-                lineno_filler
-                + self._get_port_number_line(port_len, separator=col_sep)
-                + "{}{:^6}{}{:^6}{}".format(col_sep, "CP", col_sep, "LCD", col_sep)
-            )
-            + "\n"
-            + separator
-            + "\n"
+        port_line = (
+            lineno_filler
+            + self._get_port_number_line(port_len, separator=col_sep)
+            + "{}{:^6}{}{:^6}{}".format(col_sep, "CP", col_sep, "LCD", col_sep)
        )
+        separator = "-" * len(port_line)
+        s += headline_str.format(headline) + "\n"
+        s += port_line + "\n" + separator + "\n"
        for instruction_form in kernel:
            if show_cmnts is False and self._is_comment(instruction_form):
                continue
@@ -300,7 +297,7 @@ class Frontend(object):
            s += (
                lineno_filler
                + self._get_port_pressure(tp_sum, port_len, separator=" ")
-                + " {:^6} {:^6}\n".format(cp_sum, lcd_sum)
+                + " {:>5}  {:>5}  \n".format(cp_sum, lcd_sum)
            )
        return s

--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -32,6 +32,7 @@ SUPPORTED_ARCHS = [
    "TX2",
    "N1",
    "A64FX",
+    "A72",
 ]
 DEFAULT_ARCHS = {
    "aarch64": "A64FX",
@@ -95,7 +96,7 @@ def create_parser(parser=None):
        "--arch",
        type=str,
        help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, "
-        "A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
+        "A64FX, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
    )
    parser.add_argument(
        "--fixed",
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -266,11 +266,13 @@ class MachineModel(object):
        """Return ISA for given micro-arch ``arch``."""
        arch_dict = {
            "a64fx": "aarch64",
+            "a72": "aarch64",
            "tx2": "aarch64",
            "n1": "aarch64",
            "zen1": "x86",
            "zen+": "x86",
            "zen2": "x86",
+            "icl": "x86",
            "con": "x86",  # Intel Conroe
            "wol": "x86",  # Intel Wolfdale
            "snb": "x86",
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -215,6 +215,7 @@ class KernelDG(nx.DiGraph):
        max_latency_instr = max(self.kernel, key=lambda k: k["latency"])
        if nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
            longest_path = nx.algorithms.dag.dag_longest_path(self.dg, weight="latency")
+            # TODO verify that we can remove the next two lines due to earlier initialization
            for line_number in longest_path:
                self._get_node_by_lineno(int(line_number))["latency_cp"] = 0
            # set cp latency to instruction
@@ -223,6 +224,9 @@ class KernelDG(nx.DiGraph):
                node = self._get_node_by_lineno(int(s))
                node["latency_cp"] = self.dg.edges[(s, d)]["latency"]
                path_latency += node["latency_cp"]
+            # add latency for last instruction
+            node = self._get_node_by_lineno(longest_path[-1])
+            node["latency_cp"] = node["latency"]
            if max_latency_instr["latency"] > path_latency:
                max_latency_instr["latency_cp"] = float(max_latency_instr["latency"])
                return [max_latency_instr]
Author	SHA1	Message	Date
JanLJL	c97f93c39b	version bump	2021-11-04 14:56:23 +01:00
JanLJL	968c71b7b6	black formatting	2021-11-04 12:11:15 +01:00
JanLJL	df26edd075	Merge branch 'master' of github.com:RRZE-HPC/OSACA	2021-11-04 12:09:57 +01:00
JanLJL	a767b7f290	Closes #78 , closes #79 ; added unary/binary logical operators	2021-11-04 12:09:44 +01:00
JanLJL	ba45038ad7	add latency of last instruction in CP	2021-11-04 11:58:40 +01:00
JanLJL	72e85075c2	better output formatting	2021-11-04 11:55:48 +01:00
Jan	40839384ec	Merge pull request #60 from RRZE-HPC/a72 Add support for ARM Cortex-A72	2021-10-14 18:10:36 +02:00
JanLJL	ab615547e5	added Cortex A72 in README	2021-10-14 17:10:08 +02:00
JanLJL	9c16f8bc56	formatted	2021-10-14 10:59:55 +02:00
JanLJL	be891d45d4	formatted	2021-10-14 10:53:34 +02:00
JanLJL	5735291d27	Merge branch 'master' into a72	2021-10-14 10:37:05 +02:00
JanLJL	ab368cded1	unified format	2021-10-14 09:23:35 +02:00
Git out :V	12044e3ac4	Initial support for the Cortex-A72 (Raspberry Pi 4)	2020-12-16 18:49:16 +01:00
Git out :V	8454edef73	Data for creating A72 model Add PMEvo mapping from https://github.com/cdl-saarland/pmevo-artifact/blob/master/vm_setup/data/A72/mapping_pmevo.json together with a template file to allow generating an OSACA model for the A72.	2020-12-16 18:48:55 +01:00
Git out :V	9165306808	PMEvo port mapping to OSACA model converter script Tool for semi-automatically creating an OSACA model using a PMEvo port mapping, optionally using asmbench to measure latency and throughput, which otherwise are not available in the port mapping. This is only designed to handle AArch64 architectures, in particular the Cortex-A72, used on the Raspberry Pi 4. Usefulness for other models may be limited.	2020-12-16 18:47:49 +01:00