Compare commits

...

15 Commits

Author SHA1 Message Date
JanLJL
c97f93c39b version bump 2021-11-04 14:56:23 +01:00
JanLJL
968c71b7b6 black formatting 2021-11-04 12:11:15 +01:00
JanLJL
df26edd075 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2021-11-04 12:09:57 +01:00
JanLJL
a767b7f290 Closes #78, closes #79; added unary/binary logical operators 2021-11-04 12:09:44 +01:00
JanLJL
ba45038ad7 add latency of last instruction in CP 2021-11-04 11:58:40 +01:00
JanLJL
72e85075c2 better output formatting 2021-11-04 11:55:48 +01:00
Jan
40839384ec Merge pull request #60 from RRZE-HPC/a72
Add support for ARM Cortex-A72
2021-10-14 18:10:36 +02:00
JanLJL
ab615547e5 added Cortex A72 in README 2021-10-14 17:10:08 +02:00
JanLJL
9c16f8bc56 formatted 2021-10-14 10:59:55 +02:00
JanLJL
be891d45d4 formatted 2021-10-14 10:53:34 +02:00
JanLJL
5735291d27 Merge branch 'master' into a72 2021-10-14 10:37:05 +02:00
JanLJL
ab368cded1 unified format 2021-10-14 09:23:35 +02:00
Git out :V
12044e3ac4 Initial support for the Cortex-A72 (Raspberry Pi 4) 2020-12-16 18:49:16 +01:00
Git out :V
8454edef73 Data for creating A72 model
Add PMEvo mapping from
https://github.com/cdl-saarland/pmevo-artifact/blob/master/vm_setup/data/A72/mapping_pmevo.json
together with a template file to allow generating an OSACA model for the
A72.
2020-12-16 18:48:55 +01:00
Git out :V
9165306808 PMEvo port mapping to OSACA model converter script
Tool for semi-automatically creating an OSACA model using a PMEvo port
mapping, optionally using asmbench to measure latency and throughput,
which otherwise are not available in the port mapping.

This is only designed to handle AArch64 architectures, in particular the
Cortex-A72, used on the Raspberry Pi 4. Usefulness for other models may
be limited.
2020-12-16 18:47:49 +01:00
11 changed files with 5838 additions and 15 deletions

View File

@@ -95,7 +95,7 @@ The usage of OSACA can be listed as:
--arch ARCH
needs to be replaced with the target architecture abbreviation.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX`` and ``ICL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72 and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
--fixed
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.

View File

@@ -1,6 +1,6 @@
"""Open Source Architecture Code Analyzer"""
name = "osaca"
__version__ = "0.4.6"
__version__ = "0.4.7"
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__

4179
osaca/data/a72.yml Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

808
osaca/data/a72/template.yml Normal file
View File

@@ -0,0 +1,808 @@
osaca_version: 0.3.11
micro_architecture: Cortex A-72
arch_code: a72
isa: aarch64
hidden_loads: false
ports: ['0', '1', '2', '3', '4', '5', '6', '7']
port_model_scheme: |
+-------------------------------------------------------------------------------------+
| scheduler |
+-------------------------------------------------------------------------------------+
0 |I 1 |L 2 |M 3 |S 4 |F1 5 |I 6 |F0 7 |B
\/ \/ \/ \/ \/ \/ \/ \/
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
|INT ALU| | LOAD | | MUL | | STORE | | ASIMD | |INT ALU| | ASIMD | | Branch|
+-------+ +-------+ +-------+ +-------+ +-----------+ +-------+ +---------+ +-------+
+-------+ +-------+ +-----------+ +-------+ +---------+
| AGU | | DIV | | FP ALU | | AGU | |ASIMD MUL|
+-------+ +-------+ +-----------+ +-------+ +---------+
+-------+ +-----------+ +---------+
| SHIFT | | FP MUL | | FP ALU |
+-------+ +-----------+ +---------+
+-------+ +-----------+ +---------+
| CRC | | FP DIV | | FP MUL |
+-------+ +-----------+ +---------+
+-------+ +-----------+ +---------+
| USAD | | FP SQRT | | FP DIV |
+-------+ +-----------+ +---------+
+-----------+ +---------+
|ASIMD SHIFT| | FP CONV |
+-----------+ +---------+
+---------+
| CRYPTO |
+---------+
# The port pressures do not always correctly match this schema, because most
# instructions are imported from an experimentally determined mapping, which
# is not always correct.
load_latency: {x: 4.0, s: 5.0, d: 5.0, h: 6.0, q: 6.0}
load_throughput: []
load_throughput_default: [[1, '1']]
store_throughput: []
store_throughput_default: [[2, '3']]
instruction_forms:
# Branch
- name: b
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: bne
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: b.ne
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: br
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: ret
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '7']]
throughput: 1.0
- name: bl
operands:
- class: identifier
latency: 1.0
port_pressure: [[1, '05'], [1, '7']]
throughput: 1.0
- name: blr
operands:
- class: register
prefix: x
latency: 1.0
port_pressure: [[1, '05'], [1, '7']]
throughput: 1.0
# Load GPR
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 5.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Load FP d
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 5.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 5.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
# Load FP q
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: false
pre-indexed: false
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: true
pre-indexed: false
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: false
pre-indexed: true
latency: 5.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 6.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 6.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
- name: ldr
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 6.0
port_pressure: [[1, '1'], [2, '05']]
throughput: 1.0
# Store GPR
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 1.0
port_pressure: [[1, '3']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Store FP d
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
- name: str
operands:
- class: register
prefix: d
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 1.0
port_pressure: [[1, '3'], [1, '05']]
throughput: 1.0
# Store FP q
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: false
pre-indexed: false
latency: 4.0
port_pressure: [[2, '3']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: true
pre-indexed: false
latency: 4.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: 1
post-indexed: false
pre-indexed: true
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 4.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 4.0
port_pressure: [[2, '3'], [2, '05']]
throughput: 2.0
- name: str
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 4.0
port_pressure: [[2, '3'], [2, '05']]
throughput: 2.0
# Load unscaled GPR
- name: ldur
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: '*'
pre-indexed: '*'
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
# Load unscaled FP q
- name: ldur
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: '*'
pre-indexed: '*'
latency: 5.0
port_pressure: [[1, '1']]
throughput: 1.0
# Store unscaled GPR
- name: stur
operands:
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: '*'
pre-indexed: '*'
latency: 1.0
port_pressure: [[1, '3']]
throughput: 1.0
# Store unscaled FP q
- name: stur
operands:
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: '*'
pre-indexed: '*'
latency: 2.0
port_pressure: [[2, '3']]
throughput: 2.0
# Load pair GPR
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 4.0
port_pressure: [[1, '1']]
throughput: 1.0
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 4.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
- name: ldp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 4.0
port_pressure: [[1, '1'], [1, '05']]
throughput: 1.0
# Load pair FP q
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 6.0
port_pressure: [[2, '1']]
throughput: 2.0
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 6.0
port_pressure: [[2, '1'], [1, '05']]
throughput: 2.0
- name: ldp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 6.0
port_pressure: [[2, '1'], [1, '05']]
throughput: 2.0
# Store pair GPR
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 2.0
port_pressure: [[2, '3']]
throughput: 2.0
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
- name: stp
operands:
- class: register
prefix: x
- class: register
prefix: x
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 2.0
port_pressure: [[2, '3'], [1, '05']]
throughput: 2.0
# Store pair FP q
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: true
pre-indexed: false
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
- name: stp
operands:
- class: register
prefix: q
- class: register
prefix: q
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: true
latency: 4.0
port_pressure: [[4, '3'], [1, '05']]
throughput: 4.0
# Fast-forward (measures 4 cycles, but can be 3)
# Lower bound is used in order to ensure no over-estimates are possible.
# Ports do not match documentation, but "fixing" requires also "fixing" almost
# the entire rest of the model.
- name: fadd
operands:
- class: register
prefix: s
- class: register
prefix: s
- class: register
prefix: s
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fadd
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fadd
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fadd
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fsub
operands:
- class: register
prefix: s
- class: register
prefix: s
- class: register
prefix: s
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fsub
operands:
- class: register
prefix: d
- class: register
prefix: d
- class: register
prefix: d
latency: 3.0
port_pressure: [[1, '45']]
throughput: 0.5
- name: fsub
operands:
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
- class: register
prefix: v
shape: s
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
- name: fsub
operands:
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
- class: register
prefix: v
shape: d
latency: 3.0
port_pressure: [[1, '5']]
throughput: 1.0
# Automatically generated instructions

View File

@@ -212,6 +212,67 @@ instruction_forms:
name: "xmm"
source: true
destination: true
- name: not
operands:
- class: "register"
name: "gpr"
source: true
destination: true
- name: not
operands:
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: true
- name: or
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: true
- name: or
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: true
- name: or
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: true
- name: or
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: true
- name: and
operands:
- class: "immediate"
@@ -4085,6 +4146,55 @@ instruction_forms:
name: "*"
source: false
destination: true
- name: xor
operands:
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: false
- name: xor
operands:
- class: "register"
name: "gpr"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: true
- name: xor
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "memory"
base: "*"
offset: "*"
index: "*"
scale: "*"
source: true
destination: true
- name: xor
operands:
- class: "immediate"
imd: "int"
source: true
destination: false
- class: "register"
name: "gpr"
source: true
destination: false
- name: xor
breaks_dependency_on_equal_operands: true
operands:

321
osaca/data/pmevo_importer.py Executable file
View File

@@ -0,0 +1,321 @@
#!/usr/bin/env python3
import argparse
import json
import math
import re
import sys
from asmbench import bench, op
from osaca.semantics import MachineModel
def build_bench_instruction(name, operands):
    """Convert an OSACA model instruction into an asmbench instruction string.

    Every operand is rendered as ``{direction:type:constraint}``, optionally
    followed by a shift suffix. The first operand is emitted as the
    destination (``dst``), all following ones as sources (``src``).

    :param name: instruction mnemonic, e.g. ``"fadd"``
    :param operands: list of OSACA operand dicts (keys such as ``class``,
        ``prefix``, ``shape``, ``shift_op``, ``shift``)
    :returns: the asmbench instruction string, or ``None`` if an operand cannot
        be expressed (e.g. memory operands, unsupported register prefixes or
        unsupported vector shapes)
    """
    # (LLVM type, inline-asm constraint) per scalar register prefix.
    scalar_registers = {
        "x": ("i64", "r"),
        "s": ("float", "w"),
        "d": ("double", "w"),
    }
    # LLVM vector type per vector register shape.
    vector_shapes = {
        "b": "<16 x i8>",
        "h": "<8 x i16>",
        "s": "<4 x float>",
        "d": "<2 x double>",
    }
    # Different instructions have different ranges for literals,
    # so we need to pick something "reasonable" for each.
    zero_literal = (
        "cmeq",
        "cmge",
        "cmgt",
        "cmle",
        "cmlt",
        "fcmeq",
        "fcmge",
        "fcmgt",
        "fcmle",
        "fcmlt",
        "fcmp",
    )
    bitmask_literal = ("and", "ands", "eor", "eors", "orr", "orrs")
    small_literal = ("bfi", "extr", "sbfiz", "sbfx", "shl", "sshr", "ubfiz", "ubfx", "ushr")

    rendered = []
    for operand in operands:
        operand_class = operand["class"]
        if operand_class in ("register", "register_shift"):
            prefix = operand["prefix"]
            if prefix in scalar_registers:
                shape, constraint = scalar_registers[prefix]
            elif prefix == "v":
                constraint = "w"
                shape = vector_shapes.get(operand["shape"])
                if shape is None:
                    return None
            else:
                return None
        elif operand_class in ("immediate", "immediate_shift"):
            shape = "i32"
            if name in zero_literal:
                constraint = "0"
            elif name in bitmask_literal:
                constraint = "255"
            elif name in small_literal:
                constraint = "7"
            else:
                constraint = "42"
        else:
            return None

        # The shift suffix is rebuilt for every operand so that a shift on one
        # operand never leaks onto the operands that follow it.
        shift = ""
        if operand_class in ("register_shift", "immediate_shift"):
            shift = ", {}".format(operand["shift_op"])
            # The shift amount is optional and may be absent altogether.
            if operand.get("shift") is not None:
                shift += " {}".format(operand["shift"])

        direction = "src" if rendered else "dst"
        rendered.append("{{{}:{}:{}}}{}".format(direction, shape, constraint, shift))

    if not rendered:
        return name
    return "{} {}".format(name, ", ".join(rendered))
def bench_instruction(name, operands):
    """Benchmark an OSACA model instruction with asmbench.

    The instruction is first translated with ``build_bench_instruction``. When
    that translation is not possible, ``(None, None)`` is returned; otherwise
    the result of ``bench.bench_instructions`` for the single parsed
    instruction is passed through (the caller in this file unpacks it as
    ``(latency, throughput)``).

    :param name: instruction mnemonic
    :param operands: list of OSACA operand dicts
    """
    instruction_string = build_bench_instruction(name, operands)
    if instruction_string is not None:
        parsed_instruction = op.Instruction.from_string(instruction_string)
        return bench.bench_instructions([parsed_instruction])
    # Translation failed, so there is nothing to measure.
    return (None, None)
def round_cycles(value):
    """Round a measured cycle count to a value suitable for the model.

    Anything below 0.9 becomes 0.5: half-cycle values are frequently found and
    measurements over-estimate a lot in that range, hence the high bound.
    Everything else is rounded down, because measurements usually
    over-estimate, but values up to 0.1 below the next integer still round up.

    :param value: measured number of cycles
    :returns: rounded number of cycles as ``float``
    """
    return 0.5 if value < 0.9 else float(math.floor(value + 0.1))
def operand_parse(op, state):
    """Parse one operand of a PMEvo instruction into an OSACA model operand.

    ``state`` is a dict shared between all operands of one instruction and is
    updated in place:

    * ``state["type"]`` -- ``"float"``/``"double"``, set by floating point and
      ``4s``/``2d`` vector registers and used for later float immediates
    * ``state["memory_base"]`` -- base register prefix, set by a ``MEM``
      operand and used by the following ``MIMM`` operand
    * ``state["previous"]`` -- the operand dict returned last

    :param op: operand string, e.g. ``"_((REG:W:G:64))"``
    :param state: mutable per-instruction parser state
    :returns: the OSACA operand dict, or ``None`` for a ``MEM`` operand, which
        only records the memory base in ``state``
    :raises ValueError: if the operand is unknown, unsupported or malformed.
        This includes operands that need state which no earlier operand
        provided, so that callers catching ``ValueError`` can skip the
        instruction instead of aborting on a ``KeyError``/``IndexError``.
    """
    parameter = {}
    if op.startswith("_((REG:"):
        parts = op.split(".")
        # Strip the leading "_((REG:" and the trailing "))".
        _access, register_type, bits = parts[0][7:-2].split(":")
        parameter["class"] = "register"
        if register_type == "G":
            if bits == "32":
                parameter["prefix"] = "r"
            elif bits == "64":
                parameter["prefix"] = "x"
            else:
                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
        elif register_type == "F":
            if bits == "32":
                parameter["prefix"] = "s"
                state["type"] = "float"
            elif bits == "64":
                parameter["prefix"] = "d"
                state["type"] = "double"
            elif bits == "128":
                parameter["prefix"] = "q"
            elif bits == "VEC":
                # The vector arrangement follows the register after a ".".
                if len(parts) < 2:
                    raise ValueError("Missing vector shape in {}".format(op))
                vec_shape = parts[1]
                parameter["prefix"] = "v"
                if vec_shape == "16b":
                    parameter["shape"] = "b"
                elif vec_shape == "8h":
                    parameter["shape"] = "h"
                elif vec_shape == "4s":
                    parameter["shape"] = "s"
                    state["type"] = "float"
                elif vec_shape == "2d":
                    parameter["shape"] = "d"
                    state["type"] = "double"
                else:
                    raise ValueError("Invalid vector shape {}".format(vec_shape))
            else:
                raise ValueError("Invalid register bits for {} {}".format(register_type, bits))
        else:
            raise ValueError("Unknown register type {}".format(register_type))
    elif op.startswith("_[((MEM:"):
        bits = op[8:-2].split(":")[0]
        if bits == "64":
            state["memory_base"] = "x"
        else:
            raise ValueError("Invalid register bits for MEM {}".format(bits))
        # The memory operand itself is emitted by the following MIMM operand.
        return None
    elif op.startswith("_((MIMM:"):
        bits = op[8:-3].split(":")[0]
        if bits != "16":
            raise ValueError("Invalid register bits for MEM {}".format(bits))
        if "memory_base" not in state:
            raise ValueError("Memory offset without preceding memory base: {}".format(op))
        parameter["class"] = "memory"
        parameter["base"] = state["memory_base"]
        parameter["offset"] = "imd"
        parameter["index"] = "*"
        parameter["scale"] = "*"
        parameter["post-indexed"] = False
        parameter["pre-indexed"] = False
    elif re.fullmatch("_#?-?(0x)?[0-9a-f]+", op):
        parameter["class"] = "immediate"
        parameter["imd"] = "int"
    elif re.fullmatch("_#?-?[0-9]*\\.[0-9]*", op):
        # The type of a float literal is taken from the registers seen so far.
        if "type" not in state:
            raise ValueError("Cannot determine type of floating point immediate {}".format(op))
        parameter["class"] = "immediate"
        parameter["imd"] = state["type"]
    elif re.fullmatch("_((sxt|uxt)[bhw]|lsl|lsr|asr|rol|ror)(_[0-9]+)?", op):
        # Shift/extend modifiers are not supported yet. Supporting them would
        # mean turning state["previous"] into a "<class>_shift" operand that
        # carries "shift_op" and, if given, the "shift" amount.
        raise ValueError("Skipping instruction with shift operand: {}".format(op))
    else:
        raise ValueError("Unknown operand {}".format(op))
    state["previous"] = parameter
    return parameter
def port_convert(ports):
    """Convert a PMEvo port assignment into OSACA's port pressure format.

    Each entry of ``ports`` is joined into one string of candidate ports.
    Consecutive entries with the same candidates are merged into a single
    ``[cycles, ports]`` pair.

    FIXME: This does not handle having more than 10 ports, because the port
    names are simply concatenated.

    :param ports: iterable of iterables of port name strings
    :returns: list of ``[cycles, ports]`` lists
    """
    pressures = []
    for entry in ports:
        port_group = "".join(entry)
        if pressures and pressures[-1][1] == port_group:
            # Same candidates as the run before: one more cycle on them.
            pressures[-1][0] += 1
        else:
            pressures.append([1, port_group])
    return pressures
def throughput_guess(ports):
    """Guess the reciprocal throughput from a PMEvo port assignment.

    The entry with the fewest candidate ports is taken as the bottleneck; it
    should determine the throughput to some degree of accuracy.
    (THIS IS *NOT* ALWAYS TRUE!)

    :param ports: sequence of sequences of port names, one per cycle
    :returns: number of entries divided by the smallest candidate count
    """
    narrowest = min(len(candidates) for candidates in ports)
    return float(len(ports)) / narrowest
def latency_guess(ports):
    """Guess the latency from a PMEvo port assignment.

    Each entry in ``ports`` equates to one cycle on any of its ports, so the
    number of entries is about as good a guess as the mapping allows.

    :param ports: sequence of per-cycle port candidates
    :returns: number of entries as ``float``
    """
    cycle_count = len(ports)
    return float(cycle_count)
def extract_model(mapping, arch, template_model, asmbench):
    """Build an OSACA machine model from a PMEvo port mapping.

    :param mapping: parsed PMEvo mapping; ``mapping["arch"]["ports"]``,
        ``mapping["arch"]["insns"]`` and ``mapping["assignment"]`` are read
    :param arch: architecture name handed to ``MachineModel.get_isa_for_arch``
    :param template_model: model to extend, or ``None`` to start from a new
        ``MachineModel`` for the detected ISA
    :param asmbench: if true, measure latency and throughput with asmbench
        instead of relying on the guesses derived from the mapping
    :returns: the resulting model, or ``None`` if ``get_isa_for_arch`` rejects
        ``arch`` with a ``ValueError``
    """
    try:
        isa = MachineModel.get_isa_for_arch(arch)
    except ValueError:
        print("Skipping...", file=sys.stderr)
        return None

    mm = template_model if template_model is not None else MachineModel(isa=isa)

    arch_info = mapping["arch"]
    for port in arch_info["ports"]:
        mm.add_port(port)

    for insn in arch_info["insns"]:
        try:
            ports = mapping["assignment"][insn]

            # Parse instruction: "<prefix>_<name>_<operand>,_<operand>,..."
            pieces = insn.split("_", 2)
            name = pieces[1]
            operand_text = "_" + (pieces[2] if len(pieces) > 2 else "")
            state = {}
            operands = []
            for raw_operand in operand_text.split(","):
                parsed = operand_parse(raw_operand, state)
                if parsed is None:
                    # Operand only updated the parser state.
                    continue
                operands.append(parsed)

            # Port pressures from mapping
            port_pressure = port_convert(ports)

            # Initial guessed throughput and latency
            throughput = throughput_guess(ports)
            latency = latency_guess(ports)

            # Benchmark with asmbench; keep the guesses where measuring fails
            if asmbench:
                bench_latency, bench_throughput = bench_instruction(name, operands)
                if bench_throughput is None:
                    print("Failed to measure throughput for instruction {}.".format(insn))
                else:
                    throughput = round_cycles(bench_throughput)
                if bench_latency is None:
                    print("Failed to measure latency for instruction {}.".format(insn))
                else:
                    latency = round_cycles(bench_latency)

            # No u-ops data available
            uops = None

            # Insert instruction if not already found (can happen with template)
            if mm.get_instruction(name, operands) is None:
                mm.set_instruction(name, operands, latency, port_pressure, throughput, uops)
        except ValueError as e:
            print("Failed to parse instruction {}: {}.".format(insn, e))
    return mm
def main():
    """Command line entry point: convert a PMEvo mapping to an OSACA model.

    Reads the PMEvo mapping from the ``json`` argument, optionally extends the
    template model given as ``yaml`` and writes the result to ``<arch>.yml`` in
    the current working directory, where ``<arch>`` is the lower-cased
    ``mapping["arch"]["name"]``. With ``--asmbench``, latency and throughput
    are measured with asmbench.

    Exits with status 1, without touching the output file, if no model could
    be built for the architecture named in the mapping.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("json", help="path of mapping.json")
    parser.add_argument("yaml", help="path of template.yml", nargs="?")
    parser.add_argument(
        "--asmbench", help="Benchmark latency and throughput using asmbench.", action="store_true"
    )
    args = parser.parse_args()

    # The context manager closes the file even if parsing the JSON fails.
    with open(args.json, "r") as json_file:
        mapping = json.load(json_file)
    arch = mapping["arch"]["name"].lower()

    template_model = None
    if args.yaml is not None:
        template_model = MachineModel(path_to_yaml=args.yaml)

    if args.asmbench:
        bench.setup_llvm()

    model = extract_model(mapping, arch, template_model, args.asmbench)
    if model is None:
        # extract_model returns None for architectures it cannot map to an ISA.
        # Stop here instead of creating an empty output file and crashing.
        print("Could not build a model for architecture {}.".format(arch), file=sys.stderr)
        sys.exit(1)

    with open("{}.yml".format(arch), "w") as f:
        f.write(model.dump())


if __name__ == "__main__":
    main()

View File

@@ -234,7 +234,7 @@ class Frontend(object):
separator += "--" + len(str(kernel[-1]["line_number"])) * "-"
col_sep = "|"
# for LCD/CP column
separator += "-" * (2 * 6 + len(col_sep)) + "-" * len(col_sep)
separator += "-" * (2 * 6 + len(col_sep)) + "-" * len(col_sep) + "--"
sep_list = self._get_separator_list(col_sep)
headline = "Port pressure in cycles"
headline_str = "{{:^{}}}".format(len(separator))
@@ -249,17 +249,14 @@ class Frontend(object):
instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"]
}
s += headline_str.format(headline) + "\n"
s += (
(
lineno_filler
+ self._get_port_number_line(port_len, separator=col_sep)
+ "{}{:^6}{}{:^6}{}".format(col_sep, "CP", col_sep, "LCD", col_sep)
)
+ "\n"
+ separator
+ "\n"
port_line = (
lineno_filler
+ self._get_port_number_line(port_len, separator=col_sep)
+ "{}{:^6}{}{:^6}{}".format(col_sep, "CP", col_sep, "LCD", col_sep)
)
separator = "-" * len(port_line)
s += headline_str.format(headline) + "\n"
s += port_line + "\n" + separator + "\n"
for instruction_form in kernel:
if show_cmnts is False and self._is_comment(instruction_form):
continue
@@ -300,7 +297,7 @@ class Frontend(object):
s += (
lineno_filler
+ self._get_port_pressure(tp_sum, port_len, separator=" ")
+ " {:^6} {:^6}\n".format(cp_sum, lcd_sum)
+ " {:>5} {:>5} \n".format(cp_sum, lcd_sum)
)
return s

View File

@@ -32,6 +32,7 @@ SUPPORTED_ARCHS = [
"TX2",
"N1",
"A64FX",
"A72",
]
DEFAULT_ARCHS = {
"aarch64": "A64FX",
@@ -95,7 +96,7 @@ def create_parser(parser=None):
"--arch",
type=str,
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, "
"A64FX). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
"A64FX, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
)
parser.add_argument(
"--fixed",

View File

@@ -266,11 +266,13 @@ class MachineModel(object):
"""Return ISA for given micro-arch ``arch``."""
arch_dict = {
"a64fx": "aarch64",
"a72": "aarch64",
"tx2": "aarch64",
"n1": "aarch64",
"zen1": "x86",
"zen+": "x86",
"zen2": "x86",
"icl": "x86",
"con": "x86", # Intel Conroe
"wol": "x86", # Intel Wolfdale
"snb": "x86",

View File

@@ -215,6 +215,7 @@ class KernelDG(nx.DiGraph):
max_latency_instr = max(self.kernel, key=lambda k: k["latency"])
if nx.algorithms.dag.is_directed_acyclic_graph(self.dg):
longest_path = nx.algorithms.dag.dag_longest_path(self.dg, weight="latency")
# TODO verify that we can remove the next two lines due to earlier initialization
for line_number in longest_path:
self._get_node_by_lineno(int(line_number))["latency_cp"] = 0
# set cp latency to instruction
@@ -223,6 +224,9 @@ class KernelDG(nx.DiGraph):
node = self._get_node_by_lineno(int(s))
node["latency_cp"] = self.dg.edges[(s, d)]["latency"]
path_latency += node["latency_cp"]
# add latency for last instruction
node = self._get_node_by_lineno(longest_path[-1])
node["latency_cp"] = node["latency"]
if max_latency_instr["latency"] > path_latency:
max_latency_instr["latency_cp"] = float(max_latency_instr["latency"])
return [max_latency_instr]