Compare commits

...

23 Commits

Author SHA1 Message Date
JanLJL
d61cb287ce version bump 2022-10-11 15:43:46 +02:00
JanLJL
5c21e18e36 more instructions 2022-10-11 15:43:19 +02:00
JanLJL
8807e3eda6 added few instrucitons 2022-10-07 00:51:09 +02:00
JanLJL
7c907e2432 bugfix in store throughput calculation 2022-09-28 14:21:46 +02:00
JanLJL
1ea1e68b4e updated due to master branch sunset 2022-09-28 11:00:39 +02:00
JanLJL
907f64d452 updated due to master branch sunset 2022-09-28 10:59:16 +02:00
JanLJL
24de7a762b version bump 2022-09-28 10:36:15 +02:00
JanLJL
87411ab822 updated CPU archs 2022-09-28 10:33:28 +02:00
JanLJL
2fa25e3099 formatting for flake8 2022-09-28 10:05:18 +02:00
JanLJL
0b440e4da9 updated and bugfixed DB 2022-09-28 10:01:26 +02:00
JanLJL
08e6a4be36 updated DB 2022-09-28 10:01:14 +02:00
JanLJL
7724ce27c7 added Zen3 support 2022-09-27 18:39:14 +02:00
JanLJL
4f8e37d9fd bugfixes and more features 2022-09-27 18:04:59 +02:00
JanLJL
d5f1654aa8 bugfixes 2022-09-27 18:04:33 +02:00
JanLJL
81f40604cb version bump 2022-09-08 10:17:41 +02:00
JanLJL
df747b8c48 more instruction forms 2022-09-07 12:48:39 +02:00
JanLJL
4e25a29a8a removed invalid char 2022-09-07 10:48:45 +02:00
JanLJL
016061f72c more instruction forms 2022-09-07 10:33:28 +02:00
JanLJL
ddff8c5012 added option of explicitly mentioning k regs in DB (not simply gpr) 2022-09-07 10:33:16 +02:00
JanLJL
2306cb58d0 added more instructions for ICX 2022-09-01 15:49:28 +02:00
JanLJL
660a9d0f41 Merge branch 'master' of github.com:RRZE-HPC/osaca 2022-08-31 14:20:07 +02:00
JanLJL
3b453de617 added AND instr 2022-08-31 14:19:23 +02:00
JanLJL
b93d911bb7 fix bug in port util 2022-08-31 14:17:52 +02:00
13 changed files with 9166 additions and 95 deletions

View File

@@ -34,7 +34,7 @@ jobs:
python setup.py build sdist bdist_wheel
- name: Publish to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
uses: pypa/gh-action-pypi-publish@release/v1
with:
skip_existing: true
user: __token__

View File

@@ -94,8 +94,8 @@ The usage of OSACA can be listed as:
shows the programs version number.
--arch ARCH
needs to be replaced with the target architecture abbreviation.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX`` and ``ICL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72 and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ICL`` (Client), and ``ICX`` (Server) for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2``, and ``ZEN3`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72, ``TSV110`` for the HiSilicon TaiShan v110, and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
--fixed
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.

View File

@@ -1,6 +1,6 @@
"""Open Source Architecture Code Analyzer"""
name = "osaca"
__version__ = "0.4.9"
__version__ = "0.4.12"
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__

View File

@@ -16,8 +16,11 @@ class EntryBuilder:
def classify(operands_types):
load = "mem" in operands_types[:-1]
store = "mem" in operands_types[-1:]
vec = False
if any([vecr in operands_types for vecr in ["mm", "xmm", "ymm", "zmm"]]):
vec = True
assert not (load and store), "Can not process a combined load-store instruction."
return load, store
return load, store, vec
def build_description(
self, instruction_name, operand_types, port_pressure=[], latency=0, comment=None
@@ -26,7 +29,7 @@ class EntryBuilder:
comment = " # " + comment
else:
comment = ""
description = "- name: {}{}\n operands:\n".format(instruction_name, comment)
description = "- name: {}{}\n operands: {}\n".format(instruction_name, comment, "[]" if len(operand_types) == 0 else "")
for ot in operand_types:
if ot == "imd":
@@ -64,14 +67,18 @@ class EntryBuilder:
def parse_port_pressure(self, port_pressure_str):
"""
Example:
1*p45+2*p0 -> [[1, '45'], [2, '0']]
1*p45+2*p0+2*p10,11 -> [[1, '45'], [2, '0'], [2, ['10', '11']]]
"""
port_pressure = []
if port_pressure_str:
for p in port_pressure_str.split("+"):
cycles, ports = p.split("*p")
if ports.startswith("(") and ports.endswith(")"):
ports = ports[1:-1].split(",")
ports = ports.split(",")
if len(ports) == 1:
ports = ports[0]
else:
ports = list(filter(lambda p: len(p) > 0, ports))
port_pressure.append([int(cycles), ports])
return port_pressure
@@ -92,23 +99,42 @@ class EntryBuilder:
return self.build_description(instruction_name, operand_types, port_pressure, latency)
class EntryBuilderIntelPort9(EntryBuilder):
# for ICX
class ArchEntryBuilder(EntryBuilder):
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
load, store = self.classify(operand_types)
# Intel ICX
# LD_pressure = [[1, "23"], [1, ["2D", "3D"]]]
# LD_pressure_vec = LD_pressure
# ST_pressure = [[1, "79"], [1, "48"]]
# ST_pressure_vec = ST_pressure
# LD_lat = 5
# ST_lat = 0
# Zen3
LD_pressure = [[1, ["11", "12", "13"]]]
LD_pressure_vec = [[1, ["11", "12"]]]
ST_pressure = [[1, ["12", "13"]]]
ST_pressure_vec = [[1, ["4"]], [1, ["13"]]]
LD_lat = 4
ST_lat = 0
load, store, vec = self.classify(operand_types)
if load:
port_pressure += [[1, "23"], [1, ["2D", "3D"]]]
latency += 5
if vec:
port_pressure += LD_pressure_vec
else:
port_pressure += LD_pressure
latency += LD_lat
comment = "with load"
return EntryBuilder.build_description(
self, instruction_name, operand_types, port_pressure, latency, comment
)
if store:
port_pressure = port_pressure + [[1, "79"], [1, "48"]]
if vec:
port_pressure = port_pressure + ST_pressure_vec
else:
port_pressure = port_pressure + ST_pressure
operands = ["mem" if o == "mem" else o for o in operand_types]
latency += 0
latency += ST_lat
return EntryBuilder.build_description(
self,
instruction_name,
@@ -125,7 +151,7 @@ class EntryBuilderIntelPort9(EntryBuilder):
def get_description(instruction_form, port_pressure, latency, rhs_comment=None):
entry = EntryBuilderIntelPort9().process_item(instruction_form, (port_pressure, latency))
entry = ArchEntryBuilder().process_item(instruction_form, (port_pressure, latency))
if rhs_comment is not None:
max_length = max([len(line) for line in entry.split("\n")])

View File

@@ -16,8 +16,11 @@ class MOVEntryBuilder:
def classify(operands_types):
load = "mem" in operands_types[:-1]
store = "mem" in operands_types[-1:]
vec = False
if any([vecr in operands_types for vecr in ["mm", "xmm", "ymm", "zmm"]]):
vec = True
assert not (load and store), "Can not process a combined load-store instruction."
return load, store
return load, store, vec
def build_description(
self, instruction_name, operand_types, port_pressure=[], latency=0, comment=None
@@ -65,6 +68,9 @@ class MOVEntryBuilder:
if port_pressure_str:
for p in port_pressure_str.split("+"):
cycles, ports = p.split("*p")
ports = ports.split(",")
if len(ports) == 1:
ports = ports[0]
port_pressure.append([int(cycles), ports])
return port_pressure
@@ -84,7 +90,7 @@ class MOVEntryBuilder:
class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
# for SNB and IVB
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
load, store = self.classify(operand_types)
load, store, vec = self.classify(operand_types)
comment = None
if load:
@@ -111,9 +117,8 @@ class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
class MOVEntryBuilderIntelPort9(MOVEntryBuilder):
# for ICX
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
load, store = self.classify(operand_types)
load, store, vec = self.classify(operand_types)
if load:
port_pressure += [[1, "23"], [1, ["2D", "3D"]]]
@@ -141,6 +146,380 @@ class MOVEntryBuilderIntelPort9(MOVEntryBuilder):
)
class MOVEntryBuilderAMDZen3(MOVEntryBuilder):
    """Build MOV-family database entries for AMD Zen 3.

    Forms with a memory operand get additional port pressure (and, for
    loads, additional latency) on top of the values given for the
    instruction; register-only forms are passed through unchanged.
    """

    # for Zen 3
    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """Return the entry for one instruction form, adding Zen 3 memory costs.

        :param instruction_name: mnemonic of the instruction
        :param operand_types: operand type names; ``classify`` treats ``"mem"``
            in the last position as a store and in any earlier position as a load
        :param port_pressure: list of ``[cycles, ports]`` items for the
            non-memory part of the instruction; it is never modified in place.
            ``None`` (the default) and an empty list both mean "no pressure".
        :param latency: latency of the non-memory part of the instruction
        :returns: the result of ``MOVEntryBuilder.build_description`` for the
            combined port pressure and latency
        """
        load, store, vec = self.classify(operand_types)
        # Work on a private copy: the original extended the argument in place,
        # which leaked the added entries into the caller's list and into the
        # shared default argument across calls.
        base_pressure = list(port_pressure) if port_pressure else []

        if load:
            # Vector loads are limited to two of the three ports used by
            # scalar loads.
            # NOTE(review): presumably "11"/"12"/"13" are the load/store
            # address ports of the zen3 machine model - confirm.
            if vec:
                memory_pressure = [[1, ["11", "12"]]]
            else:
                memory_pressure = [[1, ["11", "12", "13"]]]
            return MOVEntryBuilder.build_description(
                self,
                instruction_name,
                operand_types,
                base_pressure + memory_pressure,
                latency + 4,
                "with load",
            )

        if store:
            # Stores add port pressure only; no latency is added.
            if vec:
                memory_pressure = [[1, ["4"]], [1, ["13"]]]
            else:
                memory_pressure = [[1, ["12", "13"]]]
            return MOVEntryBuilder.build_description(
                self,
                instruction_name,
                list(operand_types),
                base_pressure + memory_pressure,
                latency,
                "with store",
            )

        # Register only:
        return MOVEntryBuilder.build_description(
            self, instruction_name, operand_types, base_pressure, latency
        )
#############################################################################
# Builder instance used by get_description to render the Zen 3 entries below.
z3 = MOVEntryBuilderAMDZen3()
# MOV-family instruction forms for AMD Zen 3.
# Each item is (instruction form, (port pressure string, latency)) and is
# unpacked into z3.process_item(...).
# - The instruction form is the mnemonic followed by its operand types; the
#   builder's classify() treats "mem" as the last operand as a store and "mem"
#   in any earlier position as a load.
# - The port pressure string uses the "<cycles>*p<ports>" notation, joined by
#   "+"; an empty string yields an empty port pressure list.
# - For forms with a memory operand the builder adds the load/store port
#   pressure (and load latency) itself, which is why most of them are ("", 0).
zen3_mov_instructions = [
# https://www.felixcloutier.com/x86/mov
("mov gpr gpr", ("1*p6789", 1)),
("mov gpr mem", ("", 0)),
("mov mem gpr", ("", 0)),
("mov imd gpr", ("1*p6789", 1)),
("mov imd mem", ("", 0)),
("movabs imd gpr", ("1*p6789", 1)), # AT&T version, port util to be verified
# https://www.felixcloutier.com/x86/movapd
("movapd xmm xmm", ("1*p0123", 1)),
("movapd xmm mem", ("", 0)),
("movapd mem xmm", ("", 0)),
("vmovapd xmm xmm", ("1*p0123", 1)),
("vmovapd xmm mem", ("", 0)),
("vmovapd mem xmm", ("", 0)),
("vmovapd ymm ymm", ("1*p0123", 1)),
("vmovapd ymm mem", ("", 0)),
("vmovapd mem ymm", ("", 0)),
# https://www.felixcloutier.com/x86/movaps
("movaps xmm xmm", ("1*p0123", 1)),
("movaps xmm mem", ("", 0)),
("movaps mem xmm", ("", 0)),
("vmovaps xmm xmm", ("1*p0123", 1)),
("vmovaps xmm mem", ("", 0)),
("vmovaps mem xmm", ("", 0)),
("vmovaps ymm ymm", ("1*p0123", 1)),
("vmovaps ymm mem", ("", 0)),
("vmovaps mem ymm", ("", 0)),
# https://www.felixcloutier.com/x86/movd:movq
("movd gpr mm", ("1*p0123", 1)),
("movd mem mm", ("", 0)),
("movq gpr mm", ("1*p0123", 1)),
("movq mem mm", ("", 0)),
("movd mm gpr", ("1*p0123", 1)),
("movd mm mem", ("", 0)),
("movq mm gpr", ("1*p0123", 1)),
("movq mm mem", ("", 0)),
("movd gpr xmm", ("1*p0123", 1)),
("movd mem xmm", ("", 0)),
("movq gpr xmm", ("1*p0123", 1)),
("movq mem xmm", ("", 0)),
("movd xmm gpr", ("1*p0123", 1)),
("movd xmm mem", ("", 0)),
("movq xmm gpr", ("1*p0123", 1)),
("movq xmm mem", ("", 0)),
("vmovd gpr xmm", ("1*p0123", 1)),
("vmovd mem xmm", ("", 0)),
("vmovq gpr xmm", ("1*p0123", 1)),
("vmovq mem xmm", ("", 0)),
("vmovd xmm gpr", ("1*p0123", 1)),
("vmovd xmm mem", ("", 0)),
("vmovq xmm gpr", ("1*p0123", 1)),
("vmovq xmm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movddup
("movddup xmm xmm", ("1*p12", 1)),
("movddup mem xmm", ("", 0)),
("vmovddup xmm xmm", ("1*p12", 1)),
("vmovddup mem xmm", ("", 0)),
("vmovddup ymm ymm", ("1*p12", 1)),
("vmovddup mem ymm", ("", 0)),
# https://www.felixcloutier.com/x86/movdq2q
("movdq2q xmm mm", ("1*p0123", 1)),
# https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
("movdqa xmm xmm", ("1*p0123", 1)),
("movdqa mem xmm", ("", 0)),
("movdqa xmm mem", ("", 0)),
("vmovdqa xmm xmm", ("1*p0123", 1)),
("vmovdqa mem xmm", ("", 0)),
("vmovdqa xmm mem", ("", 0)),
("vmovdqa ymm ymm", ("1*p0123", 1)),
("vmovdqa mem ymm", ("", 0)),
("vmovdqa ymm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
("movdqu xmm xmm", ("1*p0123", 1)),
("movdqu mem xmm", ("", 0)),
("movdqu xmm mem", ("", 0)),
("vmovdqu xmm xmm", ("1*p0123", 1)),
("vmovdqu mem xmm", ("", 0)),
("vmovdqu xmm mem", ("", 0)),
("vmovdqu ymm ymm", ("1*p0123", 1)),
("vmovdqu mem ymm", ("", 0)),
("vmovdqu ymm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movhlps
("movhlps xmm xmm", ("1*p12", 1)),
("vmovhlps xmm xmm xmm", ("1*p12", 1)),
# https://www.felixcloutier.com/x86/movhpd
# NOTE(review): in this and the movhps/movlpd/movlps sections the last entry
# is written "mem xmm" although it follows the "xmm mem" store form - possibly
# meant to be the VEX store form "xmm mem"; confirm.
("movhpd mem xmm", ("1*p12", 1)),
("vmovhpd mem xmm xmm", ("1*p12", 1)),
("movhpd xmm mem", ("", 0)),
("vmovhpd mem xmm", ("", 0)),
# https://www.felixcloutier.com/x86/movhps
("movhps mem xmm", ("1*p12", 1)),
("vmovhps mem xmm xmm", ("1*p12", 1)),
("movhps xmm mem", ("", 0)),
("vmovhps mem xmm", ("", 0)),
# https://www.felixcloutier.com/x86/movlhps
("movlhps xmm xmm", ("1*p12", 1)),
("vmovlhps xmm xmm xmm", ("1*p12", 1)),
# https://www.felixcloutier.com/x86/movlpd
("movlpd mem xmm", ("1*p12", 1)),
("vmovlpd mem xmm xmm", ("1*p12", 1)),
("movlpd xmm mem", ("1*p12", 0)),
("vmovlpd mem xmm", ("1*p12", 1)),
# https://www.felixcloutier.com/x86/movlps
("movlps mem xmm", ("1*p12", 1)),
("vmovlps mem xmm xmm", ("1*p12", 1)),
("movlps xmm mem", ("1*p12", 0)),
("vmovlps mem xmm", ("1*p12", 1)),
# https://www.felixcloutier.com/x86/movmskpd
("movmskpd xmm gpr", ("1*p0123", 1)),
("vmovmskpd xmm gpr", ("1*p0123", 1)),
("vmovmskpd ymm gpr", ("1*p0123", 1)),
# https://www.felixcloutier.com/x86/movmskps
("movmskps xmm gpr", ("1*p0123", 1)),
("vmovmskps xmm gpr", ("1*p0123", 1)),
("vmovmskps ymm gpr", ("1*p0123", 1)),
# https://www.felixcloutier.com/x86/movntdq
("movntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntdq ymm mem", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movntdqa
# NOTE(review): these three forms have "mem" as source, so the builder
# classifies them as loads although the TODO text says "NT-store".
("movntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
("vmovntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
("vmovntdqa mem ymm", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movnti
("movnti gpr mem", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movntpd
("movntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntpd ymm mem", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movntps
("movntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
("vmovntps ymm mem", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movntq
("movntq mm mem", ("", 0)), # TODO NT-store: what latency to use?
# https://www.felixcloutier.com/x86/movq
# NOTE(review): the mem forms in this section repeat entries already listed in
# the movd:movq section above; "movq mm mm" is a register-only form with no
# port pressure and zero latency - confirm that is intended.
("movq mm mm", ("", 0)),
("movq mem mm", ("", 0)),
("movq mm mem", ("", 0)),
("movq xmm xmm", ("1*p0123", 1)),
("movq mem xmm", ("", 0)),
("movq xmm mem", ("", 0)),
("vmovq xmm xmm", ("1*p0123", 1)),
("vmovq mem xmm", ("", 0)),
("vmovq xmm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
# TODO combined load-store is currently not supported
# ('movs mem mem', ()),
# https://www.felixcloutier.com/x86/movsd
("movsd xmm xmm", ("1*p0123", 1)),
("movsd mem xmm", ("", 0)),
("movsd xmm mem", ("", 0)),
("vmovsd xmm xmm xmm", ("1*p0123", 1)),
("vmovsd mem xmm", ("", 0)),
("vmovsd xmm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movshdup
("movshdup xmm xmm", ("1*p12", 1)),
("movshdup mem xmm", ("", 0)),
("vmovshdup xmm xmm", ("1*p12", 1)),
("vmovshdup mem xmm", ("", 0)),
("vmovshdup ymm ymm", ("1*p12", 1)),
("vmovshdup mem ymm", ("", 0)),
# https://www.felixcloutier.com/x86/movsldup
("movsldup xmm xmm", ("1*p12", 1)),
("movsldup mem xmm", ("", 0)),
("vmovsldup xmm xmm", ("1*p12", 1)),
("vmovsldup mem xmm", ("", 0)),
("vmovsldup ymm ymm", ("1*p12", 1)),
("vmovsldup mem ymm", ("", 0)),
# https://www.felixcloutier.com/x86/movss
# NOTE(review): "movss mem xmm" appears twice in this section and there is no
# "movss xmm mem" entry - the second occurrence may be meant as the store form.
("movss xmm xmm", ("1*p0123", 1)),
("movss mem xmm", ("", 0)),
("vmovss xmm xmm xmm", ("1*p0123", 1)),
("vmovss mem xmm", ("", 0)),
("vmovss xmm xmm", ("1*p0123", 1)),
("vmovss xmm mem", ("", 0)),
("movss mem xmm", ("", 0)),
# https://www.felixcloutier.com/x86/movsx:movsxd
# NOTE(review): "movsxd gpr gpr" has no port pressure and zero latency, unlike
# the other register-only sign-extension forms - confirm.
("movsx gpr gpr", ("1*p6789", 1)),
("movsx mem gpr", ("", 0)),
("movsxd gpr gpr", ("", 0)),
("movsxd mem gpr", ("", 0)),
("movsb gpr gpr", ("1*p6789", 1)), # AT&T version
("movsb mem gpr", ("", 0)), # AT&T version
("movsw gpr gpr", ("1*p6789", 1)), # AT&T version
("movsw mem gpr", ("", 0)), # AT&T version
("movsl gpr gpr", ("1*p6789", 1)), # AT&T version
("movsl mem gpr", ("", 0)), # AT&T version
("movsq gpr gpr", ("1*p6789", 1)), # AT&T version
("movsq mem gpr", ("", 0)), # AT&T version
# https://www.felixcloutier.com/x86/movupd
("movupd xmm xmm", ("1*p0123", 1)),
("movupd mem xmm", ("", 0)),
("movupd xmm mem", ("", 0)),
("vmovupd xmm xmm", ("1*p0123", 1)),
("vmovupd mem xmm", ("", 0)),
("vmovupd xmm mem", ("", 0)),
("vmovupd ymm ymm", ("1*p0123", 1)),
("vmovupd mem ymm", ("", 0)),
("vmovupd ymm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movups
("movups xmm xmm", ("1*p0123", 1)),
("movups mem xmm", ("", 0)),
("movups xmm mem", ("", 0)),
("vmovups xmm xmm", ("1*p0123", 1)),
("vmovups mem xmm", ("", 0)),
("vmovups xmm mem", ("", 0)),
("vmovups ymm ymm", ("1*p0123", 1)),
("vmovups mem ymm", ("", 0)),
("vmovups ymm mem", ("", 0)),
# https://www.felixcloutier.com/x86/movzx
("movzx gpr gpr", ("1*p6789", 1)),
("movzx mem gpr", ("", 0)),
("movzb gpr gpr", ("1*p6789", 1)), # AT&T version
("movzb mem gpr", ("", 0)), # AT&T version
("movzw gpr gpr", ("1*p6789", 1)), # AT&T version
("movzw mem gpr", ("", 0)), # AT&T version
("movzl gpr gpr", ("1*p6789", 1)), # AT&T version
("movzl mem gpr", ("", 0)), # AT&T version
("movzq gpr gpr", ("1*p6789", 1)), # AT&T version
("movzq mem gpr", ("", 0)), # AT&T version
# https://www.felixcloutier.com/x86/cmovcc
("cmova gpr gpr", ("1*p69", 1)),
("cmova mem gpr", ("", 0)),
("cmovae gpr gpr", ("1*p69", 1)),
("cmovae mem gpr", ("", 0)),
("cmovb gpr gpr", ("1*p69", 1)),
("cmovb mem gpr", ("", 0)),
("cmovbe gpr gpr", ("1*p69", 1)),
("cmovbe mem gpr", ("", 0)),
("cmovc gpr gpr", ("1*p69", 1)),
("cmovc mem gpr", ("", 0)),
("cmove gpr gpr", ("1*p69", 1)),
("cmove mem gpr", ("", 0)),
("cmovg gpr gpr", ("1*p69", 1)),
("cmovg mem gpr", ("", 0)),
("cmovge gpr gpr", ("1*p69", 1)),
("cmovge mem gpr", ("", 0)),
("cmovl gpr gpr", ("1*p69", 1)),
("cmovl mem gpr", ("", 0)),
("cmovle gpr gpr", ("1*p69", 1)),
("cmovle mem gpr", ("", 0)),
("cmovna gpr gpr", ("1*p69", 1)),
("cmovna mem gpr", ("", 0)),
("cmovnae gpr gpr", ("1*p69", 1)),
("cmovnae mem gpr", ("", 0)),
("cmovnb gpr gpr", ("1*p69", 1)),
("cmovnb mem gpr", ("", 0)),
("cmovnbe gpr gpr", ("1*p69", 1)),
("cmovnbe mem gpr", ("", 0)),
("cmovnc gpr gpr", ("1*p69", 1)),
("cmovnc mem gpr", ("", 0)),
("cmovne gpr gpr", ("1*p69", 1)),
("cmovne mem gpr", ("", 0)),
("cmovng gpr gpr", ("1*p69", 1)),
("cmovng mem gpr", ("", 0)),
("cmovnge gpr gpr", ("1*p69", 1)),
("cmovnge mem gpr", ("", 0)),
("cmovnl gpr gpr", ("1*p69", 1)),
("cmovnl mem gpr", ("", 0)),
("cmovno gpr gpr", ("1*p69", 1)),
("cmovno mem gpr", ("", 0)),
("cmovnp gpr gpr", ("1*p69", 1)),
("cmovnp mem gpr", ("", 0)),
("cmovns gpr gpr", ("1*p69", 1)),
("cmovns mem gpr", ("", 0)),
("cmovnz gpr gpr", ("1*p69", 1)),
("cmovnz mem gpr", ("", 0)),
("cmovo gpr gpr", ("1*p69", 1)),
("cmovo mem gpr", ("", 0)),
("cmovp gpr gpr", ("1*p69", 1)),
("cmovp mem gpr", ("", 0)),
("cmovpe gpr gpr", ("1*p69", 1)),
("cmovpe mem gpr", ("", 0)),
("cmovpo gpr gpr", ("1*p69", 1)),
("cmovpo mem gpr", ("", 0)),
("cmovs gpr gpr", ("1*p69", 1)),
("cmovs mem gpr", ("", 0)),
("cmovz gpr gpr", ("1*p69", 1)),
("cmovz mem gpr", ("", 0)),
# https://www.felixcloutier.com/x86/pmovmskb
("pmovmskb mm gpr", ("1*p0123", 1)),
("pmovmskb xmm gpr", ("1*p0123", 1)),
("vpmovmskb xmm gpr", ("1*p0123", 1)),
# https://www.felixcloutier.com/x86/pmovsx
("pmovsxbw xmm xmm", ("1*p12", 1)),
("pmovsxbw mem xmm", ("1*p12", 1)),
("pmovsxbd xmm xmm", ("1*p12", 1)),
("pmovsxbd mem xmm", ("1*p12", 1)),
("pmovsxbq xmm xmm", ("1*p12", 1)),
("pmovsxbq mem xmm", ("1*p12", 1)),
("vpmovsxbw xmm xmm", ("1*p12", 1)),
("vpmovsxbw mem xmm", ("1*p12", 1)),
("vpmovsxbd xmm xmm", ("1*p12", 1)),
("vpmovsxbd mem xmm", ("1*p12", 1)),
("vpmovsxbq xmm xmm", ("1*p12", 1)),
("vpmovsxbq mem xmm", ("1*p12", 1)),
("vpmovsxbw xmm ymm", ("1*p0123", 1)),
("vpmovsxbw mem ymm", ("1*p12", 1)),
("vpmovsxbd xmm ymm", ("1*p0123", 1)),
("vpmovsxbd mem ymm", ("1*p12", 1)),
("vpmovsxbq xmm ymm", ("1*p0123", 1)),
("vpmovsxbq mem ymm", ("1*p12", 1)),
# https://www.felixcloutier.com/x86/pmovzx
("pmovzxbw xmm xmm", ("1*p12", 1)),
("pmovzxbw mem xmm", ("1*p12", 1)),
("vpmovzxbw xmm xmm", ("1*p12", 1)),
("vpmovzxbw mem xmm", ("1*p12", 1)),
("vpmovzxbw xmm ymm", ("1*p0123", 1)),
("vpmovzxbw mem ymm", ("1*p12", 1)),
#################################################################
# https://www.felixcloutier.com/x86/movbe
("movbe gpr mem", ("1*p67", 5)),
("movbe mem gpr", ("1*p67", 5)),
################################################
# https://www.felixcloutier.com/x86/movq2dq
("movq2dq mm xmm", ("2*p0123", 1)),
]
p9 = MOVEntryBuilderIntelPort9()
icx_mov_instructions = [
@@ -517,11 +896,11 @@ icx_mov_instructions = [
("vpmovsxbd mem xmm", ("1*p15", 1)),
("vpmovsxbq xmm xmm", ("1*p15", 1)),
("vpmovsxbq mem xmm", ("1*p15", 1)),
("vpmovsxbw ymm ymm", ("1*p5", 1)),
("vpmovsxbw xmm ymm", ("1*p5", 1)),
("vpmovsxbw mem ymm", ("1*p5", 1)),
("vpmovsxbd ymm ymm", ("1*p5", 1)),
("vpmovsxbd xmm ymm", ("1*p5", 1)),
("vpmovsxbd mem ymm", ("1*p5", 1)),
("vpmovsxbq ymm ymm", ("1*p5", 1)),
("vpmovsxbq xmm ymm", ("1*p5", 1)),
("vpmovsxbq mem ymm", ("1*p5", 1)),
("vpmovsxbw ymm zmm", ("1*p5", 3)),
("vpmovsxbw mem zmm", ("1*p5", 1)),
@@ -572,7 +951,7 @@ class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
# for HSW, BDW, SKX and CSX
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
load, store = self.classify(operand_types)
load, store, vec = self.classify(operand_types)
if load:
port_pressure += [[1, "23"], [1, ["2D", "3D"]]]
@@ -794,7 +1173,6 @@ snb_mov_instructions = [
("movss mem xmm", ("", 0)),
("vmovss xmm xmm xmm", ("1*p5", 1)),
("vmovss mem xmm", ("", 0)),
("vmovss xmm xmm", ("1*p5", 1)),
("vmovss xmm mem", ("", 0)),
("movss mem xmm", ("", 0)),
# https://www.felixcloutier.com/x86/movsx:movsxd
@@ -919,11 +1297,11 @@ snb_mov_instructions = [
("vpmovsxbd mem xmm", ("1*p15", 1)),
("vpmovsxbq xmm xmm", ("1*p15", 1)),
("vpmovsxbq mem xmm", ("1*p15", 1)),
("vpmovsxbw ymm ymm", ("1*p15", 1)),
("vpmovsxbw xmm ymm", ("1*p15", 1)),
("vpmovsxbw mem ymm", ("1*p15", 1)),
("vpmovsxbd ymm ymm", ("1*p15", 1)),
("vpmovsxbd xmm ymm", ("1*p15", 1)),
("vpmovsxbd mem ymm", ("1*p15", 1)),
("vpmovsxbq ymm ymm", ("1*p15", 1)),
("vpmovsxbq xmm ymm", ("1*p15", 1)),
("vpmovsxbq mem ymm", ("1*p15", 1)),
# https://www.felixcloutier.com/x86/pmovzx
("pmovzxbw xmm xmm", ("1*p15", 1)),
@@ -1079,11 +1457,11 @@ hsw_mov_instructions = list(
("vpmovsxbd mem xmm", ("1*p5", 1)),
("vpmovsxbq xmm xmm", ("1*p5", 1)),
("vpmovsxbq mem xmm", ("1*p5", 1)),
("vpmovsxbw ymm ymm", ("1*p5", 1)),
("vpmovsxbw xmm ymm", ("1*p5", 1)),
("vpmovsxbw mem ymm", ("1*p5", 1)),
("vpmovsxbd ymm ymm", ("1*p5", 1)),
("vpmovsxbd xmm ymm", ("1*p5", 1)),
("vpmovsxbd mem ymm", ("1*p5", 1)),
("vpmovsxbq ymm ymm", ("1*p5", 1)),
("vpmovsxbq xmm ymm", ("1*p5", 1)),
("vpmovsxbq mem ymm", ("1*p5", 1)),
# https://www.felixcloutier.com/x86/pmovzx
("pmovzxbw xmm xmm", ("1*p5", 1)),
@@ -1234,6 +1612,7 @@ def get_description(arch, rhs_comment=None):
"skx": "\n".join([p7.process_item(*item) for item in skx_mov_instructions]),
"csx": "\n".join([p7.process_item(*item) for item in csx_mov_instructions]),
"icx": "\n".join([p9.process_item(*item) for item in icx_mov_instructions]),
"zen3": "\n".join([z3.process_item(*item) for item in zen3_mov_instructions]),
}
description = descriptions[arch]
@@ -1255,7 +1634,7 @@ if __name__ == "__main__":
import sys
if len(sys.argv) != 2:
print("Usage: {} (snb|ivb|hsw|bdw|skx|csx|icx)".format(sys.argv[0]))
print("Usage: {} (snb|ivb|hsw|bdw|skx|csx|icx|zen3)".format(sys.argv[0]))
sys.exit(0)
try:

File diff suppressed because it is too large Load Diff

View File

@@ -3424,6 +3424,12 @@ instruction_forms:
name: "ID"
source: true
destination: false
- name: [sar, sal, shl, shr]
operands:
- class: "register"
name: "gpr"
source: true
destination: true
- name: sbb
operands:
- class: "register"

5194
osaca/data/zen3.yml Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -30,6 +30,7 @@ SUPPORTED_ARCHS = [
"ICX",
"ZEN1",
"ZEN2",
"ZEN3",
"TX2",
"N1",
"A64FX",
@@ -97,7 +98,7 @@ def create_parser(parser=None):
parser.add_argument(
"--arch",
type=str,
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, TX2, N1, "
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, ZEN3, TX2, N1, "
"A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
)
parser.add_argument(
@@ -329,6 +330,7 @@ def inspect(args, output_file=sys.stdout):
# Do optimal schedule for kernel throughput if wished
if not args.fixed:
semantics.assign_optimal_throughput(kernel)
semantics.assign_optimal_throughput(kernel)
# Create DiGrahps
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)

View File

@@ -1,9 +1,11 @@
#!/usr/bin/env python3
"""Semantics object responsible for architecture specific semantic operations"""
import sys
import warnings
from itertools import chain
from operator import itemgetter
from copy import deepcopy
from .hw_model import MachineModel
from .isa_semantics import INSTR_FLAGS, ISASemantics
@@ -31,7 +33,7 @@ class ArchSemantics(ISASemantics):
if self._machine_model.has_hidden_loads():
self.set_hidden_loads(kernel)
def assign_optimal_throughput(self, kernel):
def assign_optimal_throughput(self, kernel, start=0):
"""
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
@@ -40,7 +42,26 @@ class ArchSemantics(ISASemantics):
INC = 0.01
kernel.reverse()
port_list = self._machine_model.get_ports()
for instruction_form in kernel:
for idx, instruction_form in enumerate(kernel[start:], start):
multiple_assignments = False
# if iform has multiple possible port assignments, check all in a DFS manner and take the best
if isinstance(instruction_form["port_uops"], dict):
best_kernel = None
best_kernel_tp = sys.maxsize
for port_util_alt in list(instruction_form["port_uops"].values())[1:]:
k_tmp = deepcopy(kernel)
k_tmp[idx]["port_uops"] = deepcopy(port_util_alt)
k_tmp[idx]["port_pressure"] = self._machine_model.average_port_pressure(
k_tmp[idx]["port_uops"]
)
k_tmp.reverse()
self.assign_optimal_throughput(k_tmp, idx)
if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
best_kernel = k_tmp
best_kernel_tp = max(self.get_throughput_sum(best_kernel))
# check the first option in the main branch and compare against the best option later
multiple_assignments = True
kernel[idx]["port_uops"] = list(instruction_form["port_uops"].values())[0]
for uop in instruction_form["port_uops"]:
cycles = uop[0]
ports = list(uop[1])
@@ -84,6 +105,7 @@ class ArchSemantics(ISASemantics):
p
for p in indices
if round(instruction_form["port_pressure"][p], 2) == 0
or instruction_form["port_pressure"][p] < 0.00
][0]
instruction_form["port_pressure"][zero_index] = 0.0
# Remove from further balancing
@@ -108,6 +130,11 @@ class ArchSemantics(ISASemantics):
itemgetter(*indices)(self.get_throughput_sum(kernel))
)
kernel.reverse()
if multiple_assignments:
if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
for i, instr in enumerate(best_kernel):
kernel[i]["port_uops"] = best_kernel[i]["port_uops"]
kernel[i]["port_pressure"] = best_kernel[i]["port_pressure"]
def set_hidden_loads(self, kernel):
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
@@ -209,11 +236,12 @@ class ArchSemantics(ISASemantics):
operands.index(self._create_reg_wildcard())
]
)
dummy_reg = {"class": "register", "name": reg_type}
data_port_pressure = [0.0 for _ in range(port_number)]
data_port_uops = []
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
# LOAD performance data
data_port_uops = self._machine_model.get_load_throughput(
load_perf_data = self._machine_model.get_load_throughput(
[
x["memory"]
for x in instruction_form["semantic_operands"]["source"]
@@ -221,6 +249,19 @@ class ArchSemantics(ISASemantics):
if "memory" in x
][0]
)
# if multiple options, choose based on reg type
data_port_uops = [
ldp["port_pressure"]
for ldp in load_perf_data
if "dst" in ldp
and self._machine_model._check_operands(
dummy_reg, {"register": {"name": ldp["dst"]}}
)
]
if len(data_port_uops) < 1:
data_port_uops = load_perf_data[0]["port_pressure"]
else:
data_port_uops = data_port_uops[0]
data_port_pressure = self._machine_model.average_port_pressure(
data_port_uops
)
@@ -235,9 +276,22 @@ class ArchSemantics(ISASemantics):
instruction_form["semantic_operands"]["destination"]
+ instruction_form["semantic_operands"]["src_dst"]
)
st_data_port_uops = self._machine_model.get_store_throughput(
store_perf_data = self._machine_model.get_store_throughput(
[x["memory"] for x in destinations if "memory" in x][0]
)
# if multiple options, choose based on reg type
st_data_port_uops = [
stp["port_pressure"]
for stp in store_perf_data
if "src" in stp
and self._machine_model._check_operands(
dummy_reg, {"register": {"name": stp["src"]}}
)
]
if len(st_data_port_uops) < 1:
st_data_port_uops = store_perf_data[0]["port_pressure"]
else:
st_data_port_uops = st_data_port_uops[0]
# zero data port pressure and remove HAS_ST flag if
# - no mem operand in dst &&
# - all mem operands in src_dst are pre-/post-indexed

View File

@@ -143,11 +143,16 @@ class MachineModel(object):
print("\nname: {}\noperands: {}".format(name, operands))
raise TypeError from e
def average_port_pressure(self, port_pressure):
def average_port_pressure(self, port_pressure, option=0):
"""Construct average port pressure list from instruction data."""
port_list = self._data["ports"]
average_pressure = [0.0] * len(port_list)
for cycles, ports in port_pressure:
# if there are multiple port utilization options and none is selected, choose first one
if isinstance(port_pressure, dict):
used_pp = port_pressure[option]
else:
used_pp = port_pressure
for cycles, ports in used_pp:
for p in ports:
try:
average_pressure[port_list.index(p)] += cycles / len(ports)
@@ -221,8 +226,8 @@ class MachineModel(object):
"""Return load thorughput for given register type."""
ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
if len(ld_tp) > 0:
return ld_tp[0]["port_pressure"].copy()
return self._data["load_throughput_default"].copy()
return ld_tp.copy()
return [{"port_pressure": self._data["load_throughput_default"].copy()}]
def get_store_latency(self, reg_type):
"""Return store latency for given register type."""
@@ -233,8 +238,8 @@ class MachineModel(object):
"""Return store throughput for given register type."""
st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
if len(st_tp) > 0:
return st_tp[0]["port_pressure"].copy()
return self._data["store_throughput_default"].copy()
return st_tp.copy()
return [{"port_pressure": self._data["store_throughput_default"].copy()}]
def _match_mem_entries(self, mem, i_mem):
"""Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
@@ -273,6 +278,7 @@ class MachineModel(object):
"zen1": "x86",
"zen+": "x86",
"zen2": "x86",
"zen3": "x86",
"con": "x86", # Intel Conroe
"wol": "x86", # Intel Wolfdale
"snb": "x86",
@@ -690,6 +696,8 @@ class MachineModel(object):
return False
return True
else:
if reg["name"].rstrip(string.digits).lower() == i_reg_name:
return True
if i_reg_name == "gpr":
return True
return False

View File

@@ -61,6 +61,24 @@ port_model_scheme: |
+-------+ | VNNI |
+-------+
instruction_forms:
- name: fantasyinstr1
operands:
- class: register
name: gpr
- class: register
name: gpr
port_pressure: {0: [[1, '015']], 1: [[1, '56']]}
throughput: 0.333333
latency: 1.0
- name: fantasyinstr2
operands:
- class: register
name: gpr
- class: register
name: gpr
port_pressure: [[1, '0'], [1, '1'], [1, '5']]
throughput: 0.5
latency: 1.0
- name: LEA
operands:
- class: memory

View File

@@ -175,7 +175,7 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual(
test_mm_x86.get_store_throughput(
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
),
)[0]["port_pressure"],
[[2, "237"], [2, "4"]],
)
self.assertEqual(
@@ -186,13 +186,13 @@ class TestSemanticTools(unittest.TestCase):
"index": "NOT_NONE",
"scale": 1,
}
),
)[0]["port_pressure"],
[[1, "23"], [1, "4"]],
)
self.assertEqual(
test_mm_arm.get_store_throughput(
{"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
),
)[0]["port_pressure"],
[[2, "34"], [2, "5"]],
)
self.assertEqual(
@@ -203,7 +203,7 @@ class TestSemanticTools(unittest.TestCase):
"index": None,
"scale": 1,
}
),
)[0]["port_pressure"],
[[1, "34"], [1, "5"]],
)
@@ -228,7 +228,7 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual(
test_mm_x86.get_load_throughput(
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
),
)[0]["port_pressure"],
[[1, "23"], [1, ["2D", "3D"]]],
)
@@ -288,6 +288,21 @@ class TestSemanticTools(unittest.TestCase):
tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
self.assertNotEqual(tp_fixed, tp_optimal)
self.assertTrue(max(tp_optimal) <= max(tp_fixed))
# test multiple port assignment options
test_mm_x86 = MachineModel(path_to_yaml=self._find_file("test_db_x86.yml"))
tmp_semantics = ArchSemantics(test_mm_x86)
tmp_code_1 = "fantasyinstr1 %rax, %rax\n"
tmp_code_2 = "fantasyinstr1 %rax, %rax\nfantasyinstr2 %rbx, %rbx\n"
tmp_kernel_1 = self.parser_x86.parse_file(tmp_code_1)
tmp_kernel_2 = self.parser_x86.parse_file(tmp_code_2)
tmp_semantics.add_semantics(tmp_kernel_1)
tmp_semantics.add_semantics(tmp_kernel_2)
tmp_semantics.assign_optimal_throughput(tmp_kernel_1)
tmp_semantics.assign_optimal_throughput(tmp_kernel_2)
k1i1_pp = [round(x, 2) for x in tmp_kernel_1[0]["port_pressure"]]
k2i1_pp = [round(x, 2) for x in tmp_kernel_2[0]["port_pressure"]]
self.assertEqual(k1i1_pp, [0.33, 0.0, 0.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 0.0, 0.0])
self.assertEqual(k2i1_pp, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
# arm
kernel_fixed = deepcopy(self.kernel_AArch64)