mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-16 09:00:05 +01:00
Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7c907e2432 | ||
|
|
1ea1e68b4e | ||
|
|
907f64d452 | ||
|
|
24de7a762b | ||
|
|
87411ab822 | ||
|
|
2fa25e3099 | ||
|
|
0b440e4da9 | ||
|
|
08e6a4be36 | ||
|
|
7724ce27c7 | ||
|
|
4f8e37d9fd | ||
|
|
d5f1654aa8 | ||
|
|
81f40604cb | ||
|
|
df747b8c48 | ||
|
|
4e25a29a8a | ||
|
|
016061f72c | ||
|
|
ddff8c5012 | ||
|
|
2306cb58d0 | ||
|
|
660a9d0f41 | ||
|
|
3b453de617 | ||
|
|
b93d911bb7 | ||
|
|
21cfb8d011 | ||
|
|
32d60e7966 | ||
|
|
ba60703fb2 | ||
|
|
76542782c8 | ||
|
|
671f7f5591 | ||
|
|
f96f5d7ad1 | ||
|
|
d81c53ef91 |
4
.github/workflows/test-n-publish.yml
vendored
4
.github/workflows/test-n-publish.yml
vendored
@@ -7,7 +7,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [3.6, 3.7, 3.8, 3.9]
|
||||
python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
python setup.py build sdist bdist_wheel
|
||||
- name: Publish to PyPI
|
||||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
|
||||
uses: pypa/gh-action-pypi-publish@master
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
skip_existing: true
|
||||
user: __token__
|
||||
|
||||
@@ -94,8 +94,8 @@ The usage of OSACA can be listed as:
|
||||
shows the program’s version number.
|
||||
--arch ARCH
|
||||
needs to be replaced with the target architecture abbreviation.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX`` and ``ICL`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN1``, ``ZEN2`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72 and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
|
||||
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ICL`` (Client), and ``ICX`` (Server) for the latest Intel microarchitectures starting from Intel Sandy Bridge, and ``ZEN1``, ``ZEN2``, and ``ZEN3`` for AMD Zen architectures.
|
||||
Furthermore, ``TX2`` for Marvell's ARM-based ThunderX2, ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72, ``TSV110`` for the HiSilicon TaiShan v110, and ``A64FX`` for Fujitsu's HPC ARM architecture are available.
|
||||
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
|
||||
--fixed
|
||||
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Open Source Architecture Code Analyzer"""
|
||||
name = "osaca"
|
||||
__version__ = "0.4.8"
|
||||
__version__ = "0.4.11"
|
||||
|
||||
# To trigger travis deployment to pypi, do the following:
|
||||
# 1. Increment __version__
|
||||
|
||||
178
osaca/data/create_db_entry.py
Executable file
178
osaca/data/create_db_entry.py
Executable file
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
from collections import defaultdict
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
class EntryBuilder:
    """Turn instruction-form descriptions into YAML database entry text.

    An entry is a YAML list item holding the instruction name, its operand
    descriptions, latency, port pressure, throughput and µop count.
    """

    @staticmethod
    def compute_throughput(port_pressure):
        """Return the highest per-port occupancy implied by ``port_pressure``.

        :param port_pressure: iterable of ``[uops, ports]`` pairs. ``ports`` is
            iterated element-wise, so a string such as ``"45"`` is treated as
            the two ports ``"4"`` and ``"5"`` while a list such as
            ``["10", "11"]`` names multi-character ports.
        :returns: the maximum occupancy over all ports as ``float``;
            ``0.0`` when no port is used.
        """
        port_occupancy = defaultdict(Fraction)
        for uops, ports in port_pressure:
            for p in ports:
                # µops of one pair are spread evenly over all of its ports.
                port_occupancy[p] += Fraction(uops, len(ports))
        return float(max(port_occupancy.values(), default=0))

    @staticmethod
    def classify(operands_types):
        """Classify an operand list as load, store and/or vector form.

        A ``"mem"`` operand in the last position marks a store, a ``"mem"``
        operand anywhere before it marks a load.  The form counts as vector
        when any operand is ``mm``, ``xmm``, ``ymm`` or ``zmm``.

        :param operands_types: list of operand type strings.
        :returns: tuple ``(load, store, vec)`` of booleans.
        :raises AssertionError: if the form is both a load and a store.
        """
        load = "mem" in operands_types[:-1]
        store = "mem" in operands_types[-1:]
        vec = any(vecr in operands_types for vecr in ("mm", "xmm", "ymm", "zmm"))
        if load and store:
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped when Python runs with -O.
            raise AssertionError("Can not process a combined load-store instruction.")
        return load, store, vec

    def build_description(
        self, instruction_name, operand_types, port_pressure=None, latency=0, comment=None
    ):
        """Render one database entry as YAML text.

        :param instruction_name: mnemonic written to the ``name`` field.
        :param operand_types: list of operand type strings; ``"imd"`` becomes
            an immediate, anything starting with ``"mem"`` a memory operand
            (``"mem_simple"``: no index, ``"mem_complex"``: gpr index,
            otherwise wildcard index) and everything else a register.  A
            ``"{k}"`` marker inside a register type adds ``mask: True``.
        :param port_pressure: list of ``[uops, ports]`` pairs; ``None`` means
            no port pressure.
        :param latency: value written to the ``latency`` field.
        :param comment: optional text appended as a comment to the name line.
        :returns: the entry text, terminated by a newline.
        """
        if port_pressure is None:
            port_pressure = []
        if comment:
            comment = " # " + comment
        else:
            comment = ""
        # Keys belonging to the list item are indented by two spaces and the
        # fields of each operand by four, so the result is well-formed YAML.
        description = "- name: {}{}\n  operands: {}\n".format(
            instruction_name, comment, "[]" if len(operand_types) == 0 else ""
        )

        for ot in operand_types:
            if ot == "imd":
                description += "  - class: immediate\n    imd: int\n"
            elif ot.startswith("mem"):
                description += "  - class: memory\n" '    base: "*"\n' '    offset: "*"\n'
                if ot == "mem_simple":
                    description += "    index: ~\n"
                elif ot == "mem_complex":
                    description += "    index: gpr\n"
                else:
                    description += '    index: "*"\n'
                description += '    scale: "*"\n'
            else:
                if "{k}" in ot:
                    description += "  - class: register\n    name: {}\n    mask: True\n".format(
                        ot.replace("{k}", "")
                    )
                else:
                    description += "  - class: register\n    name: {}\n".format(ot)

        description += (
            "  latency: {latency}\n"
            "  port_pressure: {port_pressure!r}\n"
            "  throughput: {throughput}\n"
            "  uops: {uops}\n"
        ).format(
            latency=latency,
            port_pressure=port_pressure,
            throughput=self.compute_throughput(port_pressure),
            uops=sum(uops for uops, _ports in port_pressure),
        )
        return description

    def parse_port_pressure(self, port_pressure_str):
        """Parse a port pressure string into a list of ``[cycles, ports]``.

        Example:
        1*p45+2*p0+2*p10,11 -> [[1, '45'], [2, '0'], [2, ['10', '11']]]

        A comma-separated port list is kept as a list of port names (empty
        names are dropped); a port group without comma stays a plain string.
        An empty or ``None`` input yields an empty list.
        """
        port_pressure = []
        if port_pressure_str:
            for term in port_pressure_str.split("+"):
                cycles, ports = term.split("*p")
                ports = ports.split(",")
                if len(ports) == 1:
                    ports = ports[0]
                else:
                    ports = [port for port in ports if port]

                port_pressure.append([int(cycles), ports])
        return port_pressure

    def process_item(self, instruction_form, resources):
        """Build the entry for one ``(instruction_form, resources)`` item.

        Example:
        ('mov xmm mem', ('1*p45+2*p0', 7)) -> ('mov', ['xmm', 'mem'], [[1, '45'], [2, '0']], 7)

        The tuple on the right-hand side is what gets handed to
        :meth:`build_description`, whose text result is returned.

        :param instruction_form: space-separated instruction name and operand
            types.  When it starts with a bracketed token (``[...]``), that
            token is kept intact as the instruction name even if it contains
            spaces.
        :param resources: pair ``(port_pressure_string, latency)``; the
            latency is converted with ``int``.
        """
        if instruction_form.startswith("[") and "]" in instruction_form:
            instr_elements = instruction_form.split("]")
            instr_elements = [instr_elements[0] + "]"] + instr_elements[1].strip().split(" ")
        else:
            instr_elements = instruction_form.split(" ")
        latency = int(resources[1])
        port_pressure = self.parse_port_pressure(resources[0])
        instruction_name = instr_elements[0]
        operand_types = instr_elements[1:]
        return self.build_description(instruction_name, operand_types, port_pressure, latency)
|
||||
|
||||
|
||||
class ArchEntryBuilder(EntryBuilder):
    """Entry builder that adds load/store resources to memory instruction forms.

    The resource tables inside :meth:`build_description` are the Zen3 values;
    the Intel ICX alternative is kept as a comment next to them.
    """

    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """Render an entry, extending port pressure and latency for memory forms.

        For a load form the load port pressure and load latency are added and
        the entry is commented "with load"; for a store form the store port
        pressure and store latency are added and the entry is commented
        "with store".  Register-only forms are rendered unchanged.

        :param instruction_name: mnemonic written to the ``name`` field.
        :param operand_types: list of operand type strings.
        :param port_pressure: list of ``[uops, ports]`` pairs of the
            instruction itself; it is not modified.  ``None`` means none.
        :param latency: latency of the instruction without memory access.
        :returns: the entry text produced by ``EntryBuilder.build_description``.
        :raises AssertionError: for combined load-store forms (from
            ``classify``).
        """
        # Intel ICX
        # LD_pressure = [[1, "23"], [1, ["2D", "3D"]]]
        # LD_pressure_vec = LD_pressure
        # ST_pressure = [[1, "79"], [1, "48"]]
        # ST_pressure_vec = ST_pressure
        # LD_lat = 5
        # ST_lat = 0
        # Zen3
        LD_pressure = [[1, ["11", "12", "13"]]]
        LD_pressure_vec = [[1, ["11", "12"]]]
        ST_pressure = [[1, ["12", "13"]]]
        ST_pressure_vec = [[1, ["4"]], [1, ["13"]]]
        LD_lat = 4
        ST_lat = 0

        # Work on a copy: neither the caller's list nor a shared default may
        # accumulate the memory resources across calls.
        port_pressure = [] if port_pressure is None else list(port_pressure)

        load, store, vec = self.classify(operand_types)

        comment = None
        if load:
            port_pressure = port_pressure + (LD_pressure_vec if vec else LD_pressure)
            latency += LD_lat
            comment = "with load"
        elif store:
            port_pressure = port_pressure + (ST_pressure_vec if vec else ST_pressure)
            latency += ST_lat
            comment = "with store"

        # Register-only forms reach this point with comment=None.
        return EntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency, comment
        )
|
||||
|
||||
|
||||
def get_description(instruction_form, port_pressure, latency, rhs_comment=None):
    """Return the database entry text for one instruction form.

    :param instruction_form: space-separated instruction name and operand
        types, as accepted by ``process_item``.
    :param port_pressure: port pressure string, e.g. ``"1*p45+2*p0"``.
    :param latency: instruction latency (anything ``int()`` accepts).
    :param rhs_comment: optional text; when given, every line of the entry is
        padded to the width of the longest line and suffixed with
        ``# <rhs_comment>``.
    :returns: the entry text, terminated by a newline.
    """
    entry = ArchEntryBuilder().process_item(instruction_form, (port_pressure, latency))

    if rhs_comment is not None:
        # splitlines() does not produce a trailing empty element for the final
        # newline, so no padded comment-only line is appended to the entry.
        lines = entry.splitlines()
        max_length = max((len(line) for line in lines), default=0)
        entry = "".join(
            "{:<{width}} # {}\n".format(line, rhs_comment, width=max_length) for line in lines
        )

    return entry
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: print the database entry for one instruction
    # form given as <INSTRUCTION> <PORT_PRESSURE> <LATENCY> [COMMENT].
    import sys

    if len(sys.argv) not in (4, 5):
        print(
            "Usage: {} <INSTRUCTION> <PORT_PRESSURE> <LATENCY> [COMMENT]".format(sys.argv[0]),
            file=sys.stderr,
        )
        # A wrong invocation is an error and must not report success.
        sys.exit(2)

    try:
        print(get_description(*sys.argv[1:]))
    except KeyError:
        print("Unknown architecture.")
        sys.exit(1)
    except (ValueError, AssertionError) as err:
        # ValueError: malformed port pressure term or non-integer latency;
        # AssertionError: combined load-store instruction form.
        print("Invalid input: {}".format(err), file=sys.stderr)
        sys.exit(1)
|
||||
@@ -16,8 +16,11 @@ class MOVEntryBuilder:
|
||||
def classify(operands_types):
|
||||
load = "mem" in operands_types[:-1]
|
||||
store = "mem" in operands_types[-1:]
|
||||
vec = False
|
||||
if any([vecr in operands_types for vecr in ["mm", "xmm", "ymm", "zmm"]]):
|
||||
vec = True
|
||||
assert not (load and store), "Can not process a combined load-store instruction."
|
||||
return load, store
|
||||
return load, store, vec
|
||||
|
||||
def build_description(
|
||||
self, instruction_name, operand_types, port_pressure=[], latency=0, comment=None
|
||||
@@ -65,6 +68,9 @@ class MOVEntryBuilder:
|
||||
if port_pressure_str:
|
||||
for p in port_pressure_str.split("+"):
|
||||
cycles, ports = p.split("*p")
|
||||
ports = ports.split(",")
|
||||
if len(ports) == 1:
|
||||
ports = ports[0]
|
||||
port_pressure.append([int(cycles), ports])
|
||||
return port_pressure
|
||||
|
||||
@@ -84,7 +90,7 @@ class MOVEntryBuilder:
|
||||
class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
|
||||
# for SNB and IVB
|
||||
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
|
||||
load, store = self.classify(operand_types)
|
||||
load, store, vec = self.classify(operand_types)
|
||||
|
||||
comment = None
|
||||
if load:
|
||||
@@ -109,11 +115,843 @@ class MOVEntryBuilderIntelNoPort7AGU(MOVEntryBuilder):
|
||||
)
|
||||
|
||||
|
||||
class MOVEntryBuilderIntelPort9(MOVEntryBuilder):
    """MOV entry builder adding the ICX load/store resources to memory forms."""

    # for ICX
    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """Render an entry, extending port pressure and latency for memory forms.

        Load forms get ``[[1, "23"], [1, ["2D", "3D"]]]`` added to the port
        pressure and 5 added to the latency; store forms get
        ``[[1, "79"], [1, "48"]]`` added and keep their latency.
        Register-only forms are rendered unchanged.

        :param instruction_name: mnemonic written to the entry.
        :param operand_types: list of operand type strings.
        :param port_pressure: list of ``[uops, ports]`` pairs of the
            instruction itself; it is not modified.  ``None`` means none.
        :param latency: latency of the instruction without memory access.
        :returns: the text produced by ``MOVEntryBuilder.build_description``.
        """
        # Copy first so that neither the caller's list nor a shared default
        # collects the load resources of earlier calls.
        port_pressure = [] if port_pressure is None else list(port_pressure)

        load, store, _vec = self.classify(operand_types)

        comment = None
        if load:
            port_pressure = port_pressure + [[1, "23"], [1, ["2D", "3D"]]]
            latency += 5
            comment = "with load"
        elif store:
            port_pressure = port_pressure + [[1, "79"], [1, "48"]]
            comment = "with store"

        # Register-only forms reach this point with comment=None.
        return MOVEntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency, comment
        )
|
||||
|
||||
|
||||
class MOVEntryBuilderAMDZen3(MOVEntryBuilder):
    """MOV entry builder adding the Zen 3 load/store resources to memory forms."""

    # for Zen 3
    def build_description(self, instruction_name, operand_types, port_pressure=None, latency=0):
        """Render an entry, extending port pressure and latency for memory forms.

        Added port pressure per form:

        * vector load:  ``[[1, ["11", "12"]]]``, latency + 4
        * other load:   ``[[1, ["11", "12", "13"]]]``, latency + 4
        * vector store: ``[[1, ["4"]], [1, ["13"]]]``, latency unchanged
        * other store:  ``[[1, ["12", "13"]]]``, latency unchanged

        Register-only forms are rendered unchanged.

        :param instruction_name: mnemonic written to the entry.
        :param operand_types: list of operand type strings.
        :param port_pressure: list of ``[uops, ports]`` pairs of the
            instruction itself; it is not modified.  ``None`` means none.
        :param latency: latency of the instruction without memory access.
        :returns: the text produced by ``MOVEntryBuilder.build_description``.
        """
        # Copy first so that neither the caller's list nor a shared default
        # collects the load resources of earlier calls.
        port_pressure = [] if port_pressure is None else list(port_pressure)

        load, store, vec = self.classify(operand_types)

        comment = None
        if load:
            load_ports = ["11", "12"] if vec else ["11", "12", "13"]
            port_pressure = port_pressure + [[1, load_ports]]
            latency += 4
            comment = "with load"
        elif store:
            if vec:
                port_pressure = port_pressure + [[1, ["4"]], [1, ["13"]]]
            else:
                port_pressure = port_pressure + [[1, ["12", "13"]]]
            comment = "with store"

        # Register-only forms reach this point with comment=None.
        return MOVEntryBuilder.build_description(
            self, instruction_name, operand_types, port_pressure, latency, comment
        )
|
||||
|
||||
|
||||
#############################################################################
|
||||
|
||||
z3 = MOVEntryBuilderAMDZen3()
|
||||
|
||||
zen3_mov_instructions = [
|
||||
# https://www.felixcloutier.com/x86/mov
|
||||
("mov gpr gpr", ("1*p6789", 1)),
|
||||
("mov gpr mem", ("", 0)),
|
||||
("mov mem gpr", ("", 0)),
|
||||
("mov imd gpr", ("1*p6789", 1)),
|
||||
("mov imd mem", ("", 0)),
|
||||
("movabs imd gpr", ("1*p6789", 1)), # AT&T version, port util to be verified
|
||||
# https://www.felixcloutier.com/x86/movapd
|
||||
("movapd xmm xmm", ("1*p0123", 1)),
|
||||
("movapd xmm mem", ("", 0)),
|
||||
("movapd mem xmm", ("", 0)),
|
||||
("vmovapd xmm xmm", ("1*p0123", 1)),
|
||||
("vmovapd xmm mem", ("", 0)),
|
||||
("vmovapd mem xmm", ("", 0)),
|
||||
("vmovapd ymm ymm", ("1*p0123", 1)),
|
||||
("vmovapd ymm mem", ("", 0)),
|
||||
("vmovapd mem ymm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movaps
|
||||
("movaps xmm xmm", ("1*p0123", 1)),
|
||||
("movaps xmm mem", ("", 0)),
|
||||
("movaps mem xmm", ("", 0)),
|
||||
("vmovaps xmm xmm", ("1*p0123", 1)),
|
||||
("vmovaps xmm mem", ("", 0)),
|
||||
("vmovaps mem xmm", ("", 0)),
|
||||
("vmovaps ymm ymm", ("1*p0123", 1)),
|
||||
("vmovaps ymm mem", ("", 0)),
|
||||
("vmovaps mem ymm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movd:movq
|
||||
("movd gpr mm", ("1*p0123", 1)),
|
||||
("movd mem mm", ("", 0)),
|
||||
("movq gpr mm", ("1*p0123", 1)),
|
||||
("movq mem mm", ("", 0)),
|
||||
("movd mm gpr", ("1*p0123", 1)),
|
||||
("movd mm mem", ("", 0)),
|
||||
("movq mm gpr", ("1*p0123", 1)),
|
||||
("movq mm mem", ("", 0)),
|
||||
("movd gpr xmm", ("1*p0123", 1)),
|
||||
("movd mem xmm", ("", 0)),
|
||||
("movq gpr xmm", ("1*p0123", 1)),
|
||||
("movq mem xmm", ("", 0)),
|
||||
("movd xmm gpr", ("1*p0123", 1)),
|
||||
("movd xmm mem", ("", 0)),
|
||||
("movq xmm gpr", ("1*p0123", 1)),
|
||||
("movq xmm mem", ("", 0)),
|
||||
("vmovd gpr xmm", ("1*p0123", 1)),
|
||||
("vmovd mem xmm", ("", 0)),
|
||||
("vmovq gpr xmm", ("1*p0123", 1)),
|
||||
("vmovq mem xmm", ("", 0)),
|
||||
("vmovd xmm gpr", ("1*p0123", 1)),
|
||||
("vmovd xmm mem", ("", 0)),
|
||||
("vmovq xmm gpr", ("1*p0123", 1)),
|
||||
("vmovq xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movddup
|
||||
("movddup xmm xmm", ("1*p12", 1)),
|
||||
("movddup mem xmm", ("", 0)),
|
||||
("vmovddup xmm xmm", ("1*p12", 1)),
|
||||
("vmovddup mem xmm", ("", 0)),
|
||||
("vmovddup ymm ymm", ("1*p12", 1)),
|
||||
("vmovddup mem ymm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movdq2q
|
||||
("movdq2q xmm mm", ("1*p0123", 1)),
|
||||
# https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
|
||||
("movdqa xmm xmm", ("1*p0123", 1)),
|
||||
("movdqa mem xmm", ("", 0)),
|
||||
("movdqa xmm mem", ("", 0)),
|
||||
("vmovdqa xmm xmm", ("1*p0123", 1)),
|
||||
("vmovdqa mem xmm", ("", 0)),
|
||||
("vmovdqa xmm mem", ("", 0)),
|
||||
("vmovdqa ymm ymm", ("1*p0123", 1)),
|
||||
("vmovdqa mem ymm", ("", 0)),
|
||||
("vmovdqa ymm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
|
||||
("movdqu xmm xmm", ("1*p0123", 1)),
|
||||
("movdqu mem xmm", ("", 0)),
|
||||
("movdqu xmm mem", ("", 0)),
|
||||
("vmovdqu xmm xmm", ("1*p0123", 1)),
|
||||
("vmovdqu mem xmm", ("", 0)),
|
||||
("vmovdqu xmm mem", ("", 0)),
|
||||
("vmovdqu ymm ymm", ("1*p0123", 1)),
|
||||
("vmovdqu mem ymm", ("", 0)),
|
||||
("vmovdqu ymm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movhlps
|
||||
("movhlps xmm xmm", ("1*p12", 1)),
|
||||
("vmovhlps xmm xmm xmm", ("1*p12", 1)),
|
||||
# https://www.felixcloutier.com/x86/movhpd
|
||||
("movhpd mem xmm", ("1*p12", 1)),
|
||||
("vmovhpd mem xmm xmm", ("1*p12", 1)),
|
||||
("movhpd xmm mem", ("", 0)),
|
||||
("vmovhpd mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movhps
|
||||
("movhps mem xmm", ("1*p12", 1)),
|
||||
("vmovhps mem xmm xmm", ("1*p12", 1)),
|
||||
("movhps xmm mem", ("", 0)),
|
||||
("vmovhps mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movlhps
|
||||
("movlhps xmm xmm", ("1*p12", 1)),
|
||||
("vmovlhps xmm xmm xmm", ("1*p12", 1)),
|
||||
# https://www.felixcloutier.com/x86/movlpd
|
||||
("movlpd mem xmm", ("1*p12", 1)),
|
||||
("vmovlpd mem xmm xmm", ("1*p12", 1)),
|
||||
("movlpd xmm mem", ("1*p12", 0)),
|
||||
("vmovlpd mem xmm", ("1*p12", 1)),
|
||||
# https://www.felixcloutier.com/x86/movlps
|
||||
("movlps mem xmm", ("1*p12", 1)),
|
||||
("vmovlps mem xmm xmm", ("1*p12", 1)),
|
||||
("movlps xmm mem", ("1*p12", 0)),
|
||||
("vmovlps mem xmm", ("1*p12", 1)),
|
||||
# https://www.felixcloutier.com/x86/movmskpd
|
||||
("movmskpd xmm gpr", ("1*p0123", 1)),
|
||||
("vmovmskpd xmm gpr", ("1*p0123", 1)),
|
||||
("vmovmskpd ymm gpr", ("1*p0123", 1)),
|
||||
# https://www.felixcloutier.com/x86/movmskps
|
||||
("movmskps xmm gpr", ("1*p0123", 1)),
|
||||
("vmovmskps xmm gpr", ("1*p0123", 1)),
|
||||
("vmovmskps ymm gpr", ("1*p0123", 1)),
|
||||
# https://www.felixcloutier.com/x86/movntdq
|
||||
("movntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdq ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntdqa
|
||||
("movntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdqa mem ymm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movnti
|
||||
("movnti gpr mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntpd
|
||||
("movntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntpd ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntps
|
||||
("movntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntps ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntq
|
||||
("movntq mm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movq
|
||||
("movq mm mm", ("", 0)),
|
||||
("movq mem mm", ("", 0)),
|
||||
("movq mm mem", ("", 0)),
|
||||
("movq xmm xmm", ("1*p0123", 1)),
|
||||
("movq mem xmm", ("", 0)),
|
||||
("movq xmm mem", ("", 0)),
|
||||
("vmovq xmm xmm", ("1*p0123", 1)),
|
||||
("vmovq mem xmm", ("", 0)),
|
||||
("vmovq xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
|
||||
# TODO combined load-store is currently not supported
|
||||
# ('movs mem mem', ()),
|
||||
# https://www.felixcloutier.com/x86/movsd
|
||||
("movsd xmm xmm", ("1*p0123", 1)),
|
||||
("movsd mem xmm", ("", 0)),
|
||||
("movsd xmm mem", ("", 0)),
|
||||
("vmovsd xmm xmm xmm", ("1*p0123", 1)),
|
||||
("vmovsd mem xmm", ("", 0)),
|
||||
("vmovsd xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movshdup
|
||||
("movshdup xmm xmm", ("1*p12", 1)),
|
||||
("movshdup mem xmm", ("", 0)),
|
||||
("vmovshdup xmm xmm", ("1*p12", 1)),
|
||||
("vmovshdup mem xmm", ("", 0)),
|
||||
("vmovshdup ymm ymm", ("1*p12", 1)),
|
||||
("vmovshdup mem ymm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movsldup
|
||||
("movsldup xmm xmm", ("1*p12", 1)),
|
||||
("movsldup mem xmm", ("", 0)),
|
||||
("vmovsldup xmm xmm", ("1*p12", 1)),
|
||||
("vmovsldup mem xmm", ("", 0)),
|
||||
("vmovsldup ymm ymm", ("1*p12", 1)),
|
||||
("vmovsldup mem ymm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movss
|
||||
("movss xmm xmm", ("1*p0123", 1)),
|
||||
("movss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm xmm", ("1*p0123", 1)),
|
||||
("vmovss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm", ("1*p0123", 1)),
|
||||
("vmovss xmm mem", ("", 0)),
|
||||
("movss mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movsx:movsxd
|
||||
("movsx gpr gpr", ("1*p6789", 1)),
|
||||
("movsx mem gpr", ("", 0)),
|
||||
("movsxd gpr gpr", ("", 0)),
|
||||
("movsxd mem gpr", ("", 0)),
|
||||
("movsb gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movsb mem gpr", ("", 0)), # AT&T version
|
||||
("movsw gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movsw mem gpr", ("", 0)), # AT&T version
|
||||
("movsl gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movsl mem gpr", ("", 0)), # AT&T version
|
||||
("movsq gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movsq mem gpr", ("", 0)), # AT&T version
|
||||
# https://www.felixcloutier.com/x86/movupd
|
||||
("movupd xmm xmm", ("1*p0123", 1)),
|
||||
("movupd mem xmm", ("", 0)),
|
||||
("movupd xmm mem", ("", 0)),
|
||||
("vmovupd xmm xmm", ("1*p0123", 1)),
|
||||
("vmovupd mem xmm", ("", 0)),
|
||||
("vmovupd xmm mem", ("", 0)),
|
||||
("vmovupd ymm ymm", ("1*p0123", 1)),
|
||||
("vmovupd mem ymm", ("", 0)),
|
||||
("vmovupd ymm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movups
|
||||
("movups xmm xmm", ("1*p0123", 1)),
|
||||
("movups mem xmm", ("", 0)),
|
||||
("movups xmm mem", ("", 0)),
|
||||
("vmovups xmm xmm", ("1*p0123", 1)),
|
||||
("vmovups mem xmm", ("", 0)),
|
||||
("vmovups xmm mem", ("", 0)),
|
||||
("vmovups ymm ymm", ("1*p0123", 1)),
|
||||
("vmovups mem ymm", ("", 0)),
|
||||
("vmovups ymm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movzx
|
||||
("movzx gpr gpr", ("1*p6789", 1)),
|
||||
("movzx mem gpr", ("", 0)),
|
||||
("movzb gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movzb mem gpr", ("", 0)), # AT&T version
|
||||
("movzw gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movzw mem gpr", ("", 0)), # AT&T version
|
||||
("movzl gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movzl mem gpr", ("", 0)), # AT&T version
|
||||
("movzq gpr gpr", ("1*p6789", 1)), # AT&T version
|
||||
("movzq mem gpr", ("", 0)), # AT&T version
|
||||
# https://www.felixcloutier.com/x86/cmovcc
|
||||
("cmova gpr gpr", ("1*p69", 1)),
|
||||
("cmova mem gpr", ("", 0)),
|
||||
("cmovae gpr gpr", ("1*p69", 1)),
|
||||
("cmovae mem gpr", ("", 0)),
|
||||
("cmovb gpr gpr", ("1*p69", 1)),
|
||||
("cmovb mem gpr", ("", 0)),
|
||||
("cmovbe gpr gpr", ("1*p69", 1)),
|
||||
("cmovbe mem gpr", ("", 0)),
|
||||
("cmovc gpr gpr", ("1*p69", 1)),
|
||||
("cmovc mem gpr", ("", 0)),
|
||||
("cmove gpr gpr", ("1*p69", 1)),
|
||||
("cmove mem gpr", ("", 0)),
|
||||
("cmovg gpr gpr", ("1*p69", 1)),
|
||||
("cmovg mem gpr", ("", 0)),
|
||||
("cmovge gpr gpr", ("1*p69", 1)),
|
||||
("cmovge mem gpr", ("", 0)),
|
||||
("cmovl gpr gpr", ("1*p69", 1)),
|
||||
("cmovl mem gpr", ("", 0)),
|
||||
("cmovle gpr gpr", ("1*p69", 1)),
|
||||
("cmovle mem gpr", ("", 0)),
|
||||
("cmovna gpr gpr", ("1*p69", 1)),
|
||||
("cmovna mem gpr", ("", 0)),
|
||||
("cmovnae gpr gpr", ("1*p69", 1)),
|
||||
("cmovnae mem gpr", ("", 0)),
|
||||
("cmovnb gpr gpr", ("1*p69", 1)),
|
||||
("cmovnb mem gpr", ("", 0)),
|
||||
("cmovnbe gpr gpr", ("1*p69", 1)),
|
||||
("cmovnbe mem gpr", ("", 0)),
|
||||
("cmovnc gpr gpr", ("1*p69", 1)),
|
||||
("cmovnc mem gpr", ("", 0)),
|
||||
("cmovne gpr gpr", ("1*p69", 1)),
|
||||
("cmovne mem gpr", ("", 0)),
|
||||
("cmovng gpr gpr", ("1*p69", 1)),
|
||||
("cmovng mem gpr", ("", 0)),
|
||||
("cmovnge gpr gpr", ("1*p69", 1)),
|
||||
("cmovnge mem gpr", ("", 0)),
|
||||
("cmovnl gpr gpr", ("1*p69", 1)),
|
||||
("cmovnl mem gpr", ("", 0)),
|
||||
("cmovno gpr gpr", ("1*p69", 1)),
|
||||
("cmovno mem gpr", ("", 0)),
|
||||
("cmovnp gpr gpr", ("1*p69", 1)),
|
||||
("cmovnp mem gpr", ("", 0)),
|
||||
("cmovns gpr gpr", ("1*p69", 1)),
|
||||
("cmovns mem gpr", ("", 0)),
|
||||
("cmovnz gpr gpr", ("1*p69", 1)),
|
||||
("cmovnz mem gpr", ("", 0)),
|
||||
("cmovo gpr gpr", ("1*p69", 1)),
|
||||
("cmovo mem gpr", ("", 0)),
|
||||
("cmovp gpr gpr", ("1*p69", 1)),
|
||||
("cmovp mem gpr", ("", 0)),
|
||||
("cmovpe gpr gpr", ("1*p69", 1)),
|
||||
("cmovpe mem gpr", ("", 0)),
|
||||
("cmovpo gpr gpr", ("1*p69", 1)),
|
||||
("cmovpo mem gpr", ("", 0)),
|
||||
("cmovs gpr gpr", ("1*p69", 1)),
|
||||
("cmovs mem gpr", ("", 0)),
|
||||
("cmovz gpr gpr", ("1*p69", 1)),
|
||||
("cmovz mem gpr", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/pmovmskb
|
||||
("pmovmskb mm gpr", ("1*p0123", 1)),
|
||||
("pmovmskb xmm gpr", ("1*p0123", 1)),
|
||||
("vpmovmskb xmm gpr", ("1*p0123", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovsx
|
||||
("pmovsxbw xmm xmm", ("1*p12", 1)),
|
||||
("pmovsxbw mem xmm", ("1*p12", 1)),
|
||||
("pmovsxbd xmm xmm", ("1*p12", 1)),
|
||||
("pmovsxbd mem xmm", ("1*p12", 1)),
|
||||
("pmovsxbq xmm xmm", ("1*p12", 1)),
|
||||
("pmovsxbq mem xmm", ("1*p12", 1)),
|
||||
("vpmovsxbw xmm xmm", ("1*p12", 1)),
|
||||
("vpmovsxbw mem xmm", ("1*p12", 1)),
|
||||
("vpmovsxbd xmm xmm", ("1*p12", 1)),
|
||||
("vpmovsxbd mem xmm", ("1*p12", 1)),
|
||||
("vpmovsxbq xmm xmm", ("1*p12", 1)),
|
||||
("vpmovsxbq mem xmm", ("1*p12", 1)),
|
||||
("vpmovsxbw xmm ymm", ("1*p0123", 1)),
|
||||
("vpmovsxbw mem ymm", ("1*p12", 1)),
|
||||
("vpmovsxbd xmm ymm", ("1*p0123", 1)),
|
||||
("vpmovsxbd mem ymm", ("1*p12", 1)),
|
||||
("vpmovsxbq xmm ymm", ("1*p0123", 1)),
|
||||
("vpmovsxbq mem ymm", ("1*p12", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovzx
|
||||
("pmovzxbw xmm xmm", ("1*p12", 1)),
|
||||
("pmovzxbw mem xmm", ("1*p12", 1)),
|
||||
("vpmovzxbw xmm xmm", ("1*p12", 1)),
|
||||
("vpmovzxbw mem xmm", ("1*p12", 1)),
|
||||
("vpmovzxbw xmm ymm", ("1*p0123", 1)),
|
||||
("vpmovzxbw mem ymm", ("1*p12", 1)),
|
||||
#################################################################
|
||||
# https://www.felixcloutier.com/x86/movbe
|
||||
("movbe gpr mem", ("1*p67", 5)),
|
||||
("movbe mem gpr", ("1*p67", 5)),
|
||||
################################################
|
||||
# https://www.felixcloutier.com/x86/movq2dq
|
||||
("movq2dq mm xmm", ("2*p0123", 1)),
|
||||
]
|
||||
|
||||
|
||||
p9 = MOVEntryBuilderIntelPort9()
|
||||
|
||||
icx_mov_instructions = [
|
||||
# https://www.felixcloutier.com/x86/mov
|
||||
("mov gpr gpr", ("1*p0156", 1)),
|
||||
("mov gpr mem", ("", 0)),
|
||||
("mov mem gpr", ("", 0)),
|
||||
("mov imd gpr", ("1*p0156", 1)),
|
||||
("mov imd mem", ("", 0)),
|
||||
("movabs imd gpr", ("1*p0156", 1)), # AT&T version
|
||||
# https://www.felixcloutier.com/x86/movapd
|
||||
("movapd xmm xmm", ("1*p015", 1)),
|
||||
("movapd xmm mem", ("", 0)),
|
||||
("movapd mem xmm", ("", 0)),
|
||||
("vmovapd xmm xmm", ("1*p015", 1)),
|
||||
("vmovapd xmm mem", ("", 0)),
|
||||
("vmovapd mem xmm", ("", 0)),
|
||||
("vmovapd ymm ymm", ("1*p015", 1)),
|
||||
("vmovapd ymm mem", ("", 0)),
|
||||
("vmovapd mem ymm", ("", 0)),
|
||||
("vmovapd zmm zmm", ("1*p05", 1)),
|
||||
("vmovapd zmm mem", ("", 0)),
|
||||
("vmovapd mem zmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movaps
|
||||
("movaps xmm xmm", ("1*p015", 1)),
|
||||
("movaps xmm mem", ("", 0)),
|
||||
("movaps mem xmm", ("", 0)),
|
||||
("vmovaps xmm xmm", ("1*p015", 1)),
|
||||
("vmovaps xmm mem", ("", 0)),
|
||||
("vmovaps mem xmm", ("", 0)),
|
||||
("vmovaps ymm ymm", ("1*p015", 1)),
|
||||
("vmovaps ymm mem", ("", 0)),
|
||||
("vmovaps mem ymm", ("", 0)),
|
||||
("vmovaps zmm zmm", ("1*p05", 1)),
|
||||
("vmovaps zmm mem", ("", 0)),
|
||||
("vmovaps mem zmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movd:movq
|
||||
("movd gpr mm", ("1*p5", 1)),
|
||||
("movd mem mm", ("", 0)),
|
||||
("movq gpr mm", ("1*p5", 1)),
|
||||
("movq mem mm", ("", 0)),
|
||||
("movd mm gpr", ("1*p0", 1)),
|
||||
("movd mm mem", ("", 0)),
|
||||
("movq mm gpr", ("1*p0", 1)),
|
||||
("movq mm mem", ("", 0)),
|
||||
("movd gpr xmm", ("1*p5", 1)),
|
||||
("movd mem xmm", ("", 0)),
|
||||
("movq gpr xmm", ("1*p5", 1)),
|
||||
("movq mem xmm", ("", 0)),
|
||||
("movd xmm gpr", ("1*p0", 1)),
|
||||
("movd xmm mem", ("", 0)),
|
||||
("movq xmm gpr", ("1*p0", 1)),
|
||||
("movq xmm mem", ("", 0)),
|
||||
("vmovd gpr xmm", ("1*p5", 1)),
|
||||
("vmovd mem xmm", ("", 0)),
|
||||
("vmovq gpr xmm", ("1*p5", 1)),
|
||||
("vmovq mem xmm", ("", 0)),
|
||||
("vmovd xmm gpr", ("1*p0", 1)),
|
||||
("vmovd xmm mem", ("", 0)),
|
||||
("vmovq xmm gpr", ("1*p0", 1)),
|
||||
("vmovq xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movddup
|
||||
("movddup xmm xmm", ("1*p5", 1)),
|
||||
("movddup mem xmm", ("", 0)),
|
||||
("vmovddup xmm xmm", ("1*p5", 1)),
|
||||
("vmovddup mem xmm", ("", 0)),
|
||||
("vmovddup ymm ymm", ("1*p5", 1)),
|
||||
("vmovddup mem ymm", ("", 0)),
|
||||
("vmovddup zmm zmm", ("1*p5", 1)),
|
||||
("vmovddup mem zmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movdq2q
|
||||
("movdq2q xmm mm", ("1*p015+1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
|
||||
("movdqa xmm xmm", ("1*p015", 1)),
|
||||
("movdqa mem xmm", ("", 0)),
|
||||
("movdqa xmm mem", ("", 0)),
|
||||
("vmovdqa xmm xmm", ("1*p015", 1)),
|
||||
("vmovdqa mem xmm", ("", 0)),
|
||||
("vmovdqa xmm mem", ("", 0)),
|
||||
("vmovdqa ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqa mem ymm", ("", 0)),
|
||||
("vmovdqa ymm mem", ("", 0)),
|
||||
("vmovdqa32 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqa32 mem xmm", ("", 0)),
|
||||
("vmovdqa32 xmm mem", ("", 0)),
|
||||
("vmovdqa32 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqa32 mem ymm", ("", 0)),
|
||||
("vmovdqa32 ymm mem", ("", 0)),
|
||||
("vmovdqa32 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqa32 mem zmm", ("", 0)),
|
||||
("vmovdqa32 zmm mem", ("", 0)),
|
||||
("vmovdqa64 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqa64 mem xmm", ("", 0)),
|
||||
("vmovdqa64 xmm mem", ("", 0)),
|
||||
("vmovdqa64 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqa64 mem ymm", ("", 0)),
|
||||
("vmovdqa64 ymm mem", ("", 0)),
|
||||
("vmovdqa64 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqa64 mem zmm", ("", 0)),
|
||||
("vmovdqa64 zmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
|
||||
("movdqu xmm xmm", ("1*p015", 1)),
|
||||
("movdqu mem xmm", ("", 0)),
|
||||
("movdqu xmm mem", ("", 0)),
|
||||
("vmovdqu xmm xmm", ("1*p015", 1)),
|
||||
("vmovdqu mem xmm", ("", 0)),
|
||||
("vmovdqu xmm mem", ("", 0)),
|
||||
("vmovdqu ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqu mem ymm", ("", 0)),
|
||||
("vmovdqu ymm mem", ("", 0)),
|
||||
("vmovdqu8 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqu8 mem xmm", ("", 0)),
|
||||
("vmovdqu8 xmm mem", ("", 0)),
|
||||
("vmovdqu8 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqu8 mem ymm", ("", 0)),
|
||||
("vmovdqu8 ymm mem", ("", 0)),
|
||||
("vmovdqu8 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqu8 mem zmm", ("", 0)),
|
||||
("vmovdqu8 zmm mem", ("", 0)),
|
||||
("vmovdqu16 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqu16 mem xmm", ("", 0)),
|
||||
("vmovdqu16 xmm mem", ("", 0)),
|
||||
("vmovdqu16 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqu16 mem ymm", ("", 0)),
|
||||
("vmovdqu16 ymm mem", ("", 0)),
|
||||
("vmovdqu16 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqu16 mem zmm", ("", 0)),
|
||||
("vmovdqu16 zmm mem", ("", 0)),
|
||||
("vmovdqu32 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqu32 mem xmm", ("", 0)),
|
||||
("vmovdqu32 xmm mem", ("", 0)),
|
||||
("vmovdqu32 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqu32 mem ymm", ("", 0)),
|
||||
("vmovdqu32 ymm mem", ("", 0)),
|
||||
("vmovdqu32 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqu32 mem zmm", ("", 0)),
|
||||
("vmovdqu32 zmm mem", ("", 0)),
|
||||
("vmovdqu64 xmm xmm", ("1*p0156", 1)),
|
||||
("vmovdqu64 mem xmm", ("", 0)),
|
||||
("vmovdqu64 xmm mem", ("", 0)),
|
||||
("vmovdqu64 ymm ymm", ("1*p015", 1)),
|
||||
("vmovdqu64 mem ymm", ("", 0)),
|
||||
("vmovdqu64 ymm mem", ("", 0)),
|
||||
("vmovdqu64 zmm zmm", ("1*p05", 1)),
|
||||
("vmovdqu64 mem zmm", ("", 0)),
|
||||
("vmovdqu64 zmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movhlps
|
||||
("movhlps xmm xmm", ("1*p5", 1)),
|
||||
("vmovhlps xmm xmm xmm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/movhpd
|
||||
("movhpd mem xmm", ("1*p5", 1)),
|
||||
("vmovhpd mem xmm xmm", ("1*p5", 1)),
|
||||
("movhpd xmm mem", ("", 0)),
|
||||
("vmovhpd mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movhps
|
||||
("movhps mem xmm", ("1*p5", 1)),
|
||||
("vmovhps mem xmm xmm", ("1*p5", 1)),
|
||||
("movhps xmm mem", ("", 0)),
|
||||
("vmovhps mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movlhps
|
||||
("movlhps xmm xmm", ("1*p5", 1)),
|
||||
("vmovlhps xmm xmm xmm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/movlpd
|
||||
("movlpd mem xmm", ("1*p5", 1)),
|
||||
("vmovlpd mem xmm xmm", ("1*p5", 1)),
|
||||
("movlpd xmm mem", ("", 0)),
|
||||
("vmovlpd mem xmm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/movlps
|
||||
("movlps mem xmm", ("1*p5", 1)),
|
||||
("vmovlps mem xmm xmm", ("1*p5", 1)),
|
||||
("movlps xmm mem", ("", 0)),
|
||||
("vmovlps mem xmm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/movmskpd
|
||||
("movmskpd xmm gpr", ("1*p0", 1)),
|
||||
("vmovmskpd xmm gpr", ("1*p0", 1)),
|
||||
("vmovmskpd ymm gpr", ("1*p0", 1)),
|
||||
# https://www.felixcloutier.com/x86/movmskps
|
||||
("movmskps xmm gpr", ("1*p0", 1)),
|
||||
("vmovmskps xmm gpr", ("1*p0", 1)),
|
||||
("vmovmskps ymm gpr", ("1*p0", 1)),
|
||||
# https://www.felixcloutier.com/x86/movntdq
|
||||
("movntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdq xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdq ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdq zmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntdqa
|
||||
("movntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdqa mem xmm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdqa mem ymm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntdqa mem zmm", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movnti
|
||||
("movnti gpr mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntpd
|
||||
("movntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntpd xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntpd ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntpd zmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntps
|
||||
("movntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntps xmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntps ymm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
("vmovntps zmm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movntq
|
||||
("movntq mm mem", ("", 0)), # TODO NT-store: what latency to use?
|
||||
# https://www.felixcloutier.com/x86/movq
|
||||
("movq mm mm", ("", 0)),
|
||||
("movq mem mm", ("", 0)),
|
||||
("movq mm mem", ("", 0)),
|
||||
("movq xmm xmm", ("1*p015", 1)),
|
||||
("movq mem xmm", ("", 0)),
|
||||
("movq xmm mem", ("", 0)),
|
||||
("vmovq xmm xmm", ("1*p015", 1)),
|
||||
("vmovq mem xmm", ("", 0)),
|
||||
("vmovq xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
|
||||
# TODO combined load-store is currently not supported
|
||||
# ('movs mem mem', ()),
|
||||
# https://www.felixcloutier.com/x86/movsd
|
||||
("movsd xmm xmm", ("1*p015", 1)),
|
||||
("movsd mem xmm", ("", 0)),
|
||||
("movsd xmm mem", ("", 0)),
|
||||
("vmovsd xmm xmm xmm", ("1*p015", 1)),
|
||||
("vmovsd mem xmm", ("", 0)),
|
||||
("vmovsd xmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movshdup
|
||||
("movshdup xmm xmm", ("1*p15", 1)),
|
||||
("movshdup mem xmm", ("", 0)),
|
||||
("vmovshdup xmm xmm", ("1*p15", 1)),
|
||||
("vmovshdup mem xmm", ("", 0)),
|
||||
("vmovshdup ymm ymm", ("1*p15", 1)),
|
||||
("vmovshdup mem ymm", ("", 0)),
|
||||
("vmovshdup zmm zmm", ("1*p5", 1)),
|
||||
("vmovshdup mem zmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movsldup
|
||||
("movsldup xmm xmm", ("1*p15", 1)),
|
||||
("movsldup mem xmm", ("", 0)),
|
||||
("vmovsldup xmm xmm", ("1*p15", 1)),
|
||||
("vmovsldup mem xmm", ("", 0)),
|
||||
("vmovsldup ymm ymm", ("1*p15", 1)),
|
||||
("vmovsldup mem ymm", ("", 0)),
|
||||
("vmovsldup zmm zmm", ("1*p5", 1)),
|
||||
("vmovsldup mem zmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movss
|
||||
("movss xmm xmm", ("1*p015", 1)),
|
||||
("movss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm xmm", ("1*p015", 1)),
|
||||
("vmovss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm", ("1*p015", 1)),
|
||||
("vmovss xmm mem", ("", 0)),
|
||||
("movss mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movsx:movsxd
|
||||
("movsx gpr gpr", ("1*p0156", 1)),
|
||||
("movsx mem gpr", ("", 0)),
|
||||
("movsxd gpr gpr", ("", 0)),
|
||||
("movsxd mem gpr", ("", 0)),
|
||||
("movsb gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movsb mem gpr", ("", 0)), # AT&T version
|
||||
("movsw gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movsw mem gpr", ("", 0)), # AT&T version
|
||||
("movsl gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movsl mem gpr", ("", 0)), # AT&T version
|
||||
("movsq gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movsq mem gpr", ("", 0)), # AT&T version
|
||||
# https://www.felixcloutier.com/x86/movupd
|
||||
("movupd xmm xmm", ("1*p015", 1)),
|
||||
("movupd mem xmm", ("", 0)),
|
||||
("movupd xmm mem", ("", 0)),
|
||||
("vmovupd xmm xmm", ("1*p015", 1)),
|
||||
("vmovupd mem xmm", ("", 0)),
|
||||
("vmovupd xmm mem", ("", 0)),
|
||||
("vmovupd ymm ymm", ("1*p015", 1)),
|
||||
("vmovupd mem ymm", ("", 0)),
|
||||
("vmovupd ymm mem", ("", 0)),
|
||||
("vmovupd zmm zmm", ("1*p05", 1)),
|
||||
("vmovupd mem zmm", ("", 0)),
|
||||
("vmovupd zmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movups
|
||||
("movups xmm xmm", ("1*p015", 1)),
|
||||
("movups mem xmm", ("", 0)),
|
||||
("movups xmm mem", ("", 0)),
|
||||
("vmovups xmm xmm", ("1*p015", 1)),
|
||||
("vmovups mem xmm", ("", 0)),
|
||||
("vmovups xmm mem", ("", 0)),
|
||||
("vmovups ymm ymm", ("1*p015", 1)),
|
||||
("vmovups mem ymm", ("", 0)),
|
||||
("vmovups ymm mem", ("", 0)),
|
||||
("vmovups zmm zmm", ("1*p05", 1)),
|
||||
("vmovups mem zmm", ("", 0)),
|
||||
("vmovups zmm mem", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movzx
|
||||
("movzx gpr gpr", ("1*p0156", 1)),
|
||||
("movzx mem gpr", ("", 0)),
|
||||
("movzb gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movzb mem gpr", ("", 0)), # AT&T version
|
||||
("movzw gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movzw mem gpr", ("", 0)), # AT&T version
|
||||
("movzl gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movzl mem gpr", ("", 0)), # AT&T version
|
||||
("movzq gpr gpr", ("1*p0156", 1)), # AT&T version
|
||||
("movzq mem gpr", ("", 0)), # AT&T version
|
||||
# https://www.felixcloutier.com/x86/cmovcc
|
||||
("cmova gpr gpr", ("2*p06", 1)),
|
||||
("cmova mem gpr", ("", 0)),
|
||||
("cmovae gpr gpr", ("1*p06", 1)),
|
||||
("cmovae mem gpr", ("", 0)),
|
||||
("cmovb gpr gpr", ("2*p06", 1)),
|
||||
("cmovb mem gpr", ("", 0)),
|
||||
("cmovbe gpr gpr", ("2*p06", 1)),
|
||||
("cmovbe mem gpr", ("", 0)),
|
||||
("cmovc gpr gpr", ("1*p06", 1)),
|
||||
("cmovc mem gpr", ("", 0)),
|
||||
("cmove gpr gpr", ("1*p06", 1)),
|
||||
("cmove mem gpr", ("", 0)),
|
||||
("cmovg gpr gpr", ("1*p06", 1)),
|
||||
("cmovg mem gpr", ("", 0)),
|
||||
("cmovge gpr gpr", ("1*p06", 1)),
|
||||
("cmovge mem gpr", ("", 0)),
|
||||
("cmovl gpr gpr", ("1*p06", 1)),
|
||||
("cmovl mem gpr", ("", 0)),
|
||||
("cmovle gpr gpr", ("1*p06", 1)),
|
||||
("cmovle mem gpr", ("", 0)),
|
||||
("cmovna gpr gpr", ("2*p06", 1)),
|
||||
("cmovna mem gpr", ("", 0)),
|
||||
("cmovnae gpr gpr", ("1*p06", 1)),
|
||||
("cmovnae mem gpr", ("", 0)),
|
||||
("cmovnb gpr gpr", ("1*p06", 1)),
|
||||
("cmovnb mem gpr", ("", 0)),
|
||||
("cmovnbe gpr gpr", ("2*p06", 1)),
|
||||
("cmovnbe mem gpr", ("", 0)),
|
||||
("cmovnc gpr gpr", ("1*p06", 1)),
|
||||
("cmovnc mem gpr", ("", 0)),
|
||||
("cmovne gpr gpr", ("1*p06", 1)),
|
||||
("cmovne mem gpr", ("", 0)),
|
||||
("cmovng gpr gpr", ("1*p06", 1)),
|
||||
("cmovng mem gpr", ("", 0)),
|
||||
("cmovnge gpr gpr", ("1*p06", 1)),
|
||||
("cmovnge mem gpr", ("", 0)),
|
||||
("cmovnl gpr gpr", ("1*p06", 1)),
|
||||
("cmovnl mem gpr", ("", 0)),
|
||||
("cmovno gpr gpr", ("1*p06", 1)),
|
||||
("cmovno mem gpr", ("", 0)),
|
||||
("cmovnp gpr gpr", ("1*p06", 1)),
|
||||
("cmovnp mem gpr", ("", 0)),
|
||||
("cmovns gpr gpr", ("1*p06", 1)),
|
||||
("cmovns mem gpr", ("", 0)),
|
||||
("cmovnz gpr gpr", ("1*p06", 1)),
|
||||
("cmovnz mem gpr", ("", 0)),
|
||||
("cmovo gpr gpr", ("1*p06", 1)),
|
||||
("cmovo mem gpr", ("", 0)),
|
||||
("cmovp gpr gpr", ("1*p06", 1)),
|
||||
("cmovp mem gpr", ("", 0)),
|
||||
("cmovpe gpr gpr", ("1*p06", 1)),
|
||||
("cmovpe mem gpr", ("", 0)),
|
||||
("cmovpo gpr gpr", ("1*p06", 1)),
|
||||
("cmovpo mem gpr", ("", 0)),
|
||||
("cmovs gpr gpr", ("1*p06", 1)),
|
||||
("cmovs mem gpr", ("", 0)),
|
||||
("cmovz gpr gpr", ("1*p06", 1)),
|
||||
("cmovz mem gpr", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/pmovmskb
|
||||
("pmovmskb mm gpr", ("1*p0", 1)),
|
||||
("pmovmskb xmm gpr", ("1*p0", 1)),
|
||||
("vpmovmskb xmm gpr", ("1*p0", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovsx
|
||||
("pmovsxbw xmm xmm", ("1*p15", 1)),
|
||||
("pmovsxbw mem xmm", ("1*p15", 1)),
|
||||
("pmovsxbd xmm xmm", ("1*p15", 1)),
|
||||
("pmovsxbd mem xmm", ("1*p15", 1)),
|
||||
("pmovsxbq xmm xmm", ("1*p15", 1)),
|
||||
("pmovsxbq mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbw xmm xmm", ("1*p15", 1)),
|
||||
("vpmovsxbw mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbd xmm xmm", ("1*p15", 1)),
|
||||
("vpmovsxbd mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbq xmm xmm", ("1*p15", 1)),
|
||||
("vpmovsxbq mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbw xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbw mem ymm", ("1*p5", 1)),
|
||||
("vpmovsxbd xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbd mem ymm", ("1*p5", 1)),
|
||||
("vpmovsxbq xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbq mem ymm", ("1*p5", 1)),
|
||||
("vpmovsxbw ymm zmm", ("1*p5", 3)),
|
||||
("vpmovsxbw mem zmm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovzx
|
||||
("pmovzxbw xmm xmm", ("1*p15", 1)),
|
||||
("pmovzxbw mem xmm", ("1*p15", 1)),
|
||||
("vpmovzxbw xmm xmm", ("1*p15", 1)),
|
||||
("vpmovzxbw mem xmm", ("1*p15", 1)),
|
||||
("vpmovzxbw xmm ymm", ("1*p5", 1)),
|
||||
("vpmovzxbw mem ymm", ("1*p5", 1)),
|
||||
("vpmovzxbw ymm zmm", ("1*p5", 1)),
|
||||
("vpmovzxbw mem zmm", ("1*p5", 1)),
|
||||
#################################################################
|
||||
# https://www.felixcloutier.com/x86/movbe
|
||||
("movbe gpr mem", ("1*p15", 6)),
|
||||
("movbe mem gpr", ("1*p15", 6)),
|
||||
################################################
|
||||
# https://www.felixcloutier.com/x86/movapd
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movaps
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movddup
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movq2dq
|
||||
("movq2dq mm xmm", ("1*p0+1*p015", 1)),
|
||||
# https://www.felixcloutier.com/x86/movsd
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movshdup
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movsldup
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movss
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movupd
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/movups
|
||||
# TODO with masking!
|
||||
# https://www.felixcloutier.com/x86/pmovsx
|
||||
# TODO with masking!
|
||||
]
|
||||
|
||||
|
||||
class MOVEntryBuilderIntelWithPort7AGU(MOVEntryBuilder):
|
||||
# for HSW, BDW, SKX and CSX
|
||||
|
||||
def build_description(self, instruction_name, operand_types, port_pressure=[], latency=0):
|
||||
load, store = self.classify(operand_types)
|
||||
load, store, vec = self.classify(operand_types)
|
||||
|
||||
if load:
|
||||
port_pressure += [[1, "23"], [1, ["2D", "3D"]]]
|
||||
@@ -335,7 +1173,6 @@ snb_mov_instructions = [
|
||||
("movss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm xmm", ("1*p5", 1)),
|
||||
("vmovss mem xmm", ("", 0)),
|
||||
("vmovss xmm xmm", ("1*p5", 1)),
|
||||
("vmovss xmm mem", ("", 0)),
|
||||
("movss mem xmm", ("", 0)),
|
||||
# https://www.felixcloutier.com/x86/movsx:movsxd
|
||||
@@ -460,11 +1297,11 @@ snb_mov_instructions = [
|
||||
("vpmovsxbd mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbq xmm xmm", ("1*p15", 1)),
|
||||
("vpmovsxbq mem xmm", ("1*p15", 1)),
|
||||
("vpmovsxbw ymm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbw xmm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbw mem ymm", ("1*p15", 1)),
|
||||
("vpmovsxbd ymm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbd xmm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbd mem ymm", ("1*p15", 1)),
|
||||
("vpmovsxbq ymm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbq xmm ymm", ("1*p15", 1)),
|
||||
("vpmovsxbq mem ymm", ("1*p15", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovzx
|
||||
("pmovzxbw xmm xmm", ("1*p15", 1)),
|
||||
@@ -620,11 +1457,11 @@ hsw_mov_instructions = list(
|
||||
("vpmovsxbd mem xmm", ("1*p5", 1)),
|
||||
("vpmovsxbq xmm xmm", ("1*p5", 1)),
|
||||
("vpmovsxbq mem xmm", ("1*p5", 1)),
|
||||
("vpmovsxbw ymm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbw xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbw mem ymm", ("1*p5", 1)),
|
||||
("vpmovsxbd ymm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbd xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbd mem ymm", ("1*p5", 1)),
|
||||
("vpmovsxbq ymm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbq xmm ymm", ("1*p5", 1)),
|
||||
("vpmovsxbq mem ymm", ("1*p5", 1)),
|
||||
# https://www.felixcloutier.com/x86/pmovzx
|
||||
("pmovzxbw xmm xmm", ("1*p5", 1)),
|
||||
@@ -774,6 +1611,8 @@ def get_description(arch, rhs_comment=None):
|
||||
"bdw": "\n".join([p7.process_item(*item) for item in bdw_mov_instructions]),
|
||||
"skx": "\n".join([p7.process_item(*item) for item in skx_mov_instructions]),
|
||||
"csx": "\n".join([p7.process_item(*item) for item in csx_mov_instructions]),
|
||||
"icx": "\n".join([p9.process_item(*item) for item in icx_mov_instructions]),
|
||||
"zen3": "\n".join([z3.process_item(*item) for item in zen3_mov_instructions]),
|
||||
}
|
||||
|
||||
description = descriptions[arch]
|
||||
@@ -795,7 +1634,7 @@ if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: {} (snb|ivb|hsw|bdw|skx|csx)".format(sys.argv[0]))
|
||||
print("Usage: {} (snb|ivb|hsw|bdw|skx|csx|icx|zen3)".format(sys.argv[0]))
|
||||
sys.exit(0)
|
||||
|
||||
try:
|
||||
|
||||
10285
osaca/data/icx.yml
Normal file
10285
osaca/data/icx.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -3292,6 +3292,27 @@ instruction_forms:
|
||||
name: "xmm"
|
||||
source: true
|
||||
destination: true
|
||||
- name: push
|
||||
operands:
|
||||
- class: "memory"
|
||||
base: "*"
|
||||
offset: "*"
|
||||
index: "*"
|
||||
scale: "*"
|
||||
source: true
|
||||
destination: false
|
||||
hidden_operands:
|
||||
- class: "memory"
|
||||
base: {name: 'rsp'}
|
||||
offset: ~
|
||||
index: ~
|
||||
scale: 1
|
||||
source: false
|
||||
destination: true
|
||||
- class: "register"
|
||||
name: "rsp"
|
||||
source: true
|
||||
destination: true
|
||||
- name: push
|
||||
operands:
|
||||
- class: "register"
|
||||
@@ -3403,6 +3424,12 @@ instruction_forms:
|
||||
name: "ID"
|
||||
source: true
|
||||
destination: false
|
||||
- name: [sar, sal, shl, shr]
|
||||
operands:
|
||||
- class: "register"
|
||||
name: "gpr"
|
||||
source: true
|
||||
destination: true
|
||||
- name: sbb
|
||||
operands:
|
||||
- class: "register"
|
||||
|
||||
5146
osaca/data/zen3.yml
Normal file
5146
osaca/data/zen3.yml
Normal file
File diff suppressed because it is too large
Load Diff
@@ -260,7 +260,7 @@ def _create_db_operand_aarch64(operand):
|
||||
|
||||
def _create_db_operand_x86(operand):
|
||||
"""Get DB operand for x86 by operand string."""
|
||||
if operand == "r":
|
||||
if operand.startswith("r"):
|
||||
return {"class": "register", "name": "gpr"}
|
||||
elif operand in "xyz":
|
||||
return {"class": "register", "name": operand + "mm"}
|
||||
|
||||
@@ -27,8 +27,10 @@ SUPPORTED_ARCHS = [
|
||||
"SKX",
|
||||
"CSX",
|
||||
"ICL",
|
||||
"ICX",
|
||||
"ZEN1",
|
||||
"ZEN2",
|
||||
"ZEN3",
|
||||
"TX2",
|
||||
"N1",
|
||||
"A64FX",
|
||||
@@ -37,7 +39,7 @@ SUPPORTED_ARCHS = [
|
||||
]
|
||||
DEFAULT_ARCHS = {
|
||||
"aarch64": "A64FX",
|
||||
"x86": "SKX",
|
||||
"x86": "ICX",
|
||||
}
|
||||
|
||||
|
||||
@@ -96,7 +98,7 @@ def create_parser(parser=None):
|
||||
parser.add_argument(
|
||||
"--arch",
|
||||
type=str,
|
||||
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ZEN1, ZEN2, TX2, N1, "
|
||||
help="Define architecture (SNB, IVB, HSW, BDW, SKX, CSX, ICL, ICX, ZEN1, ZEN2, ZEN3, TX2, N1, "
|
||||
"A64FX, TSV110, A72). If no architecture is given, OSACA assumes a default uarch for x86/AArch64.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -328,6 +330,7 @@ def inspect(args, output_file=sys.stdout):
|
||||
# Do optimal schedule for kernel throughput if wished
|
||||
if not args.fixed:
|
||||
semantics.assign_optimal_throughput(kernel)
|
||||
semantics.assign_optimal_throughput(kernel)
|
||||
|
||||
# Create DiGraphs
|
||||
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Semantics object responsible for architecture specific semantic operations"""
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
from itertools import chain
|
||||
from operator import itemgetter
|
||||
from copy import deepcopy
|
||||
|
||||
from .hw_model import MachineModel
|
||||
from .isa_semantics import INSTR_FLAGS, ISASemantics
|
||||
@@ -31,7 +33,7 @@ class ArchSemantics(ISASemantics):
|
||||
if self._machine_model.has_hidden_loads():
|
||||
self.set_hidden_loads(kernel)
|
||||
|
||||
def assign_optimal_throughput(self, kernel):
|
||||
def assign_optimal_throughput(self, kernel, start=0):
|
||||
"""
|
||||
Assign optimal throughput port pressure to a kernel. This is done in steps of ``0.01cy``.
|
||||
|
||||
@@ -40,7 +42,26 @@ class ArchSemantics(ISASemantics):
|
||||
INC = 0.01
|
||||
kernel.reverse()
|
||||
port_list = self._machine_model.get_ports()
|
||||
for instruction_form in kernel:
|
||||
for idx, instruction_form in enumerate(kernel[start:], start):
|
||||
multiple_assignments = False
|
||||
# if iform has multiple possible port assignments, check all in a DFS manner and take the best
|
||||
if isinstance(instruction_form["port_uops"], dict):
|
||||
best_kernel = None
|
||||
best_kernel_tp = sys.maxsize
|
||||
for port_util_alt in list(instruction_form["port_uops"].values())[1:]:
|
||||
k_tmp = deepcopy(kernel)
|
||||
k_tmp[idx]["port_uops"] = deepcopy(port_util_alt)
|
||||
k_tmp[idx]["port_pressure"] = self._machine_model.average_port_pressure(
|
||||
k_tmp[idx]["port_uops"]
|
||||
)
|
||||
k_tmp.reverse()
|
||||
self.assign_optimal_throughput(k_tmp, idx)
|
||||
if max(self.get_throughput_sum(k_tmp)) < best_kernel_tp:
|
||||
best_kernel = k_tmp
|
||||
best_kernel_tp = max(self.get_throughput_sum(best_kernel))
|
||||
# check the first option in the main branch and compare against the best option later
|
||||
multiple_assignments = True
|
||||
kernel[idx]["port_uops"] = list(instruction_form["port_uops"].values())[0]
|
||||
for uop in instruction_form["port_uops"]:
|
||||
cycles = uop[0]
|
||||
ports = list(uop[1])
|
||||
@@ -84,6 +105,7 @@ class ArchSemantics(ISASemantics):
|
||||
p
|
||||
for p in indices
|
||||
if round(instruction_form["port_pressure"][p], 2) == 0
|
||||
or instruction_form["port_pressure"][p] < 0.00
|
||||
][0]
|
||||
instruction_form["port_pressure"][zero_index] = 0.0
|
||||
# Remove from further balancing
|
||||
@@ -108,6 +130,11 @@ class ArchSemantics(ISASemantics):
|
||||
itemgetter(*indices)(self.get_throughput_sum(kernel))
|
||||
)
|
||||
kernel.reverse()
|
||||
if multiple_assignments:
|
||||
if max(self.get_throughput_sum(kernel)) > best_kernel_tp:
|
||||
for i, instr in enumerate(best_kernel):
|
||||
kernel[i]["port_uops"] = best_kernel[i]["port_uops"]
|
||||
kernel[i]["port_pressure"] = best_kernel[i]["port_pressure"]
|
||||
|
||||
def set_hidden_loads(self, kernel):
|
||||
"""Hide loads behind stores if architecture supports hidden loads (deprecated)"""
|
||||
@@ -209,11 +236,12 @@ class ArchSemantics(ISASemantics):
|
||||
operands.index(self._create_reg_wildcard())
|
||||
]
|
||||
)
|
||||
dummy_reg = {"class": "register", "name": reg_type}
|
||||
data_port_pressure = [0.0 for _ in range(port_number)]
|
||||
data_port_uops = []
|
||||
if INSTR_FLAGS.HAS_LD in instruction_form["flags"]:
|
||||
# LOAD performance data
|
||||
data_port_uops = self._machine_model.get_load_throughput(
|
||||
load_perf_data = self._machine_model.get_load_throughput(
|
||||
[
|
||||
x["memory"]
|
||||
for x in instruction_form["semantic_operands"]["source"]
|
||||
@@ -221,6 +249,19 @@ class ArchSemantics(ISASemantics):
|
||||
if "memory" in x
|
||||
][0]
|
||||
)
|
||||
# if multiple options, choose based on reg type
|
||||
data_port_uops = [
|
||||
ldp["port_pressure"]
|
||||
for ldp in load_perf_data
|
||||
if "dst" in ldp
|
||||
and self._machine_model._check_operands(
|
||||
dummy_reg, {"register": {"name": ldp["dst"]}}
|
||||
)
|
||||
]
|
||||
if len(data_port_uops) < 1:
|
||||
data_port_uops = load_perf_data[0]["port_pressure"]
|
||||
else:
|
||||
data_port_uops = data_port_uops[0]
|
||||
data_port_pressure = self._machine_model.average_port_pressure(
|
||||
data_port_uops
|
||||
)
|
||||
@@ -235,9 +276,22 @@ class ArchSemantics(ISASemantics):
|
||||
instruction_form["semantic_operands"]["destination"]
|
||||
+ instruction_form["semantic_operands"]["src_dst"]
|
||||
)
|
||||
st_data_port_uops = self._machine_model.get_store_throughput(
|
||||
store_perf_data = self._machine_model.get_store_throughput(
|
||||
[x["memory"] for x in destinations if "memory" in x][0]
|
||||
)
|
||||
# if multiple options, choose based on reg type
|
||||
st_data_port_uops = [
|
||||
stp["port_pressure"]
|
||||
for stp in store_perf_data
|
||||
if "src" in stp
|
||||
and self._machine_model._check_operands(
|
||||
dummy_reg, {"register": {"name": stp["src"]}}
|
||||
)
|
||||
]
|
||||
if len(st_data_port_uops) < 1:
|
||||
st_data_port_uops = store_perf_data[0]["port_pressure"]
|
||||
else:
|
||||
st_data_port_uops = st_data_port_uops[0]
|
||||
# zero data port pressure and remove HAS_ST flag if
|
||||
# - no mem operand in dst &&
|
||||
# - all mem operands in src_dst are pre-/post-indexed
|
||||
|
||||
@@ -143,11 +143,16 @@ class MachineModel(object):
|
||||
print("\nname: {}\noperands: {}".format(name, operands))
|
||||
raise TypeError from e
|
||||
|
||||
def average_port_pressure(self, port_pressure):
|
||||
def average_port_pressure(self, port_pressure, option=0):
|
||||
"""Construct average port pressure list from instruction data."""
|
||||
port_list = self._data["ports"]
|
||||
average_pressure = [0.0] * len(port_list)
|
||||
for cycles, ports in port_pressure:
|
||||
# if there are multiple port utilization options and none is selected, choose first one
|
||||
if isinstance(port_pressure, dict):
|
||||
used_pp = port_pressure[option]
|
||||
else:
|
||||
used_pp = port_pressure
|
||||
for cycles, ports in used_pp:
|
||||
for p in ports:
|
||||
try:
|
||||
average_pressure[port_list.index(p)] += cycles / len(ports)
|
||||
@@ -221,8 +226,8 @@ class MachineModel(object):
|
||||
"""Return load throughput for given register type."""
|
||||
ld_tp = [m for m in self._data["load_throughput"] if self._match_mem_entries(memory, m)]
|
||||
if len(ld_tp) > 0:
|
||||
return ld_tp[0]["port_pressure"].copy()
|
||||
return self._data["load_throughput_default"].copy()
|
||||
return ld_tp.copy()
|
||||
return [{"port_pressure": self._data["load_throughput_default"].copy()}]
|
||||
|
||||
def get_store_latency(self, reg_type):
|
||||
"""Return store latency for given register type."""
|
||||
@@ -233,8 +238,8 @@ class MachineModel(object):
|
||||
"""Return store throughput for given register type."""
|
||||
st_tp = [m for m in self._data["store_throughput"] if self._match_mem_entries(memory, m)]
|
||||
if len(st_tp) > 0:
|
||||
return st_tp[0]["port_pressure"].copy()
|
||||
return self._data["store_throughput_default"].copy()
|
||||
return st_tp.copy()
|
||||
return [{"port_pressure": self._data["store_throughput_default"].copy()}]
|
||||
|
||||
def _match_mem_entries(self, mem, i_mem):
|
||||
"""Check if memory addressing ``mem`` and ``i_mem`` are of the same type."""
|
||||
@@ -273,7 +278,7 @@ class MachineModel(object):
|
||||
"zen1": "x86",
|
||||
"zen+": "x86",
|
||||
"zen2": "x86",
|
||||
"icl": "x86",
|
||||
"zen3": "x86",
|
||||
"con": "x86", # Intel Conroe
|
||||
"wol": "x86", # Intel Wolfdale
|
||||
"snb": "x86",
|
||||
@@ -289,6 +294,7 @@ class MachineModel(object):
|
||||
"cnl": "x86",
|
||||
"cfl": "x86",
|
||||
"icl": "x86",
|
||||
"icx": "x86",
|
||||
}
|
||||
arch = arch.lower()
|
||||
if arch in arch_dict:
|
||||
@@ -690,6 +696,8 @@ class MachineModel(object):
|
||||
return False
|
||||
return True
|
||||
else:
|
||||
if reg["name"].rstrip(string.digits).lower() == i_reg_name:
|
||||
return True
|
||||
if i_reg_name == "gpr":
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -35,7 +35,7 @@ def find_marked_kernel_AArch64(lines):
|
||||
:param list lines: kernel
|
||||
:returns: `tuple of int` -- start and end line of marked section
|
||||
"""
|
||||
nop_bytes = ["213", "3", "32", "31"]
|
||||
nop_bytes = [213, 3, 32, 31]
|
||||
return find_marked_section(
|
||||
lines,
|
||||
ParserAArch64(),
|
||||
@@ -55,7 +55,7 @@ def find_marked_kernel_x86ATT(lines):
|
||||
:param list lines: kernel
|
||||
:returns: `tuple of int` -- start and end line of marked section
|
||||
"""
|
||||
nop_bytes = ["100", "103", "144"]
|
||||
nop_bytes = [100, 103, 144]
|
||||
return find_marked_section(
|
||||
lines,
|
||||
ParserX86ATT(),
|
||||
@@ -186,7 +186,7 @@ def match_bytes(lines, index, byte_list):
|
||||
and lines[index].directive.name == "byte"
|
||||
):
|
||||
line_count += 1
|
||||
extracted_bytes += lines[index].directive.parameters
|
||||
extracted_bytes += [int(x, 0) for x in lines[index].directive.parameters]
|
||||
index += 1
|
||||
if extracted_bytes[0 : len(byte_list)] == byte_list:
|
||||
return True, line_count
|
||||
|
||||
@@ -61,6 +61,24 @@ port_model_scheme: |
|
||||
+-------+ | VNNI |
|
||||
+-------+
|
||||
instruction_forms:
|
||||
- name: fantasyinstr1
|
||||
operands:
|
||||
- class: register
|
||||
name: gpr
|
||||
- class: register
|
||||
name: gpr
|
||||
port_pressure: {0: [[1, '015']], 1: [[1, '56']]}
|
||||
throughput: 0.333333
|
||||
latency: 1.0
|
||||
- name: fantasyinstr2
|
||||
operands:
|
||||
- class: register
|
||||
name: gpr
|
||||
- class: register
|
||||
name: gpr
|
||||
port_pressure: [[1, '0'], [1, '1'], [1, '5']]
|
||||
throughput: 0.5
|
||||
latency: 1.0
|
||||
- name: LEA
|
||||
operands:
|
||||
- class: memory
|
||||
|
||||
@@ -53,6 +53,8 @@ class TestMarkerUtils(unittest.TestCase):
|
||||
bytes_3_lines_2 = ".byte 213\n" + ".byte 3,32\n" + ".byte 31\n"
|
||||
bytes_3_lines_3 = ".byte 213\n" + ".byte 3\n" + ".byte 32,31\n"
|
||||
bytes_4_lines = ".byte 213\n" + ".byte 3\n" + ".byte 32\n" + ".byte 31\n"
|
||||
bytes_hex = ".byte 0xd5, 0x3, 0x20, 0x1f\n"
|
||||
bytes_mixed = ".byte 0xd5\n.byte 3,0x20\n.byte 31\n"
|
||||
mov_start_1 = "mov x1, #111\n"
|
||||
mov_start_2 = "mov x1, 111 // should work as well\n"
|
||||
mov_end_1 = "mov x1, #222 // preferred way\n"
|
||||
@@ -80,6 +82,8 @@ class TestMarkerUtils(unittest.TestCase):
|
||||
bytes_3_lines_2,
|
||||
bytes_3_lines_3,
|
||||
bytes_4_lines,
|
||||
bytes_hex,
|
||||
bytes_mixed,
|
||||
]
|
||||
mov_start_variations = [mov_start_1, mov_start_2]
|
||||
mov_end_variations = [mov_end_1, mov_end_2]
|
||||
@@ -129,6 +133,8 @@ class TestMarkerUtils(unittest.TestCase):
|
||||
+ ".byte 103 # IACA MARKER UTILITY\n"
|
||||
+ ".byte 144 # IACA MARKER UTILITY\n"
|
||||
)
|
||||
bytes_hex_line = ".byte 0x64,0x67,0x90\n"
|
||||
bytes_mixed = ".byte 0x64 # MARKER\n .byte 103,0x90 # ANOTHER MARKER\n"
|
||||
mov_start_1 = "movl $111, %ebx # IACA START\n"
|
||||
mov_start_2 = "mov $111, %ebx # IACA START\n"
|
||||
mov_end_1 = "movl $222, %ebx # IACA END\n"
|
||||
@@ -148,6 +154,8 @@ class TestMarkerUtils(unittest.TestCase):
|
||||
bytes_2_lines_1,
|
||||
bytes_2_lines_2,
|
||||
bytes_3_lines,
|
||||
bytes_hex_line,
|
||||
bytes_mixed,
|
||||
]
|
||||
mov_start_variations = [mov_start_1, mov_start_2]
|
||||
mov_end_variations = [mov_end_1, mov_end_2]
|
||||
|
||||
@@ -175,7 +175,7 @@ class TestSemanticTools(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
test_mm_x86.get_store_throughput(
|
||||
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
||||
),
|
||||
)[0]["port_pressure"],
|
||||
[[2, "237"], [2, "4"]],
|
||||
)
|
||||
self.assertEqual(
|
||||
@@ -186,13 +186,13 @@ class TestSemanticTools(unittest.TestCase):
|
||||
"index": "NOT_NONE",
|
||||
"scale": 1,
|
||||
}
|
||||
),
|
||||
)[0]["port_pressure"],
|
||||
[[1, "23"], [1, "4"]],
|
||||
)
|
||||
self.assertEqual(
|
||||
test_mm_arm.get_store_throughput(
|
||||
{"base": {"prefix": "x"}, "offset": None, "index": None, "scale": 1}
|
||||
),
|
||||
)[0]["port_pressure"],
|
||||
[[2, "34"], [2, "5"]],
|
||||
)
|
||||
self.assertEqual(
|
||||
@@ -203,7 +203,7 @@ class TestSemanticTools(unittest.TestCase):
|
||||
"index": None,
|
||||
"scale": 1,
|
||||
}
|
||||
),
|
||||
)[0]["port_pressure"],
|
||||
[[1, "34"], [1, "5"]],
|
||||
)
|
||||
|
||||
@@ -228,7 +228,7 @@ class TestSemanticTools(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
test_mm_x86.get_load_throughput(
|
||||
{"base": {"name": "x"}, "offset": None, "index": None, "scale": 1}
|
||||
),
|
||||
)[0]["port_pressure"],
|
||||
[[1, "23"], [1, ["2D", "3D"]]],
|
||||
)
|
||||
|
||||
@@ -288,6 +288,21 @@ class TestSemanticTools(unittest.TestCase):
|
||||
tp_optimal = self.semantics_csx.get_throughput_sum(kernel_optimal)
|
||||
self.assertNotEqual(tp_fixed, tp_optimal)
|
||||
self.assertTrue(max(tp_optimal) <= max(tp_fixed))
|
||||
# test multiple port assignment options
|
||||
test_mm_x86 = MachineModel(path_to_yaml=self._find_file("test_db_x86.yml"))
|
||||
tmp_semantics = ArchSemantics(test_mm_x86)
|
||||
tmp_code_1 = "fantasyinstr1 %rax, %rax\n"
|
||||
tmp_code_2 = "fantasyinstr1 %rax, %rax\nfantasyinstr2 %rbx, %rbx\n"
|
||||
tmp_kernel_1 = self.parser_x86.parse_file(tmp_code_1)
|
||||
tmp_kernel_2 = self.parser_x86.parse_file(tmp_code_2)
|
||||
tmp_semantics.add_semantics(tmp_kernel_1)
|
||||
tmp_semantics.add_semantics(tmp_kernel_2)
|
||||
tmp_semantics.assign_optimal_throughput(tmp_kernel_1)
|
||||
tmp_semantics.assign_optimal_throughput(tmp_kernel_2)
|
||||
k1i1_pp = [round(x, 2) for x in tmp_kernel_1[0]["port_pressure"]]
|
||||
k2i1_pp = [round(x, 2) for x in tmp_kernel_2[0]["port_pressure"]]
|
||||
self.assertEqual(k1i1_pp, [0.33, 0.0, 0.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 0.0, 0.0])
|
||||
self.assertEqual(k2i1_pp, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
|
||||
|
||||
# arm
|
||||
kernel_fixed = deepcopy(self.kernel_AArch64)
|
||||
|
||||
Reference in New Issue
Block a user