Add support for the Intel syntax supported by MSVC and ICC

This commit is contained in:
pleroy
2025-02-02 14:02:16 +01:00
parent 785a365c63
commit 1a7c1588f6
30 changed files with 2744 additions and 499 deletions

View File

@@ -111,7 +111,8 @@ def extract_model(tree, arch, skip_mem=True):
print("Skipping...", file=sys.stderr)
return None
mm = MachineModel(isa=isa)
parser = get_parser(isa)
# The model uses the AT&T syntax.
parser = get_parser(isa, "ATT")
for instruction_tag in tree.findall(".//instruction"):
ignore = False

View File

@@ -11,7 +11,7 @@ from ruamel.yaml import YAML
from osaca.db_interface import import_benchmark_output, sanity_check
from osaca.frontend import Frontend
from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
from osaca.parser import BaseParser, ParserAArch64, ParserX86, ParserX86ATT, ParserX86Intel
from osaca.semantics import (
INSTR_FLAGS,
ArchSemantics,
@@ -47,6 +47,10 @@ DEFAULT_ARCHS = {
"aarch64": "V2",
"x86": "SPR",
}
SUPPORTED_SYNTAXES = [
"ATT",
"INTEL",
]
# Stolen from pip
@@ -108,6 +112,12 @@ def create_parser(parser=None):
"ZEN4, TX2, N1, A64FX, TSV110, A72, M1, V2). If no architecture is given, OSACA assumes a "
"default uarch for x86/AArch64.",
)
parser.add_argument(
"--syntax",
type=str,
help="Define the assembly syntax (ATT, Intel) for x86. If no syntax is given, OSACA "
"tries to determine automatically the syntax to use.",
)
parser.add_argument(
"--fixed",
action="store_true",
@@ -232,6 +242,14 @@ def check_arguments(args, parser):
parser.error(
"Microarchitecture not supported. Please see --help for all valid architecture codes."
)
if args.syntax and args.arch and MachineModel.get_isa_for_arch(args.arch) != "x86":
parser.error(
"Syntax can only be explicitly specified for an x86 microarchitecture"
)
if args.syntax and args.syntax.upper() not in SUPPORTED_SYNTAXES:
parser.error(
"Assembly syntax not supported. Please see --help for all valid assembly syntaxes."
)
if "import_data" in args and args.import_data not in supported_import_files:
parser.error(
"Microbenchmark not supported for data import. Please see --help for all valid "
@@ -310,30 +328,56 @@ def inspect(args, output_file=sys.stdout):
code = args.file.read()
# Detect ISA if necessary
arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
print_arch_warning = False if args.arch else True
isa = MachineModel.get_isa_for_arch(arch)
detected_isa, detected_syntax = BaseParser.detect_ISA(code)
detected_arch = DEFAULT_ARCHS[detected_isa]
print_arch_warning = not args.arch
verbose = args.verbose
ignore_unknown = args.ignore_unknown
# Parse file
parser = get_asm_parser(arch)
try:
parsed_code = parser.parse_file(code)
except Exception as e:
# probably the wrong parser based on heuristic
if args.arch is None:
# change ISA and try again
arch = (
DEFAULT_ARCHS["x86"]
if BaseParser.detect_ISA(code) == "aarch64"
else DEFAULT_ARCHS["aarch64"]
)
isa = MachineModel.get_isa_for_arch(arch)
parser = get_asm_parser(arch)
# If the arch/syntax is explicitly specified, that's the only thing we'll try. Otherwise, we'll
# look at all the possible archs/syntaxes, but with our detected arch/syntax last in the list,
# thus tried first.
if args.arch:
archs_to_try = [args.arch]
else:
archs_to_try = list(DEFAULT_ARCHS)
archs_to_try.remove(detected_arch)
archs_to_try.append(detected_arch)
if args.syntax:
syntaxes_to_try = [args.syntax]
else:
syntaxes_to_try = SUPPORTED_SYNTAXES + [None]
syntaxes_to_try.remove(detected_syntax)
syntaxes_to_try.append(detected_syntax)
# Filter the cross-product of archs and syntaxes to eliminate the combinations that don't make
# sense.
combinations_to_try = [
(arch, syntax)
for arch in archs_to_try
for syntax in syntaxes_to_try
if (syntax != None) == (MachineModel.get_isa_for_arch(arch) == "x86")
]
# Parse file.
message = ""
single_combination = len(combinations_to_try) == 1
while True:
arch, syntax = combinations_to_try.pop()
parser = get_asm_parser(arch, syntax)
try:
parsed_code = parser.parse_file(code)
else:
raise e
break
except Exception as e:
message += f"\nWith arch {arch} and syntax {syntax} got error: {e}."
# Either the wrong parser based on heuristic, or a bona fide syntax error (or
# unsupported syntax). For ease of debugging, we emit the entire exception trace if
# we tried a single arch/syntax combination. If we tried multiple combinations, we
# don't emit the traceback as it would apply to the latest combination tried, which is
# probably the least interesting.
if not combinations_to_try:
raise SyntaxError(message) from e if single_combination else None
# Reduce to marked kernel or chosen section and add semantics
if args.lines:
@@ -341,13 +385,14 @@ def inspect(args, output_file=sys.stdout):
kernel = [line for line in parsed_code if line.line_number in line_range]
print_length_warning = False
else:
kernel = reduce_to_section(parsed_code, isa)
kernel = reduce_to_section(parsed_code, parser)
# Print warning if kernel has no markers and is larger than threshold (100)
print_length_warning = (
True if len(kernel) == len(parsed_code) and len(kernel) > 100 else False
)
machine_model = MachineModel(arch=arch)
semantics = ArchSemantics(machine_model)
semantics = ArchSemantics(parser, machine_model)
semantics.normalize_instruction_forms(kernel)
semantics.add_semantics(kernel)
# Do optimal schedule for kernel throughput if wished
if not args.fixed:
@@ -417,7 +462,7 @@ def run(args, output_file=sys.stdout):
@lru_cache()
def get_asm_parser(arch) -> BaseParser:
def get_asm_parser(arch, syntax) -> BaseParser:
"""
Helper function to create the right parser for a specific architecture.
@@ -427,7 +472,7 @@ def get_asm_parser(arch) -> BaseParser:
"""
isa = MachineModel.get_isa_for_arch(arch)
if isa == "x86":
return ParserX86ATT()
return ParserX86ATT() if syntax == "ATT" else ParserX86Intel()
elif isa == "aarch64":
return ParserAArch64()

View File

@@ -1,11 +1,13 @@
"""
Collection of parsers supported by OSACA.
Only the parser below will be exported, so please add new parsers to __all__.
Only the parsers below will be exported, so please add new parsers to __all__.
"""
from .base_parser import BaseParser
from .parser_x86 import ParserX86
from .parser_x86att import ParserX86ATT
from .parser_x86intel import ParserX86Intel
from .parser_AArch64 import ParserAArch64
from .instruction_form import InstructionForm
from .operand import Operand
@@ -14,15 +16,17 @@ __all__ = [
"Operand",
"InstructionForm",
"BaseParser",
"ParserX86",
"ParserX86ATT",
"ParserX86Intel",
"ParserAArch64",
"get_parser",
]
def get_parser(isa):
def get_parser(isa, syntax):
if isa.lower() == "x86":
return ParserX86ATT()
return ParserX86ATT() if syntax == "ATT" else ParserX86Intel()
elif isa.lower() == "aarch64":
return ParserAArch64()
else:

View File

@@ -3,6 +3,8 @@
import operator
import re
from osaca.semantics.hw_model import MachineModel
class BaseParser(object):
# Identifiers for operand types
@@ -25,20 +27,62 @@ class BaseParser(object):
self.construct_parser()
self._parser_constructed = True
def isa(self):
    """Return the identifier of the ISA handled by this parser (provided by derived classes)."""
    # Done in derived classes
    raise NotImplementedError
# The marker functions return lists of `InstructionForm` that are used to find the IACA markers
# in the parsed code. In addition to just a list, the marker may have a structure like
# [I1, [I2, I3], I4, ...] where the nested list indicates that at least one of I2 and I3 must
# match the second instruction in the fragment of parsed code.
# If an instruction form is a `DirectiveOperand`, the match may happen over several directive
# operands in the parsed code, provided that the directives have the same name and the
# parameters are in sequence with respect to the pattern. This provides an easy way to describe
# a sequence of bytes irrespective of the way it was grouped in the assembly source.
# Note that markers must be matched *before* normalization.
def start_marker(self):
    """
    Return the pattern of instruction forms that marks the start of the region to analyze.

    The pattern is a list of `InstructionForm`; a nested list stands for alternatives, of which
    at least one must match the instruction at that position. Provided by derived classes.
    """
    # Done in derived classes
    raise NotImplementedError
def end_marker(self):
    """
    Return the pattern of instruction forms that marks the end of the region to analyze.

    The pattern is a list of `InstructionForm`; a nested list stands for alternatives, of which
    at least one must match the instruction at that position. Provided by derived classes.
    """
    # Done in derived classes
    raise NotImplementedError
# Performs all the normalization needed to match the instruction to the ISA/arch model. This
# method must set the `normalized` property of the instruction and must be idempotent.
def normalize_instruction_form(
    self,
    instruction_form,
    isa_model: MachineModel,
    arch_model: MachineModel
):
    """
    Rewrite `instruction_form` in place so that it can be looked up in the machine models.

    Implementations must set the `normalized` property of the instruction form and must be
    idempotent. Provided by derived classes; this base version always raises.

    :param instruction_form: the parsed instruction form to normalize.
    :param isa_model: machine model of the ISA.
    :param arch_model: machine model of the micro-architecture.
    :raises NotImplementedError: always, in the base class.
    """
    raise NotImplementedError
@staticmethod
def detect_ISA(file_content):
    """
    Detect the ISA and the assembly syntax of the code based on the registers it uses.

    Every candidate (ISA, syntax) pair owns a list of regular expressions; the pair whose
    expressions match most often in `file_content` wins. On a tie (including empty input) the
    first pair in declaration order, ("x86", "ATT"), is returned.

    :param str file_content: assembly code.
    :return: a tuple isa, syntax describing the architecture and the assembly syntax,
             if appropriate. If there is no notion of syntax, the second element is None.
    """
    heuristics = {
        # 1) xmm, ymm, zmm, rax, rbx, rcx, and rdx registers in x86 AT&T syntax. There is a %
        #    before each register name.
        # NOTE(review): the trailing [0-9] of the second pattern looks like it prevents plain
        # `%rax` from being counted -- confirm whether that is intended before changing it.
        ("x86", "ATT"): [r"%[xyz]mm[0-9]", r"%[er][abcd]x[0-9]"],
        # 2) Same as above, but for the Intel syntax. There is no % before the register names.
        ("x86", "INTEL"): [r"[^%][xyz]mm[0-9]", r"[^%][er][abcd]x[0-9]"],
        # 3) v and z vector registers and x/w general-purpose registers.
        ("aarch64", None): [r"[vz][0-9][0-9]?\.[0-9][0-9]?[bhsd]", r"[wx][0-9]"],
    }
    matches = {
        candidate: sum(len(re.findall(pattern, file_content)) for pattern in patterns)
        for candidate, patterns in heuristics.items()
    }
    # max() keeps the first maximal entry, so ties resolve in declaration order.
    return max(matches.items(), key=operator.itemgetter(1))[0]
@@ -94,6 +138,14 @@ class BaseParser(object):
def get_full_reg_name(self, register):
raise NotImplementedError
# Must be called on a *normalized* instruction.
def get_regular_source_operands(self, instruction_form):
    """
    Return the source operands of `instruction_form` assuming regular src/dst behavior.

    Must be called on a *normalized* instruction. Provided by derived classes.
    """
    raise NotImplementedError
# Must be called on a *normalized* instruction.
def get_regular_destination_operands(self, instruction_form):
    """
    Return the destination operands of `instruction_form` assuming regular src/dst behavior.

    Must be called on a *normalized* instruction. Provided by derived classes.
    """
    raise NotImplementedError
def normalize_imd(self, imd):
raise NotImplementedError

View File

@@ -41,3 +41,12 @@ class IdentifierOperand(Operand):
def __repr__(self):
return self.__str__()
def __eq__(self, other):
    """Identifier operands compare equal when name, offset and relocation all agree."""
    # Anything that is not an identifier operand never compares equal.
    if not isinstance(other, IdentifierOperand):
        return False
    return (
        self._name == other._name
        and self._offset == other._offset
        and self._relocation == other._relocation
    )

View File

@@ -19,6 +19,7 @@ class InstructionForm:
port_pressure=None,
operation=None,
breaks_dependency_on_equal_operands=False,
normalized=False,
):
self._mnemonic = mnemonic
self._operands = operands
@@ -33,6 +34,7 @@ class InstructionForm:
self._operation = operation
self._uops = uops
self._breaks_dependency_on_equal_operands = breaks_dependency_on_equal_operands
self._normalized = normalized
self._latency = latency
self._throughput = throughput
self._latency_cp = []
@@ -42,6 +44,10 @@ class InstructionForm:
self._port_uops = []
self._flags = []
def check_normalized(self):
    """Raise `AssertionError` unless this instruction form has been marked as normalized."""
    if self._normalized:
        return
    raise AssertionError("Unnormalized instruction")
@property
def semantic_operands(self):
return self._semantic_operands
@@ -114,6 +120,10 @@ class InstructionForm:
def breaks_dependency_on_equal_operands(self):
return self._breaks_dependency_on_equal_operands
@property
def normalized(self):
    """Flag telling whether this instruction form has already been normalized."""
    return self._normalized
@semantic_operands.setter
def semantic_operands(self, semantic_operands):
self._semantic_operands = semantic_operands
@@ -142,6 +152,10 @@ class InstructionForm:
def breaks_dependency_on_equal_operands(self, boolean):
self._breaks_dependency_on_equal_operands = boolean
@normalized.setter
def normalized(self, normalized):
    """Record whether this instruction form has been normalized."""
    self._normalized = normalized
@mnemonic.setter
def mnemonic(self, mnemonic):
self._mnemonic = mnemonic

View File

@@ -20,3 +20,8 @@ class LabelOperand(Operand):
def __repr__(self):
return self.__str__()
def __eq__(self, other):
    """Label operands compare equal when they carry the same name."""
    # Anything that is not a label operand never compares equal.
    if not isinstance(other, LabelOperand):
        return False
    return self._name == other._name

View File

@@ -15,6 +15,7 @@ class MemoryOperand(Operand):
pre_indexed=False,
post_indexed=False,
indexed_val=None,
data_type=None,
src=None,
dst=None,
source=False,
@@ -30,6 +31,7 @@ class MemoryOperand(Operand):
self._pre_indexed = pre_indexed
self._post_indexed = post_indexed
self._indexed_val = indexed_val
self._data_type = data_type
# type of register we store from (`src`) or load to (`dst`)
self._src = src
self._dst = dst
@@ -74,6 +76,14 @@ class MemoryOperand(Operand):
def indexed_val(self):
return self._indexed_val
@property
def data_type(self):
    """Data type attached to this memory operand (e.g. "DWORD"), or None if there is none."""
    return self._data_type
@data_type.setter
def data_type(self, data_type):
    """Set the data type attached to this memory operand."""
    self._data_type = data_type
@property
def src(self):
return self._src

View File

@@ -13,6 +13,7 @@ from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.parser.condition import ConditionOperand
from osaca.parser.prefetch import PrefetchOperand
from osaca.semantics.hw_model import MachineModel
class ParserAArch64(BaseParser):
@@ -26,7 +27,58 @@ class ParserAArch64(BaseParser):
def __init__(self):
super().__init__()
self.isa = "aarch64"
def isa(self):
    """Return the identifier of the ISA handled by this parser, i.e. "aarch64"."""
    return "aarch64"
def start_marker(self):
    """
    Return the start marker pattern: `mov x1, 111` followed by the marker byte sequence.
    """
    load_marker_value = InstructionForm(
        mnemonic="mov",
        operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=111)]
    )
    marker_bytes = InstructionForm(
        directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
    )
    return [load_marker_value, marker_bytes]
def end_marker(self):
    """
    Return the end marker pattern: `mov x1, 222` followed by the marker byte sequence.
    """
    load_marker_value = InstructionForm(
        mnemonic="mov",
        operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=222)]
    )
    marker_bytes = InstructionForm(
        directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
    )
    return [load_marker_value, marker_bytes]
def normalize_instruction_form(
    self,
    instruction_form,
    isa_model: MachineModel,
    arch_model: MachineModel
):
    """
    Mark the instruction form as normalized and, when its mnemonic is unknown to the
    architecture model, fall back to the mnemonic without its "." suffix if the model knows
    that shorter form. Does nothing on an already normalized instruction form.
    """
    if instruction_form.normalized:
        return
    instruction_form.normalized = True

    full_name = instruction_form.mnemonic
    if not full_name:
        return
    # Nothing to do when the model already knows the mnemonic as written.
    if arch_model.get_instruction(full_name, instruction_form.operands):
        return
    if "." not in full_name:
        return
    # Check for instruction without shape/cc suffix.
    stem = full_name[:full_name.index(".")]
    if arch_model.get_instruction(stem, instruction_form.operands):
        instruction_form.mnemonic = stem
def construct_parser(self):
"""Create parser for ARM AArch64 ISA."""
@@ -589,6 +641,21 @@ class ParserAArch64(BaseParser):
name += "[" + str(register.index) + "]"
return name
def get_regular_source_operands(self, instruction_form):
    """Return the operands read by the instruction form under regular src/dst behavior."""
    operands = instruction_form.operands
    # A lone operand is taken to be a source; otherwise every operand after the first one is.
    if len(operands) == 1:
        return [operands[0]]
    return list(operands[1:])
def get_regular_destination_operands(self, instruction_form):
    """Return the operands written by the instruction form under regular src/dst behavior."""
    operands = instruction_form.operands
    # A lone operand is taken to be a source, so there is no destination in that case;
    # otherwise the destination is the first operand.
    return [] if len(operands) == 1 else operands[:1]
def normalize_imd(self, imd):
"""Normalize immediate to decimal based representation"""
if isinstance(imd, IdentifierOperand):

123
osaca/parser/parser_x86.py Normal file
View File

@@ -0,0 +1,123 @@
import re
import string
from osaca.parser import BaseParser
class ParserX86(BaseParser):
    """
    Syntax-independent part of the x86 parsers.

    Holds the register classification and dependency helpers as well as the regular
    source/destination operand conventions that the syntax-specific x86 parsers inherit.
    """

    _instance = None

    # Names of the classic general-purpose registers, grouped by the register they alias.
    _GPR_GROUPS = (
        frozenset({"RAX", "EAX", "AX", "AH", "AL"}),
        frozenset({"RBX", "EBX", "BX", "BH", "BL"}),
        frozenset({"RCX", "ECX", "CX", "CH", "CL"}),
        frozenset({"RDX", "EDX", "DX", "DH", "DL"}),
        frozenset({"RSP", "ESP", "SP", "SPL"}),
        frozenset({"RBP", "EBP", "BP", "BPL"}),
        frozenset({"RSI", "ESI", "SI", "SIL"}),
        frozenset({"RDI", "EDI", "DI", "DIL"}),
    )
    # Numbered general-purpose registers, optionally with a width suffix (e.g. R8, R10D).
    _NUMBERED_GPR = re.compile(r"R([0-9]+)[DWB]?", re.IGNORECASE)
    _VECTOR_PREFIXES = ("mm", "xmm", "ymm", "zmm")

    # Singleton pattern, as this is created very many times.
    def __new__(cls):
        """Return the instance cached on `cls._instance`, creating it on first use."""
        if cls._instance is None:
            cls._instance = super(ParserX86, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        super().__init__()

    def isa(self):
        """Return the identifier of the ISA handled by this parser, i.e. "x86"."""
        return "x86"

    def is_reg_dependend_of(self, reg_a, reg_b):
        """
        Check if ``reg_a`` is dependent on ``reg_b``.

        Two registers depend on each other when they have the same name (ignoring case), when
        they are vector registers whose names agree after the first character, when they are
        classic GPRs of the same alias family (e.g. ``rbp``/``ebp``), or when they are numbered
        GPRs with the same number (e.g. ``r8``/``r8d``).

        :return: `True` if the registers overlap, `False` otherwise.
        """
        reg_a_name = reg_a.name.upper()
        reg_b_name = reg_b.name.upper()

        # Check if they are the same registers
        if reg_a_name == reg_b_name:
            return True
        # Check vector registers first
        if self.is_vector_register(reg_a):
            # Registers in the same vector space only differ in their first character.
            return self.is_vector_register(reg_b) and reg_a_name[1:] == reg_b_name[1:]
        # Check basic GPRs
        if self.is_basic_gpr(reg_a):
            if not self.is_basic_gpr(reg_b):
                return False
            return any(
                reg_a_name in group and reg_b_name in group for group in self._GPR_GROUPS
            )
        # Check other GPRs
        ma = self._NUMBERED_GPR.match(reg_a_name)
        mb = self._NUMBERED_GPR.match(reg_b_name)
        # No dependencies unless both are numbered GPRs with the same number.
        return bool(ma and mb and ma.group(1) == mb.group(1))

    def is_basic_gpr(self, register):
        """
        Check if register is a basic general purpose register (ebx, rax, ...).

        Any register whose name contains no digit and does not start with a vector register
        prefix is considered basic.
        """
        name = register.name
        if any(char.isdigit() for char in name):
            return False
        return not any(name.lower().startswith(x) for x in self._VECTOR_PREFIXES)

    def is_gpr(self, register):
        """Check if register is a general purpose register (basic or numbered)."""
        if register is None:
            return False
        if self.is_basic_gpr(register):
            return True
        return bool(self._NUMBERED_GPR.match(register.name))

    def is_vector_register(self, register):
        """Check if register is a vector register (mm, xmm, ymm or zmm followed by digits)."""
        if register is None or register.name is None:
            return False
        return register.name.rstrip(string.digits).lower() in self._VECTOR_PREFIXES

    def get_reg_type(self, register):
        """
        Get register type.

        :return: ``False`` if `register` is None, ``"gpr"`` for general purpose registers, or
                 the lower-case vector register class ("mm", "xmm", "ymm", "zmm").
        :raises ValueError: if the register is neither a GPR nor a vector register.
        """
        if register is None:
            return False
        if self.is_gpr(register):
            return "gpr"
        elif self.is_vector_register(register):
            return register.name.rstrip(string.digits).lower()
        raise ValueError

    def is_flag_dependend_of(self, flag_a, flag_b):
        """Check if ``flag_a`` is dependent on ``flag_b``"""
        # we assume flags are independent of each other, e.g., CF can be read while ZF gets written
        # TODO validate this assumption
        return flag_a.name == flag_b.name

    def get_regular_source_operands(self, instruction_form):
        """Get source operand of given instruction form assuming regular src/dst behavior."""
        operands = instruction_form.operands
        # if there is only one operand, assume it is a source operand
        if len(operands) == 1:
            return [operands[0]]
        # return all but last operand
        return list(operands[0:-1])

    def get_regular_destination_operands(self, instruction_form):
        """Get destination operand of given instruction form assuming regular src/dst behavior."""
        # if there is only one operand, assume no destination
        if len(instruction_form.operands) == 1:
            return []
        # return last operand
        return instruction_form.operands[-1:]

View File

@@ -5,7 +5,7 @@ import re
import pyparsing as pp
from osaca.parser import BaseParser
from osaca.parser import ParserX86
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.directive import DirectiveOperand
from osaca.parser.memory import MemoryOperand
@@ -13,10 +13,12 @@ from osaca.parser.label import LabelOperand
from osaca.parser.register import RegisterOperand
from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.semantics.hw_model import MachineModel
class ParserX86ATT(BaseParser):
class ParserX86ATT(ParserX86):
_instance = None
GAS_SUFFIXES = "bswlqt"
# Singleton pattern, as this is created very many times
def __new__(cls):
@@ -26,7 +28,66 @@ class ParserX86ATT(BaseParser):
def __init__(self):
super().__init__()
self.isa = "x86"
def start_marker(self):
    """
    Return the start marker pattern: `mov`/`movl` of 111 into ebx (either mnemonic is
    accepted), followed by the marker byte sequence.
    """
    # Each alternative gets its own operand objects.
    load_alternatives = [
        InstructionForm(
            mnemonic=mnemonic,
            operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")]
        )
        for mnemonic in ("mov", "movl")
    ]
    marker_bytes = InstructionForm(
        directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
    )
    return [load_alternatives, marker_bytes]
def end_marker(self):
    """
    Return the end marker pattern: `mov`/`movl` of 222 into ebx (either mnemonic is
    accepted), followed by the marker byte sequence.
    """
    # Each alternative gets its own operand objects.
    load_alternatives = [
        InstructionForm(
            mnemonic=mnemonic,
            operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")]
        )
        for mnemonic in ("mov", "movl")
    ]
    marker_bytes = InstructionForm(
        directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
    )
    return [load_alternatives, marker_bytes]
def normalize_instruction_form(
    self,
    instruction_form,
    isa_model: MachineModel,
    arch_model: MachineModel
):
    """
    Mark the instruction form as normalized and, when its mnemonic is unknown to the
    architecture model, fall back to the mnemonic without its trailing GAS suffix if the model
    knows that shorter form. Does nothing on an already normalized instruction form.
    """
    if instruction_form.normalized:
        return
    instruction_form.normalized = True

    full_name = instruction_form.mnemonic
    if not full_name:
        return
    # Nothing to do when the model already knows the mnemonic as written.
    if arch_model.get_instruction(full_name, instruction_form.operands):
        return
    # Check for instruction without GAS suffix.
    if full_name[-1] not in self.GAS_SUFFIXES:
        return
    stem = full_name[:-1]
    if arch_model.get_instruction(stem, instruction_form.operands):
        instruction_form.mnemonic = stem
def construct_parser(self):
"""Create parser for x86 AT&T ISA."""
@@ -253,10 +314,10 @@ class ParserX86ATT(BaseParser):
if result is None:
try:
result = self.parse_instruction(line)
except pp.ParseException:
except pp.ParseException as e:
raise ValueError(
"Could not parse instruction on line {}: {!r}".format(line_number, line)
)
) from e
instruction_form.mnemonic = result.mnemonic
instruction_form.operands = result.operands
instruction_form.comment = result.comment
@@ -393,90 +454,3 @@ class ParserX86ATT(BaseParser):
return imd.value
# identifier
return imd
def is_flag_dependend_of(self, flag_a, flag_b):
"""Check if ``flag_a`` is dependent on ``flag_b``"""
# we assume flags are independent of each other, e.g., CF can be read while ZF gets written
# TODO validate this assumption
return flag_a.name == flag_b.name
def is_reg_dependend_of(self, reg_a, reg_b):
"""Check if ``reg_a`` is dependent on ``reg_b``"""
reg_a_name = reg_a.name.upper()
reg_b_name = reg_b.name.upper()
# Check if they are the same registers
if reg_a_name == reg_b_name:
return True
# Check vector registers first
if self.is_vector_register(reg_a):
if self.is_vector_register(reg_b):
if reg_a_name[1:] == reg_b_name[1:]:
# Registers in the same vector space
return True
return False
# Check basic GPRs
gpr_groups = {
"A": ["RAX", "EAX", "AX", "AH", "AL"],
"B": ["RBX", "EBX", "BX", "BH", "BL"],
"C": ["RCX", "ECX", "CX", "CH", "CL"],
"D": ["RDX", "EDX", "DX", "DH", "DL"],
"SP": ["RSP", "ESP", "SP", "SPL"],
"SRC": ["RSI", "ESI", "SI", "SIL"],
"DST": ["RDI", "EDI", "DI", "DIL"],
}
if self.is_basic_gpr(reg_a):
if self.is_basic_gpr(reg_b):
for dep_group in gpr_groups.values():
if reg_a_name in dep_group:
if reg_b_name in dep_group:
return True
return False
# Check other GPRs
ma = re.match(r"R([0-9]+)[DWB]?", reg_a_name)
mb = re.match(r"R([0-9]+)[DWB]?", reg_b_name)
if ma and mb and ma.group(1) == mb.group(1):
return True
# No dependencies
return False
def is_basic_gpr(self, register):
"""Check if register is a basic general purpose register (ebi, rax, ...)"""
if any(char.isdigit() for char in register.name) or any(
register.name.lower().startswith(x) for x in ["mm", "xmm", "ymm", "zmm"]
):
return False
return True
def is_gpr(self, register):
"""Check if register is a general purpose register"""
if register is None:
return False
if self.is_basic_gpr(register):
return True
return re.match(r"R([0-9]+)[DWB]?", register.name, re.IGNORECASE)
def is_vector_register(self, register):
"""Check if register is a vector register"""
if register is None or register.name is None:
return False
if register.name.rstrip(string.digits).lower() in [
"mm",
"xmm",
"ymm",
"zmm",
]:
return True
return False
def get_reg_type(self, register):
"""Get register type"""
if register is None:
return False
if self.is_gpr(register):
return "gpr"
elif self.is_vector_register(register):
return register.name.rstrip(string.digits).lower()
raise ValueError

View File

@@ -0,0 +1,830 @@
#!/usr/bin/env python3
import pyparsing as pp
import re
import string
import unicodedata
from osaca.parser import ParserX86
from osaca.parser.directive import DirectiveOperand
from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.label import LabelOperand
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand
from osaca.semantics.hw_model import MachineModel
# We assume any non-ASCII characters except control characters and line terminators can be part of
# identifiers; this is based on the assumption that no assembler uses non-ASCII white space and
# syntax characters.
# This approach is described at the end of https://www.unicode.org/reports/tr55/#Whitespace-Syntax.
# It is appropriate for tools, such as this one, which process source code but do not fully validate
# it (in this case, that's the job of the assembler).
NON_ASCII_PRINTABLE_CHARACTERS = "".join(
chr(cp) for cp in range(0x80, 0x10FFFF + 1)
if unicodedata.category(chr(cp)) not in ("Cc", "Zl", "Zp", "Cs", "Cn")
)
# References:
# ASM386 Assembly Language Reference, document number 469165-003, https://mirror.math.princeton.edu/pub/oldlinux/Linux.old/Ref-docs/asm-ref.pdf.
# Microsoft Macro Assembler BNF Grammar, https://learn.microsoft.com/en-us/cpp/assembler/masm/masm-bnf-grammar?view=msvc-170.
# Intel Architecture Code Analyzer User's Guide, https://www.intel.com/content/dam/develop/external/us/en/documents/intel-architecture-code-analyzer-3-0-users-guide-157552.pdf.
class ParserX86Intel(ParserX86):
_instance = None
# Singleton pattern, as this is created very many times.
def __new__(cls):
    """Return the instance cached on `cls._instance`, creating it on first use."""
    if cls._instance is None:
        cls._instance = super(ParserX86Intel, cls).__new__(cls)
    return cls._instance
def __init__(self):
    """Initialize the parent parser state and start with an empty `_equ` table."""
    super().__init__()
    # presumably maps names defined by EQU directives to their values -- TODO confirm against
    # the code that fills and reads `_equ`.
    self._equ = {}
# The IACA manual says: "For Microsoft* Visual C++ compiler, 64-bit version, use
# IACA_VC64_START and IACA_VC64_END, instead" (of IACA_START and IACA_END).
# TODO: Inconveniently, the code generated with optimization disabled (/Od) has two
# instructions. We should support both patterns, but then who runs OSACA with /Od?
def start_marker(self):
    """
    Return the start marker pattern: a single `mov` whose first operand is a memory operand
    with base register GS and offset 111, and whose second operand is the immediate 111.
    """
    marker_location = MemoryOperand(
        base=RegisterOperand(name="GS"),
        offset=ImmediateOperand(value=111)
    )
    marker_store = InstructionForm(
        mnemonic="mov",
        operands=[marker_location, ImmediateOperand(value=111)]
    )
    return [marker_store]
def end_marker(self):
    """
    Return the end marker pattern: a single `mov` whose first operand is a memory operand
    with base register GS and offset 222, and whose second operand is the immediate 222.
    """
    marker_location = MemoryOperand(
        base=RegisterOperand(name="GS"),
        offset=ImmediateOperand(value=222)
    )
    marker_store = InstructionForm(
        mnemonic="mov",
        operands=[marker_location, ImmediateOperand(value=222)]
    )
    return [marker_store]
def normalize_instruction_form(
    self,
    instruction_form,
    isa_model: MachineModel,
    arch_model: MachineModel
):
    """
    Normalize an Intel-syntax instruction form in place so that it matches the models.

    The steps, in order:

    1. Switch between the VEX-encoded and non-VEX-encoded mnemonic (leading "v") if only the
       other variant is known to the architecture model.
    2. If the ISA model indicates that this instruction has a single destination that is the
       last operand (or if the instruction is unknown and has more than one operand), reverse
       the operands. This effectively converts the Intel syntax to the AT&T one.
    3. For known two-operand instructions without destination, swap the operands when the
       current order is found in neither model.
    4. Append a "d"/"q" suffix to the mnemonic if a memory operand has the DWORD/QWORD data
       type and the suffixed mnemonic exists in one of the models.

    Does nothing if the instruction form is already marked as normalized.

    :param instruction_form: the parsed instruction form; modified in place.
    :param isa_model: ISA machine model, the source of the source/destination information.
    :param arch_model: machine model of the micro-architecture.
    """
    # Idempotence: a second call on the same instruction form is a no-op.
    if instruction_form.normalized:
        return
    instruction_form.normalized = True

    mnemonic = instruction_form.mnemonic
    # Instruction forms without mnemonic have nothing to normalize.
    if not mnemonic:
        return

    # The model may only contain the VEX-encoded instruction and we may have the non-VEX-encoded
    # one, or vice-versa. Note that this doesn't work when the arguments differ between VEX-
    # encoded and non-VEX-encoded, e.g., for psubq.
    if not arch_model.get_instruction(
        mnemonic,
        len(instruction_form.operands)
    ):
        if mnemonic[0] == 'v':
            unvexed_mnemonic = mnemonic[1:]
            if arch_model.get_instruction(
                unvexed_mnemonic,
                len(instruction_form.operands)
            ):
                mnemonic = unvexed_mnemonic
        else:
            vexed_mnemonic = 'v' + mnemonic
            if arch_model.get_instruction(
                vexed_mnemonic,
                len(instruction_form.operands)
            ):
                mnemonic = vexed_mnemonic
    instruction_form.mnemonic = mnemonic

    # We cannot pass the operands because they may not match before the reordering. We just
    # pass the arity instead. Also, this must use the ISA model, because that's where the
    # source/destination information is found.
    model = isa_model.get_instruction(mnemonic, len(instruction_form.operands))
    has_single_destination_at_end = False
    has_destination = False
    if model:
        # Scan the model operands in order: the flag stays set only if exactly one destination
        # is seen and no source or further destination follows it.
        for o in model.operands:
            if o.source:
                if has_destination:
                    has_single_destination_at_end = False
            if o.destination:
                if has_destination:
                    has_single_destination_at_end = False
                else:
                    has_destination = True
                    has_single_destination_at_end = True
    else:
        # if there is only one operand, assume it is a source operand
        has_single_destination_at_end = len(instruction_form.operands) > 1
    if has_single_destination_at_end:
        # It is important to reverse the operands, we cannot just move the first one last. This
        # makes a difference for instructions with 3 operands or more, such as roundsd: the
        # model files expect the rounding mode (an immediate) first but the Intel syntax has it
        # last.
        instruction_form.operands.reverse()

    # A hack to help with comparison instructions: if the instruction is in the model, and has
    # exactly two sources, swap its operands. The swap only happens when the current operand
    # order is found in neither the ISA model nor the arch model.
    if (model and
        not has_destination and
        len(instruction_form.operands) == 2
        and not isa_model.get_instruction(
            mnemonic,
            instruction_form.operands
        ) and not arch_model.get_instruction(
            mnemonic,
            instruction_form.operands
        )):
        instruction_form.operands.reverse()

    # If the instruction has a well-known data type, append a suffix.
    data_type_to_suffix = {"DWORD": "d", "QWORD": "q"}
    for o in instruction_form.operands:
        if isinstance(o, MemoryOperand) and o.data_type:
            suffix = data_type_to_suffix.get(o.data_type, None)
            if suffix:
                suffixed_mnemonic = mnemonic + suffix
                # Only adopt the suffixed mnemonic if one of the models knows it.
                if isa_model.get_instruction(
                    suffixed_mnemonic,
                    len(instruction_form.operands)
                ) or arch_model.get_instruction(
                    suffixed_mnemonic,
                    len(instruction_form.operands)
                ):
                    instruction_form.mnemonic = suffixed_mnemonic
                    break
def construct_parser(self):
    """
    Create parser for x86 Intel ISA.

    Builds the pyparsing grammar for the Intel syntax and stores the top-level elements used
    by the other methods on the instance: ``self.comment``, ``self.register``,
    ``self.instruction_parser``, ``self.label`` and ``self.directive``.
    """

    def keywords(*names):
        """Return a first-match alternation of case-insensitive keywords."""
        return pp.MatchFirst([pp.CaselessKeyword(name) for name in names])

    # Numeric literal.  Binary, octal and hexadecimal numbers carry a B/O/H suffix.
    binary_number = pp.Combine(pp.Word("01") + pp.CaselessLiteral("B"))
    octal_number = pp.Combine(pp.Word("01234567") + pp.CaselessLiteral("O"))
    decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums))
    hex_number = pp.Combine(pp.Word(pp.hexnums) + pp.CaselessLiteral("H"))
    float_number = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Word(pp.nums) + pp.Word(".", pp.nums)
    ).setResultsName("value")
    # `^` picks the longest match, which disambiguates e.g. "10" from "10H".
    integer_number = (
        binary_number ^ octal_number ^ decimal_number ^ hex_number
    ).setResultsName("value")

    # Comment.
    self.comment = pp.Word(";#", exact=1) + pp.Group(
        pp.ZeroOrMore(pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS))
    ).setResultsName(self.comment_id)

    # Types.
    data_type = keywords(
        "BYTE",
        "DWORD",
        "FWORD",
        "MMWORD",
        "OWORD",
        "QWORD",
        "REAL10",
        "REAL4",
        "REAL8",
        "SBYTE",
        "SDWORD",
        "SQWORD",
        "SWORD",
        "TBYTE",
        "WORD",
        "XMMWORD",
        "YMMWORD",
    ).setResultsName("data_type")

    # Identifier.  Note that $ is not mentioned in the ASM386 Assembly Language Reference,
    # but it is mentioned in the MASM syntax.  < and > apparently show up in C++ mangled names.
    # ICC allows ".", at least in labels.
    first = pp.Word(pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>", exact=1)
    rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    identifier = pp.Group(
        pp.Combine(first + pp.Optional(rest)).setResultsName("name")
    ).setResultsName("identifier")

    # Register.
    # This follows the MASM grammar.
    special_register = keywords(
        "CR0",
        "CR2",
        "CR3",
        "DR0",
        "DR1",
        "DR2",
        "DR3",
        "DR6",
        "DR7",
        "TR3",
        "TR4",
        "TR5",
        "TR6",
        "TR7",
    ).setResultsName("name")
    # 16- and 32-bit general-purpose registers.  The R8-R15 sub-registers are listed in full so
    # that they match the 64-bit registers below; a name missing here would be parsed as an
    # identifier instead of a register.
    gp_register = keywords(
        "AX",
        "EAX",
        "CX",
        "ECX",
        "DX",
        "EDX",
        "BX",
        "EBX",
        "DI",
        "EDI",
        "SI",
        "ESI",
        "BP",
        "EBP",
        "SP",
        "ESP",
        "R8W",
        "R8D",
        "R9W",
        "R9D",
        "R10W",
        "R10D",
        "R11W",
        "R11D",
        "R12W",
        "R12D",
        "R13W",
        "R13D",
        "R14W",
        "R14D",
        "R15W",
        "R15D",
    ).setResultsName("name")
    byte_register = keywords(
        "AL",
        "AH",
        "CL",
        "CH",
        "DL",
        "DH",
        "BL",
        "BH",
        "R8B",
        "R9B",
        "R10B",
        "R11B",
        "R12B",
        "R13B",
        "R14B",
        "R15B",
    ).setResultsName("name")
    qword_register = keywords(
        "RAX",
        "RCX",
        "RDX",
        "RBX",
        "RSP",
        "RBP",
        "RSI",
        "RDI",
        "R8",
        "R9",
        "R10",
        "R11",
        "R12",
        "R13",
        "R14",
        "R15",
    ).setResultsName("name")
    fpu_register = pp.Combine(
        pp.CaselessKeyword("ST")
        + pp.Optional(pp.Literal("(") + pp.Word("01234567") + pp.Literal(")"))
    ).setResultsName("name")
    xmm_register = (
        pp.Combine(pp.CaselessLiteral("XMM") + pp.Word(pp.nums))
        | pp.Combine(pp.CaselessLiteral("XMM1") + pp.Word("012345"))
    )
    simd_register = (
        pp.Combine(pp.CaselessLiteral("MM") + pp.Word("01234567"))
        | xmm_register
        | pp.Combine(pp.CaselessLiteral("YMM") + pp.Word(pp.nums))
        | pp.Combine(pp.CaselessLiteral("YMM1") + pp.Word("012345"))
    ).setResultsName("name")
    segment_register = keywords("CS", "DS", "ES", "FS", "GS", "SS").setResultsName("name")
    self.register = pp.Group(
        special_register
        | gp_register
        | byte_register
        | qword_register
        | fpu_register
        | simd_register
        | segment_register
        | pp.CaselessKeyword("RIP")
    ).setResultsName(self.register_id)

    # Register expressions.
    base_register = self.register
    index_register = self.register
    scale = pp.Word("1248", exact=1)
    # NOTE(review): `+` binds tighter than `|`, so this reads as
    # `(sign + integer_number) | identifier`; confirm whether `sign + (integer | identifier)`
    # was intended.  The grammar is deliberately left unchanged here.
    post_displacement = pp.Group(
        (pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign")
        + integer_number | identifier
    ).setResultsName(self.immediate_id)
    pre_displacement = pp.Group(
        integer_number + pp.Literal("+")
    ).setResultsName(self.immediate_id)
    indexed = pp.Group(
        index_register.setResultsName("index")
        + pp.Optional(pp.Literal("*") + scale.setResultsName("scale"))
    ).setResultsName("indexed")
    register_expression = pp.Group(
        pp.Literal("[")
        + pp.Optional(pp.Group(pre_displacement).setResultsName("pre_displacement"))
        + pp.Group(
            base_register.setResultsName("base")
            ^ pp.Group(
                base_register.setResultsName("base")
                + pp.Literal("+")
                + indexed
            ).setResultsName("base_and_indexed")
            ^ indexed
        ).setResultsName("non_displacement")
        + pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement"))
        + pp.Literal("]")
    ).setResultsName("register_expression")

    # Immediate.
    immediate = pp.Group(
        integer_number | float_number | identifier
    ).setResultsName(self.immediate_id)

    # Expressions.
    # The ASM86 manual has weird expressions on page 130 (displacement outside of the register
    # expression, multiple register expressions).  Let's ignore those for now, but see
    # https://stackoverflow.com/questions/71540754/why-sometimes-use-offset-flatlabel-and-sometimes-not.
    address_expression = pp.Group(
        (self.register.setResultsName("segment") + pp.Literal(":") + immediate)
        ^ (immediate + register_expression)
        ^ register_expression
        ^ (identifier + pp.Optional(pp.Literal("+") + immediate))
    ).setResultsName("address_expression")
    offset_expression = pp.Group(
        pp.CaselessKeyword("OFFSET")
        + pp.Group(keywords("GROUP", "SEGMENT", "FLAT"))
        # The MASM grammar has the ":" immediately after "OFFSET", but that's not what MSVC
        # outputs.
        + pp.Literal(":")
        + identifier.setResultsName("identifier")
        + pp.Optional(pp.Literal("+") + immediate.setResultsName("displacement"))
    ).setResultsName("offset_expression")
    ptr_expression = pp.Group(
        data_type + pp.CaselessKeyword("PTR") + address_expression
    ).setResultsName("ptr_expression")
    short_expression = pp.Group(
        pp.CaselessKeyword("SHORT") + identifier
    ).setResultsName("short_expression")

    # Instructions.
    mnemonic = pp.Word(pp.alphas, pp.alphanums).setResultsName("mnemonic")
    operand = pp.Group(
        self.register
        | pp.Group(
            offset_expression
            | ptr_expression
            | short_expression
            | address_expression
        ).setResultsName(self.memory_id)
        | immediate
    )
    # Up to four comma-separated operands, optionally followed by a comment.
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand.setResultsName("operand1"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand2"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand3"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand4"))
        + pp.Optional(self.comment)
    )

    # Label.  A label may be followed by an instruction on the same line.
    self.label = pp.Group(
        identifier.setResultsName("name")
        + pp.Literal(":")
        + pp.Optional(self.instruction_parser)
        + pp.Optional(self.comment)
    ).setResultsName(self.label_id)

    # Directives.
    # The identifiers at the beginning of a directive cannot start with a "." otherwise we end
    # up with ambiguities.
    directive_first = pp.Word(pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + "$?@_<>", exact=1)
    directive_rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    directive_identifier = pp.Group(
        pp.Combine(directive_first + pp.Optional(directive_rest)).setResultsName("name")
    ).setResultsName("identifier")
    # Parameter can be any quoted string or sequence of characters besides ';' (for comments)
    # or ',' (parameter delimiter).  See ASM386 p. 38.
    directive_parameter = (
        pp.quotedString
        ^ (
            pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS, excludeChars=",;")
            + pp.Optional(pp.Suppress(pp.Literal(",")))
        )
        ^ pp.Suppress(pp.Literal(","))
    )
    # The directives that don't start with a "." are ambiguous with instructions, so we list
    # them explicitly.
    # TODO: The directives that are types (BYTE, DWORD, FWORD, MMWORD, QWORD, REAL10, REAL4,
    # REAL8, SBYTE, SDWORD, SWORD, TBYTE, WORD, XMMWORD, YMMWORD) introduce a nasty ambiguity
    # with instructions.  Skip them for now, apparently the MSVC output uses the short D?
    # directives.
    directive_keywords = keywords(
        "ALIAS",
        "ALIGN",
        "ASSUME",
        "CATSTR",
        "COMM",
        "COMMENT",
        "DB",
        "DD",
        "DF",
        "DQ",
        "DT",
        "DW",
        "ECHO",
        "END",
        "ENDP",
        "ENDS",
        "EQU",
        "EVEN",
        "EXTRN",
        "EXTERNDEF",
        "GROUP",
        "INCLUDE",
        "INCLUDELIB",
        "INSTR",
        "INVOKE",
        "LABEL",
        "OPTION",
        "ORG",
        "PAGE",
        "POPCONTEXT",
        "PROC",
        "PROTO",
        "PUBLIC",
        "PUSHCONTEXT",
        "RECORD",
        "SEGMENT",
        "SIZESTR",
        "STRUCT",
        "SUBSTR",
        "SUBTITLE",
        "TEXTEQU",
        "TITLE",
        "TYPEDEF",
        "UNION",
    )
    self.directive = pp.Group(
        # Optional leading identifier (e.g. the symbol being defined); it must not itself be
        # a directive keyword.
        pp.Optional(~directive_keywords + directive_identifier)
        + (
            pp.Combine(pp.Literal(".") + pp.Word(pp.alphanums + "_"))
            | pp.Literal("=")
            | directive_keywords
        ).setResultsName("name")
        + pp.ZeroOrMore(directive_parameter).setResultsName("parameters")
        + pp.Optional(self.comment)
    ).setResultsName(self.directive_id)
def parse_line(self, line, line_number=None):
    """
    Parse line and return instruction form.

    The line is tried, in order, as a comment, a label (optionally followed by an
    instruction), a directive and finally a plain instruction.

    :param str line: line of assembly code
    :param line_number: default None, identifier of instruction form
    :type line_number: int, optional
    :return: ``dict`` -- parsed asm line (comment, label, directive or instruction form)
    :raises ValueError: if the line matches none of the four forms
    """
    instruction_form = InstructionForm(line=line, line_number=line_number)
    result = None

    # 1. Parse comment.
    try:
        result = self.process_operand(self.comment.parseString(line, parseAll=True))
        instruction_form.comment = " ".join(result[self.comment_id])
    except pp.ParseException:
        pass

    # 2. Parse label.
    if not result:
        try:
            # Returns a tuple with the label operand and the instruction that follows the
            # label on the same line, if any (see `process_label`).
            result = self.process_operand(self.label.parseString(line, parseAll=True))
            label, instruction = result
            instruction_form.label = label.name
            if instruction is not None:
                # The second element is an instruction form, not a list of comment tokens:
                # copy its fields instead of joining it into a comment string.
                instruction_form.mnemonic = instruction.mnemonic
                instruction_form.operands = instruction.operands
                instruction_form.comment = instruction.comment
        except pp.ParseException:
            pass

    # 3. Parse directive.
    if not result:
        try:
            # Returns tuple with directive operand and comment, if any.
            result = self.process_operand(self.directive.parseString(line, parseAll=True))
            instruction_form.directive = result[0]
            if result[1]:
                instruction_form.comment = " ".join(result[1])
        except pp.ParseException:
            pass

    # 4. Parse instruction.
    if not result:
        try:
            result = self.parse_instruction(line)
        except pp.ParseException as e:
            raise ValueError(
                "Could not parse instruction on line {}: {!r}".format(line_number, line)
            ) from e
        instruction_form.mnemonic = result.mnemonic
        instruction_form.operands = result.operands
        instruction_form.comment = result.comment

    return instruction_form
def make_instruction(self, parse_result):
    """
    Build an instruction form from a parse result.

    :param parse_result: tuple resulting from calling `parseString` on the `instruction_parser`.
    :returns: `dict` -- parsed instruction form
    """
    # The grammar names its (at most four) operands "operand1" to "operand4"; post-process
    # the ones that are present, in order.
    operand_keys = ("operand1", "operand2", "operand3", "operand4")
    operands = [
        self.process_operand(parse_result[key])
        for key in operand_keys
        if key in parse_result
    ]
    if self.comment_id in parse_result:
        comment = " ".join(parse_result[self.comment_id])
    else:
        comment = None
    return InstructionForm(
        mnemonic=parse_result.mnemonic,
        operands=operands,
        label_id=None,
        comment_id=comment,
    )
def parse_instruction(self, instruction):
    """
    Parse an instruction line with the instruction grammar.

    :param str instruction: Assembly line string.
    :returns: `dict` -- parsed instruction form
    """
    # The whole line must match; a pyparsing ParseException propagates to the caller.
    parsed = self.instruction_parser.parseString(instruction, parseAll=True)
    return self.make_instruction(parsed)
def parse_register(self, register_string):
    """Parse a register string; return the processed register, or None if it does not parse."""
    try:
        parsed = self.register.parseString(register_string, parseAll=True)
        register = self.process_operand(parsed)
    except pp.ParseException:
        register = None
    return register
def process_operand(self, operand):
    """Post-process operand by dispatching on the first known result name it contains."""
    # The order matters: the first matching key wins.
    dispatch = (
        (self.directive_id, self.process_directive),
        (self.identifier, self.process_identifier),
        (self.immediate_id, self.process_immediate),
        (self.label_id, self.process_label),
        (self.memory_id, self.process_memory_address),
        (self.register_id, self.process_register),
    )
    for key, handler in dispatch:
        if key in operand:
            return handler(operand[key])
    # Nothing to post-process (e.g. a bare comment): hand the parse result back unchanged.
    return operand
def process_directive(self, directive):
    """
    Post-process a directive.

    :param directive: parse result of the directive grammar
    :returns: tuple of the directive operand and the value stored under "comment" in the
              parse result (None if absent)
    """
    # TODO: This is putting the identifier in the parameters. No idea if it's right.
    parameters = [directive.identifier.name] if "identifier" in directive else []
    parameters.extend(directive.parameters)
    directive_new = DirectiveOperand(
        name=directive.name,
        parameters=parameters or None
    )
    # Interpret the "=" directives because the generated assembly is full of symbols that are
    # defined there.  A definition needs both a symbol and a value; an incomplete one is
    # kept as a plain directive instead of failing with an IndexError.
    if directive.name == "=" and len(parameters) >= 2:
        self._equ[parameters[0]] = parameters[1]
    return directive_new, directive.get("comment")
def process_register(self, operand):
    """Turn a parsed register into a register operand carrying its name."""
    register_name = operand.name
    return RegisterOperand(name=register_name)
def process_register_expression(self, register_expression):
    """
    Convert a parsed bracketed register expression into a memory operand.

    The result carries the base register, the index register with its scale (1 when no
    scale is given) and the displacement, each of which may be absent.
    """
    # Base and index registers.
    base = indexed = None
    inner = register_expression.get("non_displacement")
    if inner:
        combined = inner.get("base_and_indexed")
        if combined:
            base, indexed = combined.get("base"), combined.get("indexed")
        else:
            base = inner.get("base")
            indexed = None if base else inner.get("indexed")
    index, scale = None, 1
    if indexed:
        index = indexed.get("index")
        scale = int(indexed.get("scale", "1"), 0)

    # Displacement: when both forms are present the one after the registers wins.
    displacement = None
    for key in ("pre_displacement", "post_displacement"):
        part = register_expression.get(key)
        if part:
            displacement = self.process_immediate(part.immediate)

    return MemoryOperand(
        offset=displacement,
        base=RegisterOperand(name=base.name) if base else None,
        index=RegisterOperand(name=index.name) if index else None,
        scale=scale,
    )
def process_address_expression(self, address_expression, data_type=None):
    """
    Convert a parsed address expression into an operand.

    :param address_expression: parse result of the address expression grammar
    :param data_type: data type from an enclosing PTR expression, if any
    :returns: a memory operand, or the bare identifier when the expression is just an
              identifier without offset or data type
    """
    # TODO: It seems that we could have a prefix immediate operand, a displacement in the
    # brackets, and an offset. How all of this works together is somewhat mysterious.

    def convert(key, converter):
        """Post-process the named part of the expression, or return None if it is absent."""
        return converter(address_expression[key]) if key in address_expression else None

    immediate_operand = convert("immediate", self.process_immediate)
    register_expression = convert("register_expression", self.process_register_expression)
    segment = convert("segment", self.process_register)
    identifier = convert("identifier", self.process_identifier)

    if register_expression:
        # An immediate in front of the brackets replaces the offset found inside them.
        if immediate_operand:
            register_expression.offset = immediate_operand
        if data_type:
            register_expression.data_type = data_type
        return register_expression
    if segment:
        return MemoryOperand(base=segment, offset=immediate_operand, data_type=data_type)
    if not identifier:
        return MemoryOperand(base=immediate_operand, data_type=data_type)
    if immediate_operand:
        identifier.offset = immediate_operand
    elif not data_type:
        # An address expression without a data type or an offset is just an identifier.
        # This matters for jumps.
        return identifier
    return MemoryOperand(offset=identifier, data_type=data_type)
def process_offset_expression(self, offset_expression):
    """Convert a parsed OFFSET expression into a memory operand whose offset is the identifier."""
    # TODO: Record that this is an offset expression.
    if "displacement" in offset_expression:
        displacement = self.process_immediate(offset_expression.displacement)
    else:
        displacement = None
    target = self.process_identifier(offset_expression.identifier)
    # The optional "+ displacement" part is attached to the identifier itself.
    target.offset = displacement
    return MemoryOperand(offset=target)
def process_ptr_expression(self, ptr_expression):
    """Convert a parsed `<type> PTR <address>` expression, forwarding the data type."""
    # TODO: Do something with the data_type.
    address = ptr_expression.address_expression
    data_type = ptr_expression.data_type
    return self.process_address_expression(address, data_type)
def process_short_expression(self, short_expression):
    """Convert a parsed `SHORT <identifier>` expression into a label operand."""
    # TODO: Do something with the fact that it is short.
    target_name = short_expression.identifier.name
    return LabelOperand(name=target_name)
def process_memory_address(self, memory_address):
    """Post-process memory address operand"""
    # A memory operand wraps one of four expression kinds; the first one present is handled.
    handlers = (
        ("address_expression", self.process_address_expression),
        ("offset_expression", self.process_offset_expression),
        ("ptr_expression", self.process_ptr_expression),
        ("short_expression", self.process_short_expression),
    )
    for key, handler in handlers:
        if key in memory_address:
            return handler(memory_address[key])
    return memory_address
def process_label(self, label):
    """
    Post-process label asm line.

    :returns: tuple of the label operand and the instruction form that follows the label on
              the same line, or None if there is no instruction
    """
    # Remove duplicated 'name' level due to identifier. Note that there is no place to put the
    # comment, if any.
    label["name"] = label["name"]["name"]
    label_operand = LabelOperand(name=label.name)
    instruction = None
    if "mnemonic" in label:
        instruction = self.make_instruction(label)
    return (label_operand, instruction)
def process_immediate(self, immediate):
    """Post-process immediate operand"""
    if "identifier" in immediate:
        # Actually an identifier, change declaration.
        return self.process_identifier(immediate.identifier)
    # Prepend the sign captured by the grammar, if any, before normalizing.
    literal = immediate.get("sign", "") + immediate.value
    operand = ImmediateOperand(value=literal)
    operand.value = self.normalize_imd(operand)
    return operand
def process_identifier(self, identifier):
    """Post-process an identifier, resolving symbols defined by "=" directives."""
    symbol = identifier.name
    if symbol not in self._equ:
        return IdentifierOperand(name=symbol)
    # Actually an immediate, change declaration.
    operand = ImmediateOperand(identifier=symbol, value=self._equ[symbol])
    operand.value = self.normalize_imd(operand)
    return operand
def normalize_imd(self, imd):
    """
    Normalize immediate to decimal based representation.

    :param imd: operand whose ``value`` is either already a number or a string literal with
                an optional leading sign and an optional radix suffix (B, O or H, in either
                case)
    :returns: ``float`` if the literal contains a ".", otherwise ``int``; a non-string value
              is returned unchanged
    """
    literal = imd.value
    if not isinstance(literal, str):
        return literal
    if "." in literal:
        return float(literal)
    # Now parse depending on the base.  The suffix is matched case-insensitively because
    # values that do not come from the number grammar (e.g. symbols defined by "="
    # directives) keep the case they have in the source.
    base = {"B": 2, "O": 8, "H": 16}.get(literal[-1].upper(), 10)
    negative = literal[0] == "-"
    positive = literal[0] == "+"
    # Skip the sign in front and the radix suffix, if any, at the end.
    start = 1 if negative or positive else 0
    stop = len(literal) if base == 10 else -1
    value = 0
    for digit in literal[start:stop]:
        value = value * base + int(digit, base)
    return -value if negative else value

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
"""Semantics opbject responsible for architecture specific semantic operations"""
from dis import Instruction
import sys
import warnings
from itertools import chain
@@ -14,12 +15,24 @@ from osaca.parser.register import RegisterOperand
class ArchSemantics(ISASemantics):
GAS_SUFFIXES = "bswlqt"
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
super().__init__(machine_model.get_ISA().lower(), path_to_yaml=path_to_yaml)
def __init__(self, parser, machine_model: MachineModel, path_to_yaml=None):
super().__init__(parser, path_to_yaml=path_to_yaml)
self._machine_model = machine_model
self._isa = machine_model.get_ISA().lower()
def normalize_instruction_form(self, instruction_form):
    """Let the parser normalize one instruction form against the ISA and machine models."""
    normalize = self.parser.normalize_instruction_form
    normalize(instruction_form, self.isa_model, self._machine_model)
def normalize_instruction_forms(self, instruction_forms):
    """Normalize every instruction form of the given iterable, in order."""
    for form in instruction_forms:
        self.normalize_instruction_form(form)
def _check_normalized(self, instruction_forms):
    """Call `check_normalized` on every instruction form of the given iterable."""
    for form in instruction_forms:
        form.check_normalized()
# SUMMARY FUNCTION
def add_semantics(self, kernel):
@@ -29,6 +42,7 @@ class ArchSemantics(ISASemantics):
:param list kernel: kernel to apply semantics
"""
self._check_normalized(kernel)
for instruction_form in kernel:
self.assign_src_dst(instruction_form)
self.assign_tp_lt(instruction_form)
@@ -41,6 +55,7 @@ class ArchSemantics(ISASemantics):
:param list kernel: kernel to apply optimal port utilization
"""
self._check_normalized(kernel)
INC = 0.01
kernel.reverse()
port_list = self._machine_model.get_ports()
@@ -137,6 +152,7 @@ class ArchSemantics(ISASemantics):
def set_hidden_loads(self, kernel):
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
self._check_normalized(kernel)
loads = [instr for instr in kernel if INSTR_FLAGS.HAS_LD in instr.flags]
stores = [instr for instr in kernel if INSTR_FLAGS.HAS_ST in instr.flags]
# Filter instructions including load and store
@@ -176,6 +192,7 @@ class ArchSemantics(ISASemantics):
# mark instruction form with semantic flags
def assign_tp_lt(self, instruction_form):
"""Assign throughput and latency to an instruction form."""
instruction_form.check_normalized()
flags = []
port_number = len(self._machine_model["ports"])
if instruction_form.mnemonic is None:
@@ -189,25 +206,6 @@ class ArchSemantics(ISASemantics):
instruction_data = self._machine_model.get_instruction(
instruction_form.mnemonic, instruction_form.operands
)
if (
not instruction_data
and self._isa == "x86"
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
):
# check for instruction without GAS suffix
instruction_data = self._machine_model.get_instruction(
instruction_form.mnemonic[:-1], instruction_form.operands
)
if (
instruction_data is None
and self._isa == "aarch64"
and "." in instruction_form.mnemonic
):
# Check for instruction without shape/cc suffix
suffix_start = instruction_form.mnemonic.index(".")
instruction_data = self._machine_model.get_instruction(
instruction_form.mnemonic[:suffix_start], instruction_form.operands
)
if instruction_data:
# instruction form in DB
(
@@ -232,25 +230,6 @@ class ArchSemantics(ISASemantics):
instruction_data_reg = self._machine_model.get_instruction(
instruction_form.mnemonic, operands
)
if (
not instruction_data_reg
and self._isa == "x86"
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
):
# check for instruction without GAS suffix
instruction_data_reg = self._machine_model.get_instruction(
instruction_form.mnemonic[:-1], operands
)
if (
instruction_data_reg is None
and self._isa == "aarch64"
and "." in instruction_form.mnemonic
):
# Check for instruction without shape/cc suffix
suffix_start = instruction_form.mnemonic.index(".")
instruction_data_reg = self._machine_model.get_instruction(
instruction_form.mnemonic[:suffix_start], operands
)
if instruction_data_reg:
assign_unknown = False
reg_type = self._parser.get_reg_type(
@@ -310,7 +289,7 @@ class ArchSemantics(ISASemantics):
# - all mem operands in src_dst are pre-/post_indexed
# since it is no mem store
if (
self._isa == "aarch64"
self._parser.isa() == "aarch64"
and not isinstance(
instruction_form.semantic_operands["destination"],
MemoryOperand,
@@ -406,6 +385,7 @@ class ArchSemantics(ISASemantics):
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
"""Apply performance data to instruction if it was found in the archDB"""
instruction_form.check_normalized()
throughput = instruction_data.throughput
port_pressure = self._machine_model.average_port_pressure(instruction_data.port_pressure)
instruction_form.port_uops = instruction_data.port_pressure
@@ -441,12 +421,12 @@ class ArchSemantics(ISASemantics):
def convert_op_to_reg(self, reg_type, regtype="0"):
"""Create register operand for a memory addressing operand"""
if self._isa == "x86":
if self._parser.isa() == "x86":
if reg_type == "gpr":
register = RegisterOperand(name="r" + str(int(regtype) + 9))
else:
register = RegisterOperand(name=reg_type + regtype)
elif self._isa == "aarch64":
elif self._parser.isa() == "aarch64":
register = RegisterOperand(name=regtype, prefix=reg_type)
return register

View File

@@ -11,7 +11,6 @@ from pathlib import Path
import ruamel.yaml
from osaca import __version__, utils
from osaca.parser import ParserX86ATT
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.operand import Operand
from osaca.parser.memory import MemoryOperand
@@ -79,7 +78,7 @@ class MachineModel(object):
else:
yaml = self._create_yaml_object()
# otherwise load
with open(self._path, "r") as f:
with open(self._path, "r", encoding="utf8") as f:
if not lazy:
self._data = yaml.load(f)
else:
@@ -286,23 +285,38 @@ class MachineModel(object):
######################################################
def get_instruction(self, name, operands):
"""Find and return instruction data from name and operands."""
"""Find and return instruction data from name and operands/arity."""
# For use with dict instead of list as DB
if name is None:
return None
name_matched_iforms = self._data["instruction_forms_dict"].get(name.upper(), [])
try:
return next(
instruction_form
for instruction_form in name_matched_iforms
if self._match_operands(
instruction_form.operands,
operands,
# If `operands` is an integer, it represents the arity of the instruction. This is
# useful to reorder the operands in the Intel syntax because in their original order
# they may not match the model.
if isinstance(operands, int):
arity = operands
return next(
(
instruction_form
for instruction_form in name_matched_iforms
if len(instruction_form.operands) == arity
),
None
)
else:
return next(
(
instruction_form
for instruction_form in name_matched_iforms
if self._match_operands(
instruction_form.operands,
operands
)
),
None
)
)
except StopIteration:
return None
except TypeError as e:
print("\nname: {}\noperands: {}".format(name, operands))
raise TypeError from e
@@ -878,6 +892,7 @@ class MachineModel(object):
return True
def _is_x86_reg_type(self, i_reg, reg, consider_masking=False):
from osaca.parser import ParserX86
"""Check if register type match."""
if reg is None:
if i_reg is None:
@@ -895,7 +910,7 @@ class MachineModel(object):
if i_reg_name == self.WILDCARD or reg.name == self.WILDCARD:
return True
# differentiate between vector registers (mm, xmm, ymm, zmm) and others (gpr)
parser_x86 = ParserX86ATT()
parser_x86 = ParserX86()
if parser_x86.is_vector_register(reg):
if reg.name.rstrip(string.digits).lower() == i_reg_name:
# Consider masking and zeroing for AVX512

View File

@@ -2,7 +2,6 @@
from itertools import chain
from osaca import utils
from osaca.parser import ParserAArch64, ParserX86ATT
from osaca.parser.memory import MemoryOperand
from osaca.parser.operand import Operand
from osaca.parser.register import RegisterOperand
@@ -26,20 +25,23 @@ class INSTR_FLAGS:
class ISASemantics(object):
GAS_SUFFIXES = "bswlqt"
def __init__(self, isa, path_to_yaml=None):
self._isa = isa.lower()
path = path_to_yaml or utils.find_datafile("isa/" + self._isa + ".yml")
def __init__(self, parser, path_to_yaml=None):
path = path_to_yaml or utils.find_datafile("isa/" + parser.isa() + ".yml")
self._isa_model = MachineModel(path_to_yaml=path)
if self._isa == "x86":
self._parser = ParserX86ATT()
elif self._isa == "aarch64":
self._parser = ParserAArch64()
self._parser = parser
@property
def parser(self):
    """The parser this semantics object was constructed with."""
    return self._parser
@property
def isa_model(self):
    """The ISA machine model loaded in the constructor."""
    return self._isa_model
def process(self, instruction_forms):
"""Process a list of instruction forms."""
for i in instruction_forms:
i.check_normalized()
self.assign_src_dst(i)
# get ;parser result and assign operands to
@@ -48,6 +50,7 @@ class ISASemantics(object):
# - source/destination
def assign_src_dst(self, instruction_form):
"""Update instruction form dictionary with source, destination and flag information."""
instruction_form.check_normalized()
# if the instruction form doesn't have operands or is None, there's nothing to do
if instruction_form.operands is None or instruction_form.mnemonic is None:
instruction_form.semantic_operands = {"source": [], "destination": [], "src_dst": []}
@@ -57,21 +60,6 @@ class ISASemantics(object):
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic, instruction_form.operands
)
if (
isa_data is None
and self._isa == "x86"
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
):
# Check for instruction without GAS suffix
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic[:-1], instruction_form.operands
)
if isa_data is None and self._isa == "aarch64" and "." in instruction_form.mnemonic:
# Check for instruction without shape/cc suffix
suffix_start = instruction_form.mnemonic.index(".")
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic[:suffix_start], instruction_form.operands
)
operands = instruction_form.operands
op_dict = {}
@@ -88,36 +76,17 @@ class ISASemantics(object):
isa_data_reg = self._isa_model.get_instruction(
instruction_form.mnemonic, operands_reg
)
if (
isa_data_reg is None
and self._isa == "x86"
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
):
# Check for instruction without GAS suffix
isa_data_reg = self._isa_model.get_instruction(
instruction_form.mnemonic[:-1], operands_reg
)
if (
isa_data_reg is None
and self._isa == "aarch64"
and "." in instruction_form.mnemonic
):
# Check for instruction without shape/cc suffix
suffix_start = instruction_form.mnemonic.index(".")
isa_data_reg = self._isa_model.get_instruction(
instruction_form.mnemonic[:suffix_start], operands_reg
)
if isa_data_reg:
assign_default = False
op_dict = self._apply_found_ISA_data(isa_data_reg, operands)
if assign_default:
# no irregular operand structure, apply default
op_dict["source"] = self._get_regular_source_operands(instruction_form)
op_dict["destination"] = self._get_regular_destination_operands(instruction_form)
op_dict["source"] = self._parser.get_regular_source_operands(instruction_form)
op_dict["destination"] = self._parser.get_regular_destination_operands(instruction_form)
op_dict["src_dst"] = []
# post-process pre- and post-indexing for aarch64 memory operands
if self._isa == "aarch64":
if self._parser.isa() == "aarch64":
for operand in [op for op in op_dict["source"] if isinstance(op, MemoryOperand)]:
post_indexed = operand.post_indexed
pre_indexed = operand.pre_indexed
@@ -161,6 +130,7 @@ class ISASemantics(object):
Empty dict if no changes of registers occured. None for registers with unknown changes.
If only_postindexed is True, only considers changes due to post_indexed memory references.
"""
instruction_form.check_normalized()
if instruction_form.mnemonic is None:
return {}
dest_reg_names = [
@@ -174,21 +144,6 @@ class ISASemantics(object):
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic, instruction_form.operands
)
if (
isa_data is None
and self._isa == "x86"
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
):
# Check for instruction without GAS suffix
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic[:-1], instruction_form.operands
)
if isa_data is None and self._isa == "aarch64" and "." in instruction_form.mnemonic:
# Check for instruction without shape/cc suffix
suffix_start = instruction_form.mnemonic.index(".")
isa_data = self._isa_model.get_instruction(
instruction_form.mnemonic[:suffix_start], instruction_form.operands
)
if only_postindexed:
for o in instruction_form.operands:
@@ -301,6 +256,7 @@ class ISASemantics(object):
def _has_load(self, instruction_form):
"""Check if instruction form performs a LOAD"""
instruction_form.check_normalized()
for operand in chain(
instruction_form.semantic_operands["source"],
instruction_form.semantic_operands["src_dst"],
@@ -311,6 +267,7 @@ class ISASemantics(object):
def _has_store(self, instruction_form):
"""Check if instruction form perfroms a STORE"""
instruction_form.check_normalized()
for operand in chain(
instruction_form.semantic_operands["destination"],
instruction_form.semantic_operands["src_dst"],
@@ -319,33 +276,6 @@ class ISASemantics(object):
return True
return False
def _get_regular_source_operands(self, instruction_form):
"""Get source operand of given instruction form assuming regular src/dst behavior."""
# if there is only one operand, assume it is a source operand
if len(instruction_form.operands) == 1:
return [instruction_form.operands[0]]
if self._isa == "x86":
# return all but last operand
return [op for op in instruction_form.operands[0:-1]]
elif self._isa == "aarch64":
return [op for op in instruction_form.operands[1:]]
else:
raise ValueError("Unsupported ISA {}.".format(self._isa))
def _get_regular_destination_operands(self, instruction_form):
"""Get destination operand of given instruction form assuming regular src/dst behavior."""
# if there is only one operand, assume no destination
if len(instruction_form.operands) == 1:
return []
if self._isa == "x86":
# return last operand
return instruction_form.operands[-1:]
if self._isa == "aarch64":
# return first operand
return instruction_form.operands[:1]
else:
raise ValueError("Unsupported ISA {}.".format(self._isa))
def substitute_mem_address(self, operands):
"""Create memory wildcard for all memory operands"""
return [

View File

@@ -38,7 +38,8 @@ class KernelDG(nx.DiGraph):
self.kernel, timeout, flag_dependencies
)
def _extend_path(self, dst_list, kernel, dg, offset):
@classmethod
def _extend_path(cls, dst_list, kernel, dg, offset):
for instr in kernel:
generator_path = nx.algorithms.simple_paths.all_simple_paths(
dg, instr.line_number, instr.line_number + offset
@@ -138,7 +139,7 @@ class KernelDG(nx.DiGraph):
all_paths = manager.list()
processes = [
Process(
target=self._extend_path,
target=KernelDG._extend_path,
args=(all_paths, instr_section, dg, offset),
)
for instr_section in instrs
@@ -164,9 +165,7 @@ class KernelDG(nx.DiGraph):
# terminate running processes
for p in processes:
if p.is_alive():
# Python 3.6 does not support Process.kill().
# Can be changed to `p.kill()` after EoL (01/22) of Py3.6
os.kill(p.pid, signal.SIGKILL)
p.kill()
p.join()
all_paths = list(all_paths)
else:
@@ -186,11 +185,11 @@ class KernelDG(nx.DiGraph):
for s, d in nx.utils.pairwise(path):
edge_lat = dg.edges[s, d]["latency"]
# map source node back to original line numbers
if s >= offset:
if s > offset:
s -= offset
lat_path.append((s, edge_lat))
lat_sum += edge_lat
if d >= offset:
if d > offset:
d -= offset
lat_path.sort()
@@ -413,7 +412,7 @@ class KernelDG(nx.DiGraph):
addr_change = 0
if isinstance(src.offset, ImmediateOperand) and src.offset.value is not None:
addr_change += src.offset.value
if mem.offset:
if isinstance(mem.offset, ImmediateOperand) and mem.offset.value is not None:
addr_change -= mem.offset.value
if mem.base and src.base:
base_change = register_changes.get(

View File

@@ -1,29 +1,36 @@
#!/usr/bin/env python3
from collections import OrderedDict
from enum import Enum
from functools import partial
from osaca.parser import ParserAArch64, ParserX86ATT, get_parser
from osaca.parser.register import RegisterOperand
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.directive import DirectiveOperand
from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand
COMMENT_MARKER = {"start": "OSACA-BEGIN", "end": "OSACA-END"}
class Matching(Enum):
    """
    State of marker matching.

    ``No``: we have determined that the code doesn't match the marker.
    ``Partial``: so far the code matches the marker, but we have not reached the end of the
    marker yet.
    ``Full``: the code matches all instructions in the marker.
    """

    No = 0
    Partial = 1
    Full = 2
def reduce_to_section(kernel, isa):
def reduce_to_section(kernel, parser):
"""
Finds OSACA markers in given kernel and returns marked section
:param list kernel: kernel to check
:param str isa: ISA of given kernel
:param BaseParser parser: parser used to produce the kernel
:returns: `list` -- marked section of kernel as list of instruction forms
"""
isa = isa.lower()
if isa == "x86":
start, end = find_marked_kernel_x86ATT(kernel)
elif isa == "aarch64":
start, end = find_marked_kernel_AArch64(kernel)
else:
raise ValueError("ISA not supported.")
start, end = find_marked_section(kernel, parser, COMMENT_MARKER)
if start == -1:
start = 0
if end == -1:
@@ -31,109 +38,21 @@ def reduce_to_section(kernel, isa):
return kernel[start:end]
def find_marked_kernel_AArch64(lines):
    """
    Locate the marked section of an AArch64 kernel.

    :param list lines: kernel
    :returns: `tuple of int` -- start and end line of marked section
    """
    return find_marked_section(
        lines,
        ParserAArch64(),
        ["mov"],  # mnemonics accepted for the marker move
        "x1",  # register written by the marker move
        [111, 222],  # immediates of the start and of the end marker
        [213, 3, 32, 31],  # NOP opcode bytes expected after the move
        reverse=True,
        comments=COMMENT_MARKER,
    )
def find_marked_kernel_x86ATT(lines):
    """
    Locate the marked section of an x86 kernel.

    :param list lines: kernel
    :returns: `tuple of int` -- start and end line of marked section
    """
    return find_marked_section(
        lines,
        ParserX86ATT(),
        ["mov", "movl"],  # mnemonics accepted for the marker move
        "ebx",  # register written by the marker move
        [111, 222],  # immediates of the start and of the end marker
        [100, 103, 144],  # NOP opcode bytes expected after the move
        comments=COMMENT_MARKER,
    )
def get_marker(isa, comment=""):
    """
    Return tuple of start and end marker lines.

    The marker assembly is built as text and then run through the parser returned by
    ``get_parser(isa)``.

    :param str isa: ISA of the marker, ``"x86"`` or ``"aarch64"`` (case-insensitive)
    :param str comment: optional text appended as a comment line after the start marker
    :returns: `tuple` -- parsed start marker and parsed end marker
    :raises ValueError: if ``isa`` is neither x86 nor AArch64
    """
    isa = isa.lower()
    if isa == "x86":
        start_marker_raw = (
            "movl $111, %ebx # OSACA START MARKER\n"
            ".byte 100 # OSACA START MARKER\n"
            ".byte 103 # OSACA START MARKER\n"
            ".byte 144 # OSACA START MARKER\n"
        )
        if comment:
            start_marker_raw += "# {}\n".format(comment)
        end_marker_raw = (
            "movl $222, %ebx # OSACA END MARKER\n"
            ".byte 100 # OSACA END MARKER\n"
            ".byte 103 # OSACA END MARKER\n"
            ".byte 144 # OSACA END MARKER\n"
        )
    elif isa == "aarch64":
        start_marker_raw = (
            "mov x1, #111 // OSACA START MARKER\n"
            ".byte 213,3,32,31 // OSACA START MARKER\n"
        )
        if comment:
            start_marker_raw += "// {}\n".format(comment)
        # After loop
        end_marker_raw = (
            "mov x1, #222 // OSACA END MARKER\n"
            ".byte 213,3,32,31 // OSACA END MARKER\n"
        )
    else:
        # Fail early and explicitly: without this branch the marker strings stay unbound
        # and the parse calls below would die with an UnboundLocalError.
        raise ValueError("ISA not supported.")
    parser = get_parser(isa)
    start_marker = parser.parse_file(start_marker_raw)
    end_marker = parser.parse_file(end_marker_raw)
    return start_marker, end_marker
def find_marked_section(
lines, parser, mov_instr, mov_reg, mov_vals, nop_bytes, reverse=False, comments=None
):
def find_marked_section(lines, parser, comments=None):
"""
Return indexes of marked section
:param list lines: kernel
:param parser: parser to use for checking
:type parser: :class:`~parser.BaseParser`
:param mov_instr: all MOV instruction possible for the marker
:type mov_instr: `list of str`
:param mov_reg: register used for the marker
:type mov_reg: `str`
:param mov_vals: values needed to be moved to ``mov_reg`` for valid marker
:type mov_vals: `list of int`
:param nop_bytes: bytes representing opcode of NOP
:type nop_bytes: `list of int`
:param reverse: indicating if ISA syntax requires reverse operand order, defaults to `False`
:type reverse: boolean, optional
:param comments: dictionary with start and end markers in comment format, defaults to None
:type comments: dict, optional
:returns: `tuple of int` -- start and end line of marked section
"""
# TODO match to instructions returned by get_marker
index_start = -1
index_end = -1
start_marker = parser.start_marker()
end_marker = parser.end_marker()
for i, line in enumerate(lines):
try:
if line.mnemonic is None and comments is not None and line.comment is not None:
@@ -141,59 +60,151 @@ def find_marked_section(
index_start = i + 1
elif comments["end"] == line.comment:
index_end = i
elif (
line.mnemonic in mov_instr
and len(lines) > i + 1
and lines[i + 1].directive is not None
):
source = line.operands[0 if not reverse else 1]
destination = line.operands[1 if not reverse else 0]
# instruction pair matches, check for operands
if (
isinstance(source, ImmediateOperand)
and parser.normalize_imd(source) == mov_vals[0]
and isinstance(destination, RegisterOperand)
and parser.get_full_reg_name(destination) == mov_reg
):
# operands of first instruction match start, check for second one
match, line_count = match_bytes(lines, i + 1, nop_bytes)
if match:
# return first line after the marker
index_start = i + 1 + line_count
elif (
isinstance(source, ImmediateOperand)
and parser.normalize_imd(source) == mov_vals[1]
and isinstance(destination, RegisterOperand)
and parser.get_full_reg_name(destination) == mov_reg
):
# operand of first instruction match end, check for second one
match, line_count = match_bytes(lines, i + 1, nop_bytes)
if match:
# return line of the marker
index_end = i
except TypeError:
print(i, line)
if index_start == -1:
matching_lines = match_lines(parser, lines[i:], start_marker)
if matching_lines > 0:
# Return the first line after the marker.
index_start = i + matching_lines
if index_end == -1:
if match_lines(parser, lines[i:], end_marker):
index_end = i
except TypeError as e:
print(i, e, line)
if index_start != -1 and index_end != -1:
break
return index_start, index_end
def match_bytes(lines, index, byte_list):
"""Match bytes directives of markers"""
# either all bytes are in one line or in separate ones
extracted_bytes = []
line_count = 0
while (
index < len(lines)
and lines[index].directive is not None
and lines[index].directive.name == "byte"
# This function and the following ones traverse the syntactic tree produced by the parser and try to
# match it to the marker. This is necessary because the IACA markers are significantly different on
# MSVC x86 than on other ISA/compilers. Therefore, simple string matching is not sufficient. Also,
# the syntax of numeric literals depends on the parser and should not be known to this class.
# The matching only checks for a limited number of properties (and the marker doesn't specify the
# rest).
def match_lines(parser, lines, marker):
    """
    Match the beginning of `lines` against `marker`.

    Each marker entry is either a single pattern or a list of alternative patterns. A single
    pattern may be matched across several consecutive code lines (see `Matching.Partial`);
    a list of alternatives must be fully matched by one code line.

    :param parser: parser used to produce `lines`; forwarded to `match_line`.
    :param list of `InstructionForm` lines: parsed assembly code.
    :param list of `InstructionForm` marker: pattern to match against the `lines`.
    :return int: the length of the match in the parsed code, 0 if there is no match. This
        includes the cases where the marker is empty or where `lines` ends before the whole
        marker has been matched.
    """
    marker_iter = iter(marker)
    marker_line = next(marker_iter, None)
    if marker_line is None:
        # An empty marker cannot match anything.
        return 0
    for matched_lines, line in enumerate(lines, start=1):
        if isinstance(marker_line, list):
            # No support for partial matching in lists: one alternative must match fully.
            if not any(
                match_line(parser, line, marker_alternative) == Matching.Full
                for marker_alternative in marker_line
            ):
                return 0
            marker_line = next(marker_iter, None)
        else:
            matching = match_line(parser, line, marker_line)
            if matching == Matching.No:
                return 0
            if matching == Matching.Full:
                # Move to the next marker line, the current one has been fully matched.
                marker_line = next(marker_iter, None)
            # On a partial match the same marker line is tried again: the call to
            # `match_line` consumed some of its directive parameters.
        # If we have reached the last marker line, the parsed code matches the marker.
        if not marker_line:
            return matched_lines
    # The code ended in the middle of the marker: report "no match" as an int so that
    # callers can safely compare the result with 0.
    return 0
def match_line(parser, line, marker_line):
    """
    Returns whether `line` matches `marker_line`.

    An instruction matches when both sides have the same mnemonic and matching operands. A
    directive matches when both sides have the same directive name and the parameters match
    according to `match_parameters`.

    :param parser: parser used to produce `line`; forwarded to `match_parameters`.
    :param `InstructionForm` line: parsed assembly code.
    :param `InstructionForm` marker_line: pattern to match against `line`.
    :return: Matching. In case of partial match, `marker_line` is modified and should be reused
        for matching the next line in the parsed assembly code.
    """
    if (
        line.mnemonic
        and marker_line.mnemonic
        and line.mnemonic == marker_line.mnemonic
        and match_operands(line.operands, marker_line.operands)
    ):
        return Matching.Full
    # Not a matching instruction: the only remaining possibility is a matching directive.
    directive = line.directive
    if not directive:
        return Matching.No
    marker_directive = marker_line.directive
    if not marker_directive or directive.name != marker_directive.name:
        return Matching.No
    return match_parameters(parser, directive.parameters, marker_directive.parameters)
def match_operands(line_operands, marker_line_operands):
    """
    Returns whether the operands of a code line match the operands of a marker line.

    :param list line_operands: operands of an instruction in the parsed assembly code.
    :param list marker_line_operands: operands of an instruction in the marker.
    :return bool: True iff both lists have the same length and the operands match pairwise
        according to `match_operand`.
    """
    if len(line_operands) != len(marker_line_operands):
        return False
    for line_operand, marker_line_operand in zip(line_operands, marker_line_operands):
        if not match_operand(line_operand, marker_line_operand):
            return False
    return True
def match_operand(line_operand, marker_line_operand):
    """
    Returns whether an operand of a code line matches an operand of a marker line.

    Two operands match if they are of the same kind and:
    - immediates: they have the same value;
    - registers: they have the same name, ignoring case;
    - memory references: their bases match and their offsets match (checked recursively).

    Anything else does not match. In particular a base or offset that is None never matches,
    not even another None, because None is none of the operand kinds above.

    :param line_operand: operand of an instruction in the parsed assembly code.
    :param marker_line_operand: operand of an instruction in the marker.
    :return bool: True iff the operands match.
    """
    if (
        isinstance(line_operand, ImmediateOperand)
        and isinstance(marker_line_operand, ImmediateOperand)
        and line_operand.value == marker_line_operand.value
    ):
        return True
    if (
        isinstance(line_operand, RegisterOperand)
        and isinstance(marker_line_operand, RegisterOperand)
        and line_operand.name.lower() == marker_line_operand.name.lower()
    ):
        return True
    if (
        isinstance(line_operand, MemoryOperand)
        and isinstance(marker_line_operand, MemoryOperand)
        and match_operand(line_operand.base, marker_line_operand.base)
        # Compare against the marker's offset; comparing the line's offset with itself
        # would accept any offset.
        and match_operand(line_operand.offset, marker_line_operand.offset)
    ):
        return True
    return False
def match_parameters(parser, line_parameters, marker_line_parameters):
    """
    Returns whether `line_parameters` matches `marker_line_parameters`.

    The parameters are compared pairwise, in order, with `match_parameter`. Every marker
    parameter that has been matched is removed from the front of `marker_line_parameters`,
    including those matched before a mismatch is found. Surplus parameters in
    `line_parameters` are ignored once the marker parameters are exhausted.

    :param parser: parser used to produce the code; forwarded to `match_parameter`.
    :param list of strings line_parameters: parameters of a directive in the parsed assembly code.
    :param list of strings marker_line_parameters: parameters of a directive in the marker.
    :return: Matching. In case of partial match, `marker_line_parameters` is modified and should be
        reused for matching the next line in the parsed assembly code.
    """
    # Pair against a snapshot so that the marker list can be consumed in place while looping.
    expected_parameters = tuple(marker_line_parameters)
    for line_parameter, marker_line_parameter in zip(line_parameters, expected_parameters):
        if not match_parameter(parser, line_parameter, marker_line_parameter):
            return Matching.No
        marker_line_parameters.pop(0)
    # Leftover marker parameters must be matched by the following code line(s).
    return Matching.Partial if marker_line_parameters else Matching.Full
def match_parameter(parser, line_parameter, marker_line_parameter):
    """
    Returns whether a directive parameter of the code matches a directive parameter of the marker.

    :param parser: parser whose `normalize_imd` is used to compare the parameters as immediates.
    :param str line_parameter: parameter of a directive in the parsed assembly code.
    :param str marker_line_parameter: parameter of a directive in the marker.
    :return bool: True iff the parameters are equal ignoring case or, failing that, normalize to
        the same immediate value.
    """
    if line_parameter.lower() == marker_line_parameter.lower():
        return True
    # The parameters don't match verbatim: check if they represent the same immediate value.
    code_immediate = ImmediateOperand(value=line_parameter)
    expected_immediate = ImmediateOperand(value=marker_line_parameter)
    code_value = parser.normalize_imd(code_immediate)
    expected_value = parser.normalize_imd(expected_immediate)
    return code_value == expected_value
def find_jump_labels(lines):