mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2026-01-05 02:30:08 +01:00
Add support for the Intel syntax supported by MSVC and ICC
This commit is contained in:
@@ -111,7 +111,8 @@ def extract_model(tree, arch, skip_mem=True):
|
||||
print("Skipping...", file=sys.stderr)
|
||||
return None
|
||||
mm = MachineModel(isa=isa)
|
||||
parser = get_parser(isa)
|
||||
# The model uses the AT&T syntax.
|
||||
parser = get_parser(isa, "ATT")
|
||||
|
||||
for instruction_tag in tree.findall(".//instruction"):
|
||||
ignore = False
|
||||
|
||||
@@ -11,7 +11,7 @@ from ruamel.yaml import YAML
|
||||
|
||||
from osaca.db_interface import import_benchmark_output, sanity_check
|
||||
from osaca.frontend import Frontend
|
||||
from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT
|
||||
from osaca.parser import BaseParser, ParserAArch64, ParserX86, ParserX86ATT, ParserX86Intel
|
||||
from osaca.semantics import (
|
||||
INSTR_FLAGS,
|
||||
ArchSemantics,
|
||||
@@ -47,6 +47,10 @@ DEFAULT_ARCHS = {
|
||||
"aarch64": "V2",
|
||||
"x86": "SPR",
|
||||
}
|
||||
SUPPORTED_SYNTAXES = [
|
||||
"ATT",
|
||||
"INTEL",
|
||||
]
|
||||
|
||||
|
||||
# Stolen from pip
|
||||
@@ -108,6 +112,12 @@ def create_parser(parser=None):
|
||||
"ZEN4, TX2, N1, A64FX, TSV110, A72, M1, V2). If no architecture is given, OSACA assumes a "
|
||||
"default uarch for x86/AArch64.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--syntax",
|
||||
type=str,
|
||||
help="Define the assembly syntax (ATT, Intel) for x86. If no syntax is given, OSACA "
|
||||
"tries to determine automatically the syntax to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fixed",
|
||||
action="store_true",
|
||||
@@ -232,6 +242,14 @@ def check_arguments(args, parser):
|
||||
parser.error(
|
||||
"Microarchitecture not supported. Please see --help for all valid architecture codes."
|
||||
)
|
||||
if args.syntax and args.arch and MachineModel.get_isa_for_arch(args.arch) != "x86":
|
||||
parser.error(
|
||||
"Syntax can only be explicitly specified for an x86 microarchitecture"
|
||||
)
|
||||
if args.syntax and args.syntax.upper() not in SUPPORTED_SYNTAXES:
|
||||
parser.error(
|
||||
"Assembly syntax not supported. Please see --help for all valid assembly syntaxes."
|
||||
)
|
||||
if "import_data" in args and args.import_data not in supported_import_files:
|
||||
parser.error(
|
||||
"Microbenchmark not supported for data import. Please see --help for all valid "
|
||||
@@ -310,30 +328,56 @@ def inspect(args, output_file=sys.stdout):
|
||||
code = args.file.read()
|
||||
|
||||
# Detect ISA if necessary
|
||||
arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
|
||||
print_arch_warning = False if args.arch else True
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
detected_isa, detected_syntax = BaseParser.detect_ISA(code)
|
||||
detected_arch = DEFAULT_ARCHS[detected_isa]
|
||||
|
||||
print_arch_warning = not args.arch
|
||||
verbose = args.verbose
|
||||
ignore_unknown = args.ignore_unknown
|
||||
|
||||
# Parse file
|
||||
parser = get_asm_parser(arch)
|
||||
try:
|
||||
parsed_code = parser.parse_file(code)
|
||||
except Exception as e:
|
||||
# probably the wrong parser based on heuristic
|
||||
if args.arch is None:
|
||||
# change ISA and try again
|
||||
arch = (
|
||||
DEFAULT_ARCHS["x86"]
|
||||
if BaseParser.detect_ISA(code) == "aarch64"
|
||||
else DEFAULT_ARCHS["aarch64"]
|
||||
)
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
parser = get_asm_parser(arch)
|
||||
# If the arch/syntax is explicitly specified, that's the only thing we'll try. Otherwise, we'll
|
||||
# look at all the possible archs/syntaxes, but with our detected arch/syntax last in the list,
|
||||
# thus tried first.
|
||||
if args.arch:
|
||||
archs_to_try = [args.arch]
|
||||
else:
|
||||
archs_to_try = list(DEFAULT_ARCHS)
|
||||
archs_to_try.remove(detected_arch)
|
||||
archs_to_try.append(detected_arch)
|
||||
if args.syntax:
|
||||
syntaxes_to_try = [args.syntax]
|
||||
else:
|
||||
syntaxes_to_try = SUPPORTED_SYNTAXES + [None]
|
||||
syntaxes_to_try.remove(detected_syntax)
|
||||
syntaxes_to_try.append(detected_syntax)
|
||||
|
||||
# Filter the cross-product of archs and syntaxes to eliminate the combinations that don't make
|
||||
# sense.
|
||||
combinations_to_try = [
|
||||
(arch, syntax)
|
||||
for arch in archs_to_try
|
||||
for syntax in syntaxes_to_try
|
||||
if (syntax != None) == (MachineModel.get_isa_for_arch(arch) == "x86")
|
||||
]
|
||||
|
||||
# Parse file.
|
||||
message = ""
|
||||
single_combination = len(combinations_to_try) == 1
|
||||
while True:
|
||||
arch, syntax = combinations_to_try.pop()
|
||||
parser = get_asm_parser(arch, syntax)
|
||||
try:
|
||||
parsed_code = parser.parse_file(code)
|
||||
else:
|
||||
raise e
|
||||
break
|
||||
except Exception as e:
|
||||
message += f"\nWith arch {arch} and syntax {syntax} got error: {e}."
|
||||
# Either the wrong parser based on heuristic, or a bona fide syntax error (or
|
||||
# unsupported syntax). For ease of debugging, we emit the entire exception trace if
|
||||
# we tried a single arch/syntax combination. If we tried multiple combinations, we
|
||||
# don't emit the traceback as it would apply to the latest combination tried, which is
|
||||
# probably the less interesting.
|
||||
if not combinations_to_try:
|
||||
raise SyntaxError(message) from e if single_combination else None
|
||||
|
||||
# Reduce to marked kernel or chosen section and add semantics
|
||||
if args.lines:
|
||||
@@ -341,13 +385,14 @@ def inspect(args, output_file=sys.stdout):
|
||||
kernel = [line for line in parsed_code if line.line_number in line_range]
|
||||
print_length_warning = False
|
||||
else:
|
||||
kernel = reduce_to_section(parsed_code, isa)
|
||||
kernel = reduce_to_section(parsed_code, parser)
|
||||
# Print warning if kernel has no markers and is larger than threshold (100)
|
||||
print_length_warning = (
|
||||
True if len(kernel) == len(parsed_code) and len(kernel) > 100 else False
|
||||
)
|
||||
machine_model = MachineModel(arch=arch)
|
||||
semantics = ArchSemantics(machine_model)
|
||||
semantics = ArchSemantics(parser, machine_model)
|
||||
semantics.normalize_instruction_forms(kernel)
|
||||
semantics.add_semantics(kernel)
|
||||
# Do optimal schedule for kernel throughput if wished
|
||||
if not args.fixed:
|
||||
@@ -417,7 +462,7 @@ def run(args, output_file=sys.stdout):
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def get_asm_parser(arch) -> BaseParser:
|
||||
def get_asm_parser(arch, syntax) -> BaseParser:
|
||||
"""
|
||||
Helper function to create the right parser for a specific architecture.
|
||||
|
||||
@@ -427,7 +472,7 @@ def get_asm_parser(arch) -> BaseParser:
|
||||
"""
|
||||
isa = MachineModel.get_isa_for_arch(arch)
|
||||
if isa == "x86":
|
||||
return ParserX86ATT()
|
||||
return ParserX86ATT() if syntax == "ATT" else ParserX86Intel()
|
||||
elif isa == "aarch64":
|
||||
return ParserAArch64()
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
"""
|
||||
Collection of parsers supported by OSACA.
|
||||
|
||||
Only the parser below will be exported, so please add new parsers to __all__.
|
||||
Only the parsers below will be exported, so please add new parsers to __all__.
|
||||
"""
|
||||
|
||||
from .base_parser import BaseParser
|
||||
from .parser_x86 import ParserX86
|
||||
from .parser_x86att import ParserX86ATT
|
||||
from .parser_x86intel import ParserX86Intel
|
||||
from .parser_AArch64 import ParserAArch64
|
||||
from .instruction_form import InstructionForm
|
||||
from .operand import Operand
|
||||
@@ -14,15 +16,17 @@ __all__ = [
|
||||
"Operand",
|
||||
"InstructionForm",
|
||||
"BaseParser",
|
||||
"ParserX86",
|
||||
"ParserX86ATT",
|
||||
"ParserX86Intel",
|
||||
"ParserAArch64",
|
||||
"get_parser",
|
||||
]
|
||||
|
||||
|
||||
def get_parser(isa):
|
||||
def get_parser(isa, syntax):
|
||||
if isa.lower() == "x86":
|
||||
return ParserX86ATT()
|
||||
return ParserX86ATT() if syntax == "ATT" else ParserX86Intel()
|
||||
elif isa.lower() == "aarch64":
|
||||
return ParserAArch64()
|
||||
else:
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
import operator
|
||||
import re
|
||||
|
||||
from osaca.semantics.hw_model import MachineModel
|
||||
|
||||
|
||||
class BaseParser(object):
|
||||
# Identifiers for operand types
|
||||
@@ -25,20 +27,62 @@ class BaseParser(object):
|
||||
self.construct_parser()
|
||||
self._parser_constructed = True
|
||||
|
||||
def isa(self):
|
||||
# Done in derived classes
|
||||
raise NotImplementedError
|
||||
|
||||
# The marker functions return lists of `InstructionForm` that are used to find the IACA markers
|
||||
# in the parsed code. In addition to just a list, the marker may have a structure like
|
||||
# [I1, [I2, I3], I4, ...] where the nested list indicates that at least one of I2 and I3 must
|
||||
# match the second instruction in the fragment of parsed code.
|
||||
# If an instruction form is a `DirectiveOperand`, the match may happen over several directive
|
||||
# operands in the parsed code, provided that the directives have the same name and the
|
||||
# parameters are in sequence with respect to the pattern. This provides an easy way to describe
|
||||
# a sequence of bytes irrespective of the way it was grouped in the assembly source.
|
||||
# Note that markers must be matched *before* normalization.
|
||||
def start_marker(self):
|
||||
# Done in derived classes
|
||||
raise NotImplementedError
|
||||
|
||||
def end_marker(self):
|
||||
# Done in derived classes
|
||||
raise NotImplementedError
|
||||
|
||||
# Performs all the normalization needed to match the instruction to the ISA/arch model. This
|
||||
# method must set the `normalized` property of the instruction and must be idempotent.
|
||||
def normalize_instruction_form(
|
||||
self,
|
||||
instruction_form,
|
||||
isa_model: MachineModel,
|
||||
arch_model: MachineModel
|
||||
):
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def detect_ISA(file_content):
|
||||
"""Detect the ISA of the assembly based on the used registers and return the ISA code."""
|
||||
"""
|
||||
Detect the ISA of the assembly based on the used registers and return the ISA code.
|
||||
|
||||
:param str file_content: assembly code.
|
||||
:return: a tuple isa, syntax describing the architecture and the assembly syntax,
|
||||
if appropriate. If there is no notion of syntax, the second element is None.
|
||||
"""
|
||||
# Check for the amount of registers in the code to determine the ISA
|
||||
# 1) Check for xmm, ymm, zmm, rax, rbx, rcx, and rdx registers in x86
|
||||
# AT&T syntax. There is a % before each register name.
|
||||
heuristics_x86ATT = [r"%[xyz]mm[0-9]", r"%[er][abcd]x[0-9]"]
|
||||
# 2) check for v and z vector registers and x/w general-purpose registers
|
||||
# 2) Same as above, but for the Intel syntax. There is no % before the register names.
|
||||
heuristics_x86Intel = [r"[^%][xyz]mm[0-9]", r"[^%][er][abcd]x[0-9]"]
|
||||
# 3) check for v and z vector registers and x/w general-purpose registers
|
||||
heuristics_aarch64 = [r"[vz][0-9][0-9]?\.[0-9][0-9]?[bhsd]", r"[wx][0-9]"]
|
||||
matches = {"x86": 0, "aarch64": 0}
|
||||
matches = {("x86", "ATT"): 0, ("x86", "INTEL"): 0, ("aarch64", None): 0}
|
||||
|
||||
for h in heuristics_x86ATT:
|
||||
matches["x86"] += len(re.findall(h, file_content))
|
||||
matches[("x86", "ATT")] += len(re.findall(h, file_content))
|
||||
for h in heuristics_x86Intel:
|
||||
matches[("x86", "INTEL")] += len(re.findall(h, file_content))
|
||||
for h in heuristics_aarch64:
|
||||
matches["aarch64"] += len(re.findall(h, file_content))
|
||||
matches[("aarch64", None)] += len(re.findall(h, file_content))
|
||||
|
||||
return max(matches.items(), key=operator.itemgetter(1))[0]
|
||||
|
||||
@@ -94,6 +138,14 @@ class BaseParser(object):
|
||||
def get_full_reg_name(self, register):
|
||||
raise NotImplementedError
|
||||
|
||||
# Must be called on a *normalized* instruction.
|
||||
def get_regular_source_operands(self, instruction_form):
|
||||
raise NotImplementedError
|
||||
|
||||
# Must be called on a *normalized* instruction.
|
||||
def get_regular_destination_operands(self, instruction_form):
|
||||
raise NotImplementedError
|
||||
|
||||
def normalize_imd(self, imd):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -41,3 +41,12 @@ class IdentifierOperand(Operand):
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, IdentifierOperand):
|
||||
return (
|
||||
self._name == other._name
|
||||
and self._offset == other._offset
|
||||
and self._relocation == other._relocation
|
||||
)
|
||||
return False
|
||||
|
||||
@@ -19,6 +19,7 @@ class InstructionForm:
|
||||
port_pressure=None,
|
||||
operation=None,
|
||||
breaks_dependency_on_equal_operands=False,
|
||||
normalized=False,
|
||||
):
|
||||
self._mnemonic = mnemonic
|
||||
self._operands = operands
|
||||
@@ -33,6 +34,7 @@ class InstructionForm:
|
||||
self._operation = operation
|
||||
self._uops = uops
|
||||
self._breaks_dependency_on_equal_operands = breaks_dependency_on_equal_operands
|
||||
self._normalized = normalized
|
||||
self._latency = latency
|
||||
self._throughput = throughput
|
||||
self._latency_cp = []
|
||||
@@ -42,6 +44,10 @@ class InstructionForm:
|
||||
self._port_uops = []
|
||||
self._flags = []
|
||||
|
||||
def check_normalized(self):
|
||||
if not self._normalized:
|
||||
raise AssertionError("Unnormalized instruction")
|
||||
|
||||
@property
|
||||
def semantic_operands(self):
|
||||
return self._semantic_operands
|
||||
@@ -114,6 +120,10 @@ class InstructionForm:
|
||||
def breaks_dependency_on_equal_operands(self):
|
||||
return self._breaks_dependency_on_equal_operands
|
||||
|
||||
@property
|
||||
def normalized(self):
|
||||
return self._normalized
|
||||
|
||||
@semantic_operands.setter
|
||||
def semantic_operands(self, semantic_operands):
|
||||
self._semantic_operands = semantic_operands
|
||||
@@ -142,6 +152,10 @@ class InstructionForm:
|
||||
def breaks_dependency_on_equal_operands(self, boolean):
|
||||
self._breaks_dependency_on_equal_operands = boolean
|
||||
|
||||
@normalized.setter
|
||||
def normalized(self, normalized):
|
||||
self._normalized = normalized
|
||||
|
||||
@mnemonic.setter
|
||||
def mnemonic(self, mnemonic):
|
||||
self._mnemonic = mnemonic
|
||||
|
||||
@@ -20,3 +20,8 @@ class LabelOperand(Operand):
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, LabelOperand):
|
||||
return self._name == other._name
|
||||
return False
|
||||
|
||||
@@ -15,6 +15,7 @@ class MemoryOperand(Operand):
|
||||
pre_indexed=False,
|
||||
post_indexed=False,
|
||||
indexed_val=None,
|
||||
data_type=None,
|
||||
src=None,
|
||||
dst=None,
|
||||
source=False,
|
||||
@@ -30,6 +31,7 @@ class MemoryOperand(Operand):
|
||||
self._pre_indexed = pre_indexed
|
||||
self._post_indexed = post_indexed
|
||||
self._indexed_val = indexed_val
|
||||
self._data_type = data_type
|
||||
# type of register we store from (`src`) or load to (`dst`)
|
||||
self._src = src
|
||||
self._dst = dst
|
||||
@@ -74,6 +76,14 @@ class MemoryOperand(Operand):
|
||||
def indexed_val(self):
|
||||
return self._indexed_val
|
||||
|
||||
@property
|
||||
def data_type(self):
|
||||
return self._data_type
|
||||
|
||||
@data_type.setter
|
||||
def data_type(self, data_type):
|
||||
self._data_type = data_type
|
||||
|
||||
@property
|
||||
def src(self):
|
||||
return self._src
|
||||
|
||||
@@ -13,6 +13,7 @@ from osaca.parser.identifier import IdentifierOperand
|
||||
from osaca.parser.immediate import ImmediateOperand
|
||||
from osaca.parser.condition import ConditionOperand
|
||||
from osaca.parser.prefetch import PrefetchOperand
|
||||
from osaca.semantics.hw_model import MachineModel
|
||||
|
||||
|
||||
class ParserAArch64(BaseParser):
|
||||
@@ -26,7 +27,58 @@ class ParserAArch64(BaseParser):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.isa = "aarch64"
|
||||
|
||||
def isa(self):
|
||||
return "aarch64"
|
||||
|
||||
def start_marker(self):
|
||||
return [
|
||||
InstructionForm(
|
||||
mnemonic="mov",
|
||||
operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=111)]
|
||||
),
|
||||
InstructionForm(
|
||||
directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
|
||||
)
|
||||
]
|
||||
|
||||
def end_marker(self):
|
||||
return [
|
||||
InstructionForm(
|
||||
mnemonic="mov",
|
||||
operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=222)]
|
||||
),
|
||||
InstructionForm(
|
||||
directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
|
||||
)
|
||||
]
|
||||
|
||||
def normalize_instruction_form(
|
||||
self,
|
||||
instruction_form,
|
||||
isa_model: MachineModel,
|
||||
arch_model: MachineModel
|
||||
):
|
||||
"""
|
||||
If the instruction doesn't exist in the machine model, normalize it by dropping the shape
|
||||
suffix.
|
||||
"""
|
||||
if instruction_form.normalized:
|
||||
return
|
||||
instruction_form.normalized = True
|
||||
|
||||
mnemonic = instruction_form.mnemonic
|
||||
if not mnemonic:
|
||||
return
|
||||
model = arch_model.get_instruction(mnemonic, instruction_form.operands)
|
||||
if not model:
|
||||
if "." in mnemonic:
|
||||
# Check for instruction without shape/cc suffix.
|
||||
suffix_start = mnemonic.index(".")
|
||||
mnemonic = mnemonic[:suffix_start]
|
||||
model = arch_model.get_instruction(mnemonic, instruction_form.operands)
|
||||
if model:
|
||||
instruction_form.mnemonic = mnemonic
|
||||
|
||||
def construct_parser(self):
|
||||
"""Create parser for ARM AArch64 ISA."""
|
||||
@@ -589,6 +641,21 @@ class ParserAArch64(BaseParser):
|
||||
name += "[" + str(register.index) + "]"
|
||||
return name
|
||||
|
||||
def get_regular_source_operands(self, instruction_form):
|
||||
"""Get source operand of given instruction form assuming regular src/dst behavior."""
|
||||
# if there is only one operand, assume it is a source operand
|
||||
if len(instruction_form.operands) == 1:
|
||||
return [instruction_form.operands[0]]
|
||||
return [op for op in instruction_form.operands[1:]]
|
||||
|
||||
def get_regular_destination_operands(self, instruction_form):
|
||||
"""Get destination operand of given instruction form assuming regular src/dst behavior."""
|
||||
# if there is only one operand, assume no destination
|
||||
if len(instruction_form.operands) == 1:
|
||||
return []
|
||||
# return first operand
|
||||
return instruction_form.operands[:1]
|
||||
|
||||
def normalize_imd(self, imd):
|
||||
"""Normalize immediate to decimal based representation"""
|
||||
if isinstance(imd, IdentifierOperand):
|
||||
|
||||
123
osaca/parser/parser_x86.py
Normal file
123
osaca/parser/parser_x86.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import re
|
||||
import string
|
||||
|
||||
from osaca.parser import BaseParser
|
||||
|
||||
|
||||
class ParserX86(BaseParser):
    """
    Syntax-independent helpers shared by the x86 parsers.

    Provides register classification, register/flag dependency checks and the default
    source/destination operand split, in which the last operand is the destination.
    """

    _instance = None

    # Families of basic (non-numbered) general purpose registers. All names within a
    # family alias the same architectural register and therefore depend on each other.
    _GPR_GROUPS = (
        frozenset(("RAX", "EAX", "AX", "AH", "AL")),
        frozenset(("RBX", "EBX", "BX", "BH", "BL")),
        frozenset(("RCX", "ECX", "CX", "CH", "CL")),
        frozenset(("RDX", "EDX", "DX", "DH", "DL")),
        frozenset(("RSP", "ESP", "SP", "SPL")),
        frozenset(("RBP", "EBP", "BP", "BPL")),
        frozenset(("RSI", "ESI", "SI", "SIL")),
        frozenset(("RDI", "EDI", "DI", "DIL")),
    )
    # Numbered general purpose registers (e.g. r8, r10d) with an optional size suffix.
    _NUMBERED_GPR = re.compile(r"R([0-9]+)[DWB]?", re.IGNORECASE)
    _VECTOR_PREFIXES = ("mm", "xmm", "ymm", "zmm")

    # Singleton pattern, as this is created very many times.
    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(ParserX86, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        super().__init__()

    def isa(self):
        """Return the ISA code handled by this parser."""
        return "x86"

    def is_reg_dependend_of(self, reg_a, reg_b):
        """
        Check if ``reg_a`` is dependent on ``reg_b``.

        Two registers are dependent when they have the same name (ignoring case), when
        they are vector registers with the same number, when they are basic general
        purpose registers of the same family (e.g. ``eax`` and ``al``), or when they are
        numbered general purpose registers with the same number (e.g. ``r8`` and ``r8d``).

        :return: True if the registers are dependent, False otherwise.
        """
        reg_a_name = reg_a.name.upper()
        reg_b_name = reg_b.name.upper()

        # Check if they are the same registers
        if reg_a_name == reg_b_name:
            return True

        # Check vector registers first. Dropping the first letter maps xmmN, ymmN and zmmN
        # to the same key ("MMN"), while mmN keeps a distinct one ("MN").
        if self.is_vector_register(reg_a):
            return self.is_vector_register(reg_b) and reg_a_name[1:] == reg_b_name[1:]

        # Check basic GPRs
        if self.is_basic_gpr(reg_a):
            if not self.is_basic_gpr(reg_b):
                return False
            return any(
                reg_a_name in group and reg_b_name in group for group in self._GPR_GROUPS
            )

        # Check other GPRs
        ma = self._NUMBERED_GPR.match(reg_a_name)
        mb = self._NUMBERED_GPR.match(reg_b_name)
        return bool(ma and mb and ma.group(1) == mb.group(1))

    def is_basic_gpr(self, register):
        """Check if register is a basic general purpose register (ebx, rax, ...)"""
        name = register.name
        # Anything carrying a digit is a numbered or vector register.
        if any(char.isdigit() for char in name):
            return False
        return not name.lower().startswith(self._VECTOR_PREFIXES)

    def is_gpr(self, register):
        """Check if register is a general purpose register"""
        if register is None:
            return False
        if self.is_basic_gpr(register):
            return True
        return bool(self._NUMBERED_GPR.match(register.name))

    def is_vector_register(self, register):
        """Check if register is a vector register"""
        if register is None or register.name is None:
            return False
        return register.name.rstrip(string.digits).lower() in self._VECTOR_PREFIXES

    def get_reg_type(self, register):
        """
        Get register type.

        :return: ``"gpr"`` for general purpose registers, the lower-case register prefix
                 (``"mm"``, ``"xmm"``, ``"ymm"`` or ``"zmm"``) for vector registers, and
                 False if ``register`` is None.
        :raises ValueError: if the register is neither a general purpose nor a vector
                            register.
        """
        if register is None:
            return False
        if self.is_gpr(register):
            return "gpr"
        if self.is_vector_register(register):
            return register.name.rstrip(string.digits).lower()
        raise ValueError

    def is_flag_dependend_of(self, flag_a, flag_b):
        """Check if ``flag_a`` is dependent on ``flag_b``"""
        # we assume flags are independent of each other, e.g., CF can be read while ZF gets written
        # TODO validate this assumption
        return flag_a.name == flag_b.name

    def get_regular_source_operands(self, instruction_form):
        """Get source operand of given instruction form assuming regular src/dst behavior."""
        operands = instruction_form.operands
        # if there is only one operand, assume it is a source operand
        if len(operands) == 1:
            return [operands[0]]
        # return all but last operand
        return list(operands[:-1])

    def get_regular_destination_operands(self, instruction_form):
        """Get destination operand of given instruction form assuming regular src/dst behavior."""
        operands = instruction_form.operands
        # if there is only one operand, assume no destination
        if len(operands) == 1:
            return []
        # return last operand
        return operands[-1:]
|
||||
@@ -5,7 +5,7 @@ import re
|
||||
|
||||
import pyparsing as pp
|
||||
|
||||
from osaca.parser import BaseParser
|
||||
from osaca.parser import ParserX86
|
||||
from osaca.parser.instruction_form import InstructionForm
|
||||
from osaca.parser.directive import DirectiveOperand
|
||||
from osaca.parser.memory import MemoryOperand
|
||||
@@ -13,10 +13,12 @@ from osaca.parser.label import LabelOperand
|
||||
from osaca.parser.register import RegisterOperand
|
||||
from osaca.parser.identifier import IdentifierOperand
|
||||
from osaca.parser.immediate import ImmediateOperand
|
||||
from osaca.semantics.hw_model import MachineModel
|
||||
|
||||
|
||||
class ParserX86ATT(BaseParser):
|
||||
class ParserX86ATT(ParserX86):
|
||||
_instance = None
|
||||
GAS_SUFFIXES = "bswlqt"
|
||||
|
||||
# Singleton pattern, as this is created very many times
|
||||
def __new__(cls):
|
||||
@@ -26,7 +28,66 @@ class ParserX86ATT(BaseParser):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.isa = "x86"
|
||||
|
||||
def start_marker(self):
|
||||
return [
|
||||
[
|
||||
InstructionForm(
|
||||
mnemonic="mov",
|
||||
operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")]
|
||||
),
|
||||
InstructionForm(
|
||||
mnemonic="movl",
|
||||
operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")]
|
||||
)
|
||||
],
|
||||
InstructionForm(
|
||||
directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
|
||||
)
|
||||
]
|
||||
|
||||
def end_marker(self):
|
||||
return [
|
||||
[
|
||||
InstructionForm(
|
||||
mnemonic="mov",
|
||||
operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")]
|
||||
),
|
||||
InstructionForm(
|
||||
mnemonic="movl",
|
||||
operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")]
|
||||
)
|
||||
],
|
||||
InstructionForm(
|
||||
directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
|
||||
)
|
||||
]
|
||||
|
||||
def normalize_instruction_form(
|
||||
self,
|
||||
instruction_form,
|
||||
isa_model: MachineModel,
|
||||
arch_model: MachineModel
|
||||
):
|
||||
"""
|
||||
If the instruction doesn't exist in the machine model, normalize it by dropping the GAS
|
||||
suffix.
|
||||
"""
|
||||
if instruction_form.normalized:
|
||||
return
|
||||
instruction_form.normalized = True
|
||||
|
||||
mnemonic = instruction_form.mnemonic
|
||||
if not mnemonic:
|
||||
return
|
||||
model = arch_model.get_instruction(mnemonic, instruction_form.operands)
|
||||
if not model:
|
||||
# Check for instruction without GAS suffix.
|
||||
if mnemonic[-1] in self.GAS_SUFFIXES:
|
||||
mnemonic = mnemonic[:-1]
|
||||
model = arch_model.get_instruction(mnemonic, instruction_form.operands)
|
||||
if model:
|
||||
instruction_form.mnemonic = mnemonic
|
||||
|
||||
def construct_parser(self):
|
||||
"""Create parser for x86 AT&T ISA."""
|
||||
@@ -253,10 +314,10 @@ class ParserX86ATT(BaseParser):
|
||||
if result is None:
|
||||
try:
|
||||
result = self.parse_instruction(line)
|
||||
except pp.ParseException:
|
||||
except pp.ParseException as e:
|
||||
raise ValueError(
|
||||
"Could not parse instruction on line {}: {!r}".format(line_number, line)
|
||||
)
|
||||
) from e
|
||||
instruction_form.mnemonic = result.mnemonic
|
||||
instruction_form.operands = result.operands
|
||||
instruction_form.comment = result.comment
|
||||
@@ -393,90 +454,3 @@ class ParserX86ATT(BaseParser):
|
||||
return imd.value
|
||||
# identifier
|
||||
return imd
|
||||
|
||||
def is_flag_dependend_of(self, flag_a, flag_b):
|
||||
"""Check if ``flag_a`` is dependent on ``flag_b``"""
|
||||
# we assume flags are independent of each other, e.g., CF can be read while ZF gets written
|
||||
# TODO validate this assumption
|
||||
return flag_a.name == flag_b.name
|
||||
|
||||
def is_reg_dependend_of(self, reg_a, reg_b):
|
||||
"""Check if ``reg_a`` is dependent on ``reg_b``"""
|
||||
reg_a_name = reg_a.name.upper()
|
||||
reg_b_name = reg_b.name.upper()
|
||||
|
||||
# Check if they are the same registers
|
||||
if reg_a_name == reg_b_name:
|
||||
return True
|
||||
# Check vector registers first
|
||||
if self.is_vector_register(reg_a):
|
||||
if self.is_vector_register(reg_b):
|
||||
if reg_a_name[1:] == reg_b_name[1:]:
|
||||
# Registers in the same vector space
|
||||
return True
|
||||
return False
|
||||
# Check basic GPRs
|
||||
gpr_groups = {
|
||||
"A": ["RAX", "EAX", "AX", "AH", "AL"],
|
||||
"B": ["RBX", "EBX", "BX", "BH", "BL"],
|
||||
"C": ["RCX", "ECX", "CX", "CH", "CL"],
|
||||
"D": ["RDX", "EDX", "DX", "DH", "DL"],
|
||||
"SP": ["RSP", "ESP", "SP", "SPL"],
|
||||
"SRC": ["RSI", "ESI", "SI", "SIL"],
|
||||
"DST": ["RDI", "EDI", "DI", "DIL"],
|
||||
}
|
||||
if self.is_basic_gpr(reg_a):
|
||||
if self.is_basic_gpr(reg_b):
|
||||
for dep_group in gpr_groups.values():
|
||||
if reg_a_name in dep_group:
|
||||
if reg_b_name in dep_group:
|
||||
return True
|
||||
return False
|
||||
|
||||
# Check other GPRs
|
||||
ma = re.match(r"R([0-9]+)[DWB]?", reg_a_name)
|
||||
mb = re.match(r"R([0-9]+)[DWB]?", reg_b_name)
|
||||
if ma and mb and ma.group(1) == mb.group(1):
|
||||
return True
|
||||
|
||||
# No dependencies
|
||||
return False
|
||||
|
||||
def is_basic_gpr(self, register):
|
||||
"""Check if register is a basic general purpose register (ebi, rax, ...)"""
|
||||
if any(char.isdigit() for char in register.name) or any(
|
||||
register.name.lower().startswith(x) for x in ["mm", "xmm", "ymm", "zmm"]
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_gpr(self, register):
|
||||
"""Check if register is a general purpose register"""
|
||||
if register is None:
|
||||
return False
|
||||
if self.is_basic_gpr(register):
|
||||
return True
|
||||
return re.match(r"R([0-9]+)[DWB]?", register.name, re.IGNORECASE)
|
||||
|
||||
def is_vector_register(self, register):
|
||||
"""Check if register is a vector register"""
|
||||
if register is None or register.name is None:
|
||||
return False
|
||||
if register.name.rstrip(string.digits).lower() in [
|
||||
"mm",
|
||||
"xmm",
|
||||
"ymm",
|
||||
"zmm",
|
||||
]:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_reg_type(self, register):
|
||||
"""Get register type"""
|
||||
if register is None:
|
||||
return False
|
||||
if self.is_gpr(register):
|
||||
return "gpr"
|
||||
elif self.is_vector_register(register):
|
||||
return register.name.rstrip(string.digits).lower()
|
||||
raise ValueError
|
||||
|
||||
830
osaca/parser/parser_x86intel.py
Normal file
830
osaca/parser/parser_x86intel.py
Normal file
@@ -0,0 +1,830 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import pyparsing as pp
|
||||
import re
|
||||
import string
|
||||
import unicodedata
|
||||
|
||||
from osaca.parser import ParserX86
|
||||
from osaca.parser.directive import DirectiveOperand
|
||||
from osaca.parser.identifier import IdentifierOperand
|
||||
from osaca.parser.immediate import ImmediateOperand
|
||||
from osaca.parser.instruction_form import InstructionForm
|
||||
from osaca.parser.label import LabelOperand
|
||||
from osaca.parser.memory import MemoryOperand
|
||||
from osaca.parser.register import RegisterOperand
|
||||
from osaca.semantics.hw_model import MachineModel
|
||||
|
||||
# We assume any non-ASCII characters except control characters and line terminators can be part of
|
||||
# identifiers; this is based on the assumption that no assembler uses non-ASCII white space and
|
||||
# syntax characters.
|
||||
# This approach is described at the end of https://www.unicode.org/reports/tr55/#Whitespace-Syntax.
|
||||
# It is appropriate for tools, such as this one, which process source code but do not fully validate
|
||||
# it (in this case, that’s the job of the assembler).
|
||||
NON_ASCII_PRINTABLE_CHARACTERS = "".join(
|
||||
chr(cp) for cp in range(0x80, 0x10FFFF + 1)
|
||||
if unicodedata.category(chr(cp)) not in ("Cc", "Zl", "Zp", "Cs", "Cn")
|
||||
)
|
||||
|
||||
# References:
|
||||
# ASM386 Assembly Language Reference, document number 469165-003, https://mirror.math.princeton.edu/pub/oldlinux/Linux.old/Ref-docs/asm-ref.pdf.
|
||||
# Microsoft Macro Assembler BNF Grammar, https://learn.microsoft.com/en-us/cpp/assembler/masm/masm-bnf-grammar?view=msvc-170.
|
||||
# Intel Architecture Code Analyzer User's Guide, https://www.intel.com/content/dam/develop/external/us/en/documents/intel-architecture-code-analyzer-3-0-users-guide-157552.pdf.
|
||||
class ParserX86Intel(ParserX86):
|
||||
_instance = None
|
||||
|
||||
# Singleton pattern, as this is created very many times.
|
||||
def __new__(cls):
|
||||
if cls._instance is None:
|
||||
cls._instance = super(ParserX86Intel, cls).__new__(cls)
|
||||
return cls._instance
|
||||
|
||||
def __init__(self):
    """Initialise the base parser and start with an empty "=" symbol table."""
    super().__init__()
    # Symbols defined through "=" directives, mapped to the text of their value.
    self._equ = dict()
|
||||
|
||||
# The IACA manual says: "For For Microsoft* Visual C++ compiler, 64-bit version, use
|
||||
# IACA_VC64_START and IACA_VC64_END, instead" (of IACA_START and IACA_END).
|
||||
# TODO: Inconveniently, the code generated with optimization disabled (/Od) has two
|
||||
# instructions. We should support both patterns, but then who runs OSACA with /Od?
|
||||
def start_marker(self):
    """
    Return the instruction sequence used as start marker.

    It is a single ``mov`` of the immediate 111 to a memory operand whose base is the GS
    register and whose offset is 111.
    """
    destination = MemoryOperand(
        base=RegisterOperand(name="GS"),
        offset=ImmediateOperand(value=111),
    )
    source = ImmediateOperand(value=111)
    marker = InstructionForm(mnemonic="mov", operands=[destination, source])
    return [marker]
|
||||
|
||||
def end_marker(self):
    """
    Return the instruction sequence used as end marker.

    It is a single ``mov`` of the immediate 222 to a memory operand whose base is the GS
    register and whose offset is 222.
    """
    destination = MemoryOperand(
        base=RegisterOperand(name="GS"),
        offset=ImmediateOperand(value=222),
    )
    source = ImmediateOperand(value=222)
    marker = InstructionForm(mnemonic="mov", operands=[destination, source])
    return [marker]
|
||||
|
||||
def normalize_instruction_form(
    self,
    instruction_form,
    isa_model: MachineModel,
    arch_model: MachineModel
):
    """
    Bring an Intel-syntax instruction form into the operand order used by the models.

    The form is modified in place and flagged as normalized; a form that is already
    normalized, or that has no mnemonic, is left alone.  Three adjustments are made:

    1. If the architecture model does not know the mnemonic, the VEX ("v"-prefixed) or
       non-VEX variant is used instead when the model knows that one.
    2. If the ISA model says the instruction has a single destination that is the last
       operand (or the instruction is unknown and has more than one operand), the operands
       are reversed, which effectively converts the Intel order to the AT&T one.
    3. If a memory operand carries a DWORD/QWORD data type and a model knows the mnemonic
       with the matching "d"/"q" suffix, the suffixed mnemonic is used.

    :param instruction_form: instruction form to normalize in place
    :param isa_model: model providing the source/destination information
    :param arch_model: model of the target micro-architecture
    """
    if instruction_form.normalized:
        return
    instruction_form.normalized = True

    mnemonic = instruction_form.mnemonic
    if not mnemonic:
        return

    # Reversing the operands never changes their number, so the arity is computed once.
    arity = len(instruction_form.operands)

    # The model may only contain the VEX-encoded instruction and we may have the
    # non-VEX-encoded one, or vice-versa. Note that this doesn't work when the arguments
    # differ between VEX-encoded and non-VEX-encoded, e.g., for psubq.
    if not arch_model.get_instruction(mnemonic, arity):
        if mnemonic[0] == 'v':
            alternative_mnemonic = mnemonic[1:]
        else:
            alternative_mnemonic = 'v' + mnemonic
        if arch_model.get_instruction(alternative_mnemonic, arity):
            mnemonic = alternative_mnemonic
        instruction_form.mnemonic = mnemonic

    # We cannot pass the operands because they may not match before the reordering. We
    # just pass the arity instead. Also, this must use the ISA model, because that's where
    # the source/destination information is found.
    model = isa_model.get_instruction(mnemonic, arity)
    has_single_destination_at_end = False
    has_destination = False
    if model:
        for model_operand in model.operands:
            if has_destination and (model_operand.source or model_operand.destination):
                # Something follows the first destination: it is not at the end.
                has_single_destination_at_end = False
            elif model_operand.destination:
                has_destination = True
                has_single_destination_at_end = True
    else:
        # If there is only one operand, assume it is a source operand.
        has_single_destination_at_end = arity > 1

    if has_single_destination_at_end:
        # It is important to reverse the operands, we cannot just move the first one last.
        # This makes a difference for instructions with 3 operands or more, such as
        # roundsd: the model files expect the rounding mode (an immediate) first but the
        # Intel syntax has it last.
        instruction_form.operands.reverse()

    # A hack to help with comparison instructions: if the instruction is in the model, has
    # exactly two sources and is not found with the current operand order, swap its
    # operands.
    if (
        model
        and not has_destination
        and arity == 2
        and not isa_model.get_instruction(mnemonic, instruction_form.operands)
        and not arch_model.get_instruction(mnemonic, instruction_form.operands)
    ):
        instruction_form.operands.reverse()

    # If the instruction has a well-known data type, append a suffix.
    data_type_to_suffix = {"DWORD": "d", "QWORD": "q"}
    for operand in instruction_form.operands:
        if not (isinstance(operand, MemoryOperand) and operand.data_type):
            continue
        suffix = data_type_to_suffix.get(operand.data_type, None)
        if not suffix:
            continue
        suffixed_mnemonic = mnemonic + suffix
        if isa_model.get_instruction(
            suffixed_mnemonic, arity
        ) or arch_model.get_instruction(suffixed_mnemonic, arity):
            instruction_form.mnemonic = suffixed_mnemonic
            break
|
||||
|
||||
|
||||
def construct_parser(self):
    """
    Create the pyparsing grammar for x86 assembly in Intel syntax.

    The grammar elements needed by the other methods are stored on the instance:
    ``self.comment``, ``self.register``, ``self.instruction_parser``, ``self.label`` and
    ``self.directive``.
    """

    def keywords(*names):
        # First-match alternation of case-insensitive keywords.
        return pp.MatchFirst([pp.CaselessKeyword(name) for name in names])

    # Numeric literal.
    binary_number = pp.Combine(pp.Word("01") + pp.CaselessLiteral("B"))
    octal_number = pp.Combine(pp.Word("01234567") + pp.CaselessLiteral("O"))
    decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums))
    hex_number = pp.Combine(pp.Word(pp.hexnums) + pp.CaselessLiteral("H"))
    float_number = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Word(pp.nums) + pp.Word(".", pp.nums)
    ).setResultsName("value")
    integer_number = (
        binary_number ^ octal_number ^ decimal_number ^ hex_number
    ).setResultsName("value")

    # Comment.
    self.comment = pp.Word(";#", exact=1) + pp.Group(
        pp.ZeroOrMore(pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS))
    ).setResultsName(self.comment_id)

    # Types.
    data_type = keywords(
        "BYTE", "DWORD", "FWORD", "MMWORD", "OWORD", "QWORD", "REAL10", "REAL4", "REAL8",
        "SBYTE", "SDWORD", "SQWORD", "SWORD", "TBYTE", "WORD", "XMMWORD", "YMMWORD",
    ).setResultsName("data_type")

    # Identifier. Note that $ is not mentioned in the ASM386 Assembly Language Reference,
    # but it is mentioned in the MASM syntax. < and > apparently show up in C++ mangled
    # names. ICC allows ".", at least in labels.
    first = pp.Word(pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>", exact=1)
    rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    identifier = pp.Group(
        pp.Combine(first + pp.Optional(rest)).setResultsName("name")
    ).setResultsName("identifier")

    # Register.
    # This started from the MASM grammar, whose register lists are incomplete for 64-bit
    # code (e.g. R10D, R15W, SIL, R15B and the ZMM registers are missing). The missing
    # registers are listed here as well; otherwise they would be parsed as identifiers.
    special_register = keywords(
        "CR0", "CR2", "CR3",
        "DR0", "DR1", "DR2", "DR3", "DR6", "DR7",
        "TR3", "TR4", "TR5", "TR6", "TR7",
    ).setResultsName("name")
    gp_register = keywords(
        "AX", "EAX", "CX", "ECX", "DX", "EDX", "BX", "EBX",
        "DI", "EDI", "SI", "ESI", "BP", "EBP", "SP", "ESP",
        "R8W", "R8D", "R9W", "R9D", "R10W", "R10D", "R11W", "R11D",
        "R12W", "R12D", "R13W", "R13D", "R14W", "R14D", "R15W", "R15D",
    ).setResultsName("name")
    byte_register = keywords(
        "AL", "AH", "CL", "CH", "DL", "DH", "BL", "BH",
        "SIL", "DIL", "BPL", "SPL",
        "R8B", "R9B", "R10B", "R11B", "R12B", "R13B", "R14B", "R15B",
    ).setResultsName("name")
    qword_register = keywords(
        "RAX", "RCX", "RDX", "RBX", "RSP", "RBP", "RSI", "RDI",
        "R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15",
    ).setResultsName("name")
    fpu_register = pp.Combine(
        pp.CaselessKeyword("ST")
        + pp.Optional(pp.Literal("(") + pp.Word("01234567") + pp.Literal(")"))
    ).setResultsName("name")
    # Any run of digits is accepted after the register prefix, so no separate alternative
    # is needed for the two-digit register numbers.
    xmm_register = pp.Combine(pp.CaselessLiteral("XMM") + pp.Word(pp.nums))
    simd_register = (
        pp.Combine(pp.CaselessLiteral("MM") + pp.Word("01234567"))
        | xmm_register
        | pp.Combine(pp.CaselessLiteral("YMM") + pp.Word(pp.nums))
        | pp.Combine(pp.CaselessLiteral("ZMM") + pp.Word(pp.nums))
    ).setResultsName("name")
    segment_register = keywords("CS", "DS", "ES", "FS", "GS", "SS").setResultsName("name")
    self.register = pp.Group(
        special_register
        | gp_register
        | byte_register
        | qword_register
        | fpu_register
        | simd_register
        | segment_register
        # Like every other alternative, RIP must be stored under "name": that is the
        # only field read when the register operand is built.
        | pp.CaselessKeyword("RIP").setResultsName("name")
    ).setResultsName(self.register_id)

    # Register expressions.
    base_register = self.register
    index_register = self.register
    scale = pp.Word("1248", exact=1)
    # NOTE(review): "+" binds tighter than "|", so this accepts either a signed integer or
    # a bare identifier without sign. The intent was possibly a sign followed by either of
    # them; the original precedence is kept here and only made explicit.
    post_displacement = pp.Group(
        ((pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign") + integer_number)
        | identifier
    ).setResultsName(self.immediate_id)
    pre_displacement = pp.Group(
        integer_number + pp.Literal("+")
    ).setResultsName(self.immediate_id)
    indexed = pp.Group(
        index_register.setResultsName("index")
        + pp.Optional(pp.Literal("*") + scale.setResultsName("scale"))
    ).setResultsName("indexed")
    register_expression = pp.Group(
        pp.Literal("[")
        + pp.Optional(pp.Group(pre_displacement).setResultsName("pre_displacement"))
        + pp.Group(
            base_register.setResultsName("base")
            ^ pp.Group(
                base_register.setResultsName("base") + pp.Literal("+") + indexed
            ).setResultsName("base_and_indexed")
            ^ indexed
        ).setResultsName("non_displacement")
        + pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement"))
        + pp.Literal("]")
    ).setResultsName("register_expression")

    # Immediate.
    immediate = pp.Group(
        integer_number | float_number | identifier
    ).setResultsName(self.immediate_id)

    # Expressions.
    # The ASM86 manual has weird expressions on page 130 (displacement outside of the
    # register expression, multiple register expressions). Let's ignore those for now, but
    # see
    # https://stackoverflow.com/questions/71540754/why-sometimes-use-offset-flatlabel-and-sometimes-not.
    address_expression = pp.Group(
        self.register.setResultsName("segment") + pp.Literal(":") + immediate
        ^ immediate + register_expression
        ^ register_expression
        ^ identifier + pp.Optional(pp.Literal("+") + immediate)
    ).setResultsName("address_expression")

    offset_expression = pp.Group(
        pp.CaselessKeyword("OFFSET")
        + pp.Group(keywords("GROUP", "SEGMENT", "FLAT"))
        # The MASM grammar has the ":" immediately after "OFFSET", but that's not what
        # MSVC outputs.
        + pp.Literal(":")
        + identifier.setResultsName("identifier")
        + pp.Optional(pp.Literal("+") + immediate.setResultsName("displacement"))
    ).setResultsName("offset_expression")
    ptr_expression = pp.Group(
        data_type + pp.CaselessKeyword("PTR") + address_expression
    ).setResultsName("ptr_expression")
    short_expression = pp.Group(
        pp.CaselessKeyword("SHORT") + identifier
    ).setResultsName("short_expression")

    # Instructions.
    mnemonic = pp.Word(pp.alphas, pp.alphanums).setResultsName("mnemonic")
    operand = pp.Group(
        self.register
        | pp.Group(
            offset_expression
            | ptr_expression
            | short_expression
            | address_expression
        ).setResultsName(self.memory_id)
        | immediate
    )
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand.setResultsName("operand1"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand2"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand3"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand4"))
        + pp.Optional(self.comment)
    )

    # Label. An instruction may follow the label on the same line.
    self.label = pp.Group(
        identifier.setResultsName("name")
        + pp.Literal(":")
        + pp.Optional(self.instruction_parser)
        + pp.Optional(self.comment)
    ).setResultsName(self.label_id)

    # Directives.
    # The identifiers at the beginning of a directive cannot start with a "." otherwise we
    # end up with ambiguities.
    directive_first = pp.Word(
        pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + "$?@_<>", exact=1
    )
    directive_rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    directive_identifier = pp.Group(
        pp.Combine(directive_first + pp.Optional(directive_rest)).setResultsName("name")
    ).setResultsName("identifier")

    # Parameter can be any quoted string or sequence of characters besides ';' (for
    # comments) or ',' (parameter delimiter). See ASM386 p. 38.
    directive_parameter = (
        pp.quotedString
        ^ (
            pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS, excludeChars=",;")
            + pp.Optional(pp.Suppress(pp.Literal(",")))
        )
        ^ pp.Suppress(pp.Literal(","))
    )
    # The directives that don't start with a "." are ambiguous with instructions, so we
    # list them explicitly.
    # TODO: The directives that are types (BYTE, DWORD, FWORD, MMWORD, QWORD, REAL10,
    # REAL4, REAL8, SBYTE, SDWORD, SWORD, TBYTE, WORD, XMMWORD, YMMWORD) introduce a nasty
    # ambiguity with instructions. Skip them for now, apparently the MSVC output uses the
    # short D? directives.
    directive_keywords = keywords(
        "ALIAS", "ALIGN", "ASSUME", "CATSTR", "COMM", "COMMENT",
        "DB", "DD", "DF", "DQ", "DT", "DW",
        "ECHO", "END", "ENDP", "ENDS", "EQU", "EVEN", "EXTRN", "EXTERNDEF",
        "GROUP", "INCLUDE", "INCLUDELIB", "INSTR", "INVOKE", "LABEL",
        "OPTION", "ORG", "PAGE", "POPCONTEXT", "PROC", "PROTO", "PUBLIC", "PUSHCONTEXT",
        "RECORD", "SEGMENT", "SIZESTR", "STRUCT", "SUBSTR", "SUBTITLE",
        "TEXTEQU", "TITLE", "TYPEDEF", "UNION",
    )
    self.directive = pp.Group(
        pp.Optional(~directive_keywords + directive_identifier)
        + (
            pp.Combine(pp.Literal(".") + pp.Word(pp.alphanums + "_"))
            | pp.Literal("=")
            | directive_keywords
        ).setResultsName("name")
        + pp.ZeroOrMore(directive_parameter).setResultsName("parameters")
        + pp.Optional(self.comment)
    ).setResultsName(self.directive_id)
|
||||
|
||||
def parse_line(self, line, line_number=None):
    """
    Parse line and return instruction form.

    The line is tried, in this order, as a comment, a label (optionally followed by an
    instruction on the same line), a directive and finally a plain instruction.

    :param str line: line of assembly code
    :param line_number: default None, identifier of instruction form
    :type line_number: int, optional
    :return: parsed asm line (comment, label, directive or instruction form)
    :raises ValueError: if the line matches none of the above
    """
    instruction_form = InstructionForm(line=line, line_number=line_number)
    result = None

    # 1. Parse comment.
    try:
        result = self.process_operand(self.comment.parseString(line, parseAll=True))
        instruction_form.comment = " ".join(result[self.comment_id])
    except pp.ParseException:
        pass

    # 2. Parse label.
    if not result:
        try:
            # Returns a tuple with the label operand and the instruction form of the
            # instruction that follows the label on the same line, if any.
            result = self.process_operand(self.label.parseString(line, parseAll=True))
            instruction_form.label = result[0].name
            labelled_instruction = result[1]
            if labelled_instruction is not None:
                # The second element is an instruction form, not a comment: take over
                # its fields exactly like for a plain instruction below.
                instruction_form.mnemonic = labelled_instruction.mnemonic
                instruction_form.operands = labelled_instruction.operands
                instruction_form.comment = labelled_instruction.comment
        except pp.ParseException:
            pass

    # 3. Parse directive.
    if not result:
        try:
            # Returns tuple with directive operand and comment, if any.
            result = self.process_operand(self.directive.parseString(line, parseAll=True))
            instruction_form.directive = result[0]
            if result[1]:
                instruction_form.comment = " ".join(result[1])
        except pp.ParseException:
            pass

    # 4. Parse instruction.
    if not result:
        try:
            result = self.parse_instruction(line)
        except pp.ParseException as e:
            raise ValueError(
                "Could not parse instruction on line {}: {!r}".format(line_number, line)
            ) from e
        instruction_form.mnemonic = result.mnemonic
        instruction_form.operands = result.operands
        instruction_form.comment = result.comment
    return instruction_form
|
||||
|
||||
def make_instruction(self, parse_result):
    """
    Build an instruction form from the tokens of a parsed instruction.

    :param parse_result: tuple resulting from calling `parseString` on the
                         `instruction_parser`.
    :returns: parsed instruction form
    """
    # Up to four operands are supported; collect the ones that are present, in order.
    operands = [
        self.process_operand(getattr(parse_result, operand_key))
        for operand_key in ("operand1", "operand2", "operand3", "operand4")
        if operand_key in parse_result
    ]
    if self.comment_id in parse_result:
        comment = " ".join(parse_result[self.comment_id])
    else:
        comment = None
    return InstructionForm(
        mnemonic=parse_result.mnemonic,
        operands=operands,
        label_id=None,
        comment_id=comment,
    )
|
||||
|
||||
def parse_instruction(self, instruction):
    """
    Parse instruction in asm line.

    :param str instruction: Assembly line string.
    :returns: parsed instruction form
    """
    tokens = self.instruction_parser.parseString(instruction, parseAll=True)
    return self.make_instruction(tokens)
|
||||
|
||||
def parse_register(self, register_string):
    """Parse a register string; return None if it is not a register."""
    try:
        tokens = self.register.parseString(register_string, parseAll=True)
        return self.process_operand(tokens)
    except pp.ParseException:
        return None
|
||||
|
||||
def process_operand(self, operand):
    """
    Post-process operand.

    The first known operand kind found in `operand` decides which handler is used;
    anything else is returned unchanged.
    """
    handlers = (
        (self.directive_id, self.process_directive),
        (self.identifier, self.process_identifier),
        (self.immediate_id, self.process_immediate),
        (self.label_id, self.process_label),
        (self.memory_id, self.process_memory_address),
        (self.register_id, self.process_register),
    )
    for kind, handler in handlers:
        if kind in operand:
            return handler(operand[kind])
    return operand
|
||||
|
||||
def process_directive(self, directive):
    """
    Post-process a directive.

    :returns: tuple of the directive operand and the comment of the directive, if any
    """
    # TODO: This is putting the identifier in the parameters. No idea if it's right.
    parameters = []
    if "identifier" in directive:
        parameters.append(directive.identifier.name)
    parameters.extend(directive.parameters)
    directive_new = DirectiveOperand(
        name=directive.name,
        parameters=parameters or None
    )
    if directive.name == "=":
        # Interpret the "=" directives because the generated assembly is full of symbols
        # that are defined there: remember the value under the symbol name.
        symbol, value = parameters[0], parameters[1]
        self._equ[symbol] = value
    return directive_new, directive.get("comment")
|
||||
|
||||
def process_register(self, operand):
    """Build a register operand from the name found in the parsed register."""
    register_name = operand.name
    return RegisterOperand(name=register_name)
|
||||
|
||||
def process_register_expression(self, register_expression):
    """
    Build a memory operand from a bracketed register expression.

    The expression may contain a base register, an index register with an optional scale
    and a displacement written before or after the registers. When both displacements are
    present the one after the registers is used.
    """
    pre_displacement = register_expression.get("pre_displacement")
    post_displacement = register_expression.get("post_displacement")
    non_displacement = register_expression.get("non_displacement")

    # Find the base register and/or the indexed part.
    base = None
    indexed = None
    if non_displacement:
        base_and_indexed = non_displacement.get("base_and_indexed")
        if base_and_indexed:
            base = base_and_indexed.get("base")
            indexed = base_and_indexed.get("indexed")
        else:
            base = non_displacement.get("base")
            if not base:
                indexed = non_displacement.get("indexed")

    # Without an indexed part there is no index register and the scale is 1.
    index = None
    scale = 1
    if indexed:
        index = indexed.get("index")
        scale = int(indexed.get("scale", "1"), 0)

    displacement_op = None
    if pre_displacement:
        displacement_op = self.process_immediate(pre_displacement.immediate)
    if post_displacement:
        displacement_op = self.process_immediate(post_displacement.immediate)

    base_op = RegisterOperand(name=base.name) if base else None
    index_op = RegisterOperand(name=index.name) if index else None
    return MemoryOperand(offset=displacement_op, base=base_op, index=index_op, scale=scale)
|
||||
|
||||
def process_address_expression(self, address_expression, data_type=None):
    """
    Build the operand described by an address expression.

    :param address_expression: parsed address expression
    :param data_type: data type of the enclosing PTR expression, if any
    :returns: a memory operand, or the bare identifier operand when the expression is
              just an identifier without offset and without data type
    """
    # TODO: It seems that we could have a prefix immediate operand, a displacement in the
    # brackets, and an offset. How all of this works together is somewhat mysterious.
    immediate_operand = None
    if "immediate" in address_expression:
        immediate_operand = self.process_immediate(address_expression.immediate)
    register_expression = None
    if "register_expression" in address_expression:
        register_expression = self.process_register_expression(
            address_expression.register_expression
        )
    segment = None
    if "segment" in address_expression:
        segment = self.process_register(address_expression.segment)
    identifier = None
    if "identifier" in address_expression:
        identifier = self.process_identifier(address_expression.identifier)

    if register_expression:
        # An immediate in front of the brackets replaces the offset of the expression.
        if immediate_operand:
            register_expression.offset = immediate_operand
        if data_type:
            register_expression.data_type = data_type
        return register_expression

    if segment:
        return MemoryOperand(base=segment, offset=immediate_operand, data_type=data_type)

    if identifier:
        if immediate_operand:
            identifier.offset = immediate_operand
        elif not data_type:
            # An address expression without a data type or an offset is just an
            # identifier. This matters for jumps.
            return identifier
        return MemoryOperand(offset=identifier, data_type=data_type)

    return MemoryOperand(base=immediate_operand, data_type=data_type)
|
||||
|
||||
def process_offset_expression(self, offset_expression):
    """Build a memory operand whose offset is the identifier of an OFFSET expression."""
    # TODO: Record that this is an offset expression.
    displacement = None
    if "displacement" in offset_expression:
        displacement = self.process_immediate(offset_expression.displacement)
    identifier = self.process_identifier(offset_expression.identifier)
    # The optional displacement after the identifier is attached to the identifier.
    identifier.offset = displacement
    return MemoryOperand(offset=identifier)
|
||||
|
||||
def process_ptr_expression(self, ptr_expression):
    """Process the address of a PTR expression, passing its data type along."""
    # TODO: Do something with the data_type.
    address = ptr_expression.address_expression
    data_type = ptr_expression.data_type
    return self.process_address_expression(address, data_type)
|
||||
|
||||
def process_short_expression(self, short_expression):
    """Build a label operand from the identifier of a SHORT expression."""
    # TODO: Do something with the fact that it is short.
    target_name = short_expression.identifier.name
    return LabelOperand(name=target_name)
|
||||
|
||||
def process_memory_address(self, memory_address):
    """
    Post-process memory address operand.

    The first kind of expression found in `memory_address` is handed to the matching
    ``process_<kind>`` method; a memory address without a known kind is returned unchanged.
    """
    kinds = (
        "address_expression",
        "offset_expression",
        "ptr_expression",
        "short_expression",
    )
    for kind in kinds:
        if kind in memory_address:
            handler = getattr(self, "process_" + kind)
            return handler(getattr(memory_address, kind))
    return memory_address
|
||||
|
||||
def process_label(self, label):
    """
    Post-process label asm line.

    :returns: tuple of the label operand and the instruction form of the instruction that
              follows the label on the same line, or None if there is none
    """
    # Remove duplicated 'name' level due to identifier. Note that there is no place to put
    # the comment, if any.
    label["name"] = label["name"]["name"]
    label_operand = LabelOperand(name=label.name)
    instruction = None
    if "mnemonic" in label:
        instruction = self.make_instruction(label)
    return (label_operand, instruction)
|
||||
|
||||
def process_immediate(self, immediate):
    """Post-process immediate operand; its value is normalized to a number."""
    if "identifier" in immediate:
        # Actually an identifier, change declaration.
        return self.process_identifier(immediate.identifier)
    # The sign, if any, was parsed separately from the digits.
    signed_text = immediate.get("sign", "") + immediate.value
    new_immediate = ImmediateOperand(value=signed_text)
    new_immediate.value = self.normalize_imd(new_immediate)
    return new_immediate
|
||||
|
||||
def process_identifier(self, identifier):
    """
    Post-process an identifier.

    An identifier that was defined by an "=" directive becomes an immediate operand with
    the recorded value; any other identifier becomes an identifier operand.
    """
    name = identifier.name
    if name not in self._equ:
        return IdentifierOperand(name=name)
    # Actually an immediate, change declaration.
    new_immediate = ImmediateOperand(identifier=name, value=self._equ[name])
    new_immediate.value = self.normalize_imd(new_immediate)
    return new_immediate
|
||||
|
||||
def normalize_imd(self, imd):
    """
    Normalize immediate to decimal based representation.

    A textual value containing a "." is converted to a float. Any other textual value is
    read as an integer with an optional leading "+" or "-" and an optional radix suffix:
    "B" (binary), "O" (octal) or "H" (hexadecimal), in upper or lower case; without suffix
    the value is decimal. A value that is not a string is returned unchanged.

    :param imd: operand whose `value` is normalized
    :returns: the numeric value
    :raises ValueError: if a digit is not valid for the radix
    """
    text = imd.value
    if not isinstance(text, str):
        return text
    if '.' in text:
        return float(text)
    # The suffix is matched case-insensitively: values recorded from "=" directives are
    # the raw source text, which may use a lower-case suffix such as "0ffh".
    base = {'B': 2, 'O': 8, 'H': 16}.get(text[-1].upper(), 10)
    negative = text[0] == '-'
    start = 1 if text[0] in '+-' else 0
    # Only the non-decimal notations have a suffix to strip.
    digits = text[start:] if base == 10 else text[start:-1]
    value = 0
    for digit in digits:
        value = value * base + int(digit, base)
    return -value if negative else value
|
||||
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Semantics object responsible for architecture specific semantic operations"""
|
||||
|
||||
from dis import Instruction
|
||||
import sys
|
||||
import warnings
|
||||
from itertools import chain
|
||||
@@ -14,12 +15,24 @@ from osaca.parser.register import RegisterOperand
|
||||
|
||||
|
||||
class ArchSemantics(ISASemantics):
|
||||
GAS_SUFFIXES = "bswlqt"
|
||||
|
||||
def __init__(self, machine_model: MachineModel, path_to_yaml=None):
|
||||
super().__init__(machine_model.get_ISA().lower(), path_to_yaml=path_to_yaml)
|
||||
def __init__(self, parser, machine_model: MachineModel, path_to_yaml=None):
|
||||
super().__init__(parser, path_to_yaml=path_to_yaml)
|
||||
self._machine_model = machine_model
|
||||
self._isa = machine_model.get_ISA().lower()
|
||||
|
||||
def normalize_instruction_form(self, instruction_form):
    """Let the parser normalize `instruction_form` using the ISA and machine models."""
    normalize = self.parser.normalize_instruction_form
    normalize(instruction_form, self.isa_model, self._machine_model)
|
||||
|
||||
def normalize_instruction_forms(self, instruction_forms):
    """Normalize every instruction form of `instruction_forms`, in order."""
    for form in instruction_forms:
        self.normalize_instruction_form(form)
|
||||
|
||||
def _check_normalized(self, instruction_forms):
    """Call `check_normalized` on every instruction form of `instruction_forms`."""
    for form in instruction_forms:
        form.check_normalized()
|
||||
|
||||
# SUMMARY FUNCTION
|
||||
def add_semantics(self, kernel):
|
||||
@@ -29,6 +42,7 @@ class ArchSemantics(ISASemantics):
|
||||
|
||||
:param list kernel: kernel to apply semantics
|
||||
"""
|
||||
self._check_normalized(kernel)
|
||||
for instruction_form in kernel:
|
||||
self.assign_src_dst(instruction_form)
|
||||
self.assign_tp_lt(instruction_form)
|
||||
@@ -41,6 +55,7 @@ class ArchSemantics(ISASemantics):
|
||||
|
||||
:param list kernel: kernel to apply optimal port utilization
|
||||
"""
|
||||
self._check_normalized(kernel)
|
||||
INC = 0.01
|
||||
kernel.reverse()
|
||||
port_list = self._machine_model.get_ports()
|
||||
@@ -137,6 +152,7 @@ class ArchSemantics(ISASemantics):
|
||||
|
||||
def set_hidden_loads(self, kernel):
|
||||
"""Hide loads behind stores if architecture supports hidden loads (depricated)"""
|
||||
self._check_normalized(kernel)
|
||||
loads = [instr for instr in kernel if INSTR_FLAGS.HAS_LD in instr.flags]
|
||||
stores = [instr for instr in kernel if INSTR_FLAGS.HAS_ST in instr.flags]
|
||||
# Filter instructions including load and store
|
||||
@@ -176,6 +192,7 @@ class ArchSemantics(ISASemantics):
|
||||
# mark instruction form with semantic flags
|
||||
def assign_tp_lt(self, instruction_form):
|
||||
"""Assign throughput and latency to an instruction form."""
|
||||
instruction_form.check_normalized()
|
||||
flags = []
|
||||
port_number = len(self._machine_model["ports"])
|
||||
if instruction_form.mnemonic is None:
|
||||
@@ -189,25 +206,6 @@ class ArchSemantics(ISASemantics):
|
||||
instruction_data = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic, instruction_form.operands
|
||||
)
|
||||
if (
|
||||
not instruction_data
|
||||
and self._isa == "x86"
|
||||
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
|
||||
):
|
||||
# check for instruction without GAS suffix
|
||||
instruction_data = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic[:-1], instruction_form.operands
|
||||
)
|
||||
if (
|
||||
instruction_data is None
|
||||
and self._isa == "aarch64"
|
||||
and "." in instruction_form.mnemonic
|
||||
):
|
||||
# Check for instruction without shape/cc suffix
|
||||
suffix_start = instruction_form.mnemonic.index(".")
|
||||
instruction_data = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic[:suffix_start], instruction_form.operands
|
||||
)
|
||||
if instruction_data:
|
||||
# instruction form in DB
|
||||
(
|
||||
@@ -232,25 +230,6 @@ class ArchSemantics(ISASemantics):
|
||||
instruction_data_reg = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic, operands
|
||||
)
|
||||
if (
|
||||
not instruction_data_reg
|
||||
and self._isa == "x86"
|
||||
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
|
||||
):
|
||||
# check for instruction without GAS suffix
|
||||
instruction_data_reg = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic[:-1], operands
|
||||
)
|
||||
if (
|
||||
instruction_data_reg is None
|
||||
and self._isa == "aarch64"
|
||||
and "." in instruction_form.mnemonic
|
||||
):
|
||||
# Check for instruction without shape/cc suffix
|
||||
suffix_start = instruction_form.mnemonic.index(".")
|
||||
instruction_data_reg = self._machine_model.get_instruction(
|
||||
instruction_form.mnemonic[:suffix_start], operands
|
||||
)
|
||||
if instruction_data_reg:
|
||||
assign_unknown = False
|
||||
reg_type = self._parser.get_reg_type(
|
||||
@@ -310,7 +289,7 @@ class ArchSemantics(ISASemantics):
|
||||
# - all mem operands in src_dst are pre-/post_indexed
|
||||
# since it is no mem store
|
||||
if (
|
||||
self._isa == "aarch64"
|
||||
self._parser.isa() == "aarch64"
|
||||
and not isinstance(
|
||||
instruction_form.semantic_operands["destination"],
|
||||
MemoryOperand,
|
||||
@@ -406,6 +385,7 @@ class ArchSemantics(ISASemantics):
|
||||
|
||||
def _handle_instruction_found(self, instruction_data, port_number, instruction_form, flags):
|
||||
"""Apply performance data to instruction if it was found in the archDB"""
|
||||
instruction_form.check_normalized()
|
||||
throughput = instruction_data.throughput
|
||||
port_pressure = self._machine_model.average_port_pressure(instruction_data.port_pressure)
|
||||
instruction_form.port_uops = instruction_data.port_pressure
|
||||
@@ -441,12 +421,12 @@ class ArchSemantics(ISASemantics):
|
||||
|
||||
def convert_op_to_reg(self, reg_type, regtype="0"):
|
||||
"""Create register operand for a memory addressing operand"""
|
||||
if self._isa == "x86":
|
||||
if self._parser.isa() == "x86":
|
||||
if reg_type == "gpr":
|
||||
register = RegisterOperand(name="r" + str(int(regtype) + 9))
|
||||
else:
|
||||
register = RegisterOperand(name=reg_type + regtype)
|
||||
elif self._isa == "aarch64":
|
||||
elif self._parser.isa() == "aarch64":
|
||||
register = RegisterOperand(name=regtype, prefix=reg_type)
|
||||
return register
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ from pathlib import Path
|
||||
|
||||
import ruamel.yaml
|
||||
from osaca import __version__, utils
|
||||
from osaca.parser import ParserX86ATT
|
||||
from osaca.parser.instruction_form import InstructionForm
|
||||
from osaca.parser.operand import Operand
|
||||
from osaca.parser.memory import MemoryOperand
|
||||
@@ -79,7 +78,7 @@ class MachineModel(object):
|
||||
else:
|
||||
yaml = self._create_yaml_object()
|
||||
# otherwise load
|
||||
with open(self._path, "r") as f:
|
||||
with open(self._path, "r", encoding="utf8") as f:
|
||||
if not lazy:
|
||||
self._data = yaml.load(f)
|
||||
else:
|
||||
@@ -286,23 +285,38 @@ class MachineModel(object):
|
||||
######################################################
|
||||
|
||||
def get_instruction(self, name, operands):
|
||||
"""Find and return instruction data from name and operands."""
|
||||
"""Find and return instruction data from name and operands/arity."""
|
||||
# For use with dict instead of list as DB
|
||||
if name is None:
|
||||
return None
|
||||
name_matched_iforms = self._data["instruction_forms_dict"].get(name.upper(), [])
|
||||
|
||||
try:
|
||||
return next(
|
||||
instruction_form
|
||||
for instruction_form in name_matched_iforms
|
||||
if self._match_operands(
|
||||
instruction_form.operands,
|
||||
operands,
|
||||
# If `operands` is an integer, it represents the arity of the instruction. This is
|
||||
# useful to reorder the operands in the Intel syntax because in their original order
|
||||
# they may not match the model.
|
||||
if isinstance(operands, int):
|
||||
arity = operands
|
||||
return next(
|
||||
(
|
||||
instruction_form
|
||||
for instruction_form in name_matched_iforms
|
||||
if len(instruction_form.operands) == arity
|
||||
),
|
||||
None
|
||||
)
|
||||
else:
|
||||
return next(
|
||||
(
|
||||
instruction_form
|
||||
for instruction_form in name_matched_iforms
|
||||
if self._match_operands(
|
||||
instruction_form.operands,
|
||||
operands
|
||||
)
|
||||
),
|
||||
None
|
||||
)
|
||||
)
|
||||
except StopIteration:
|
||||
return None
|
||||
except TypeError as e:
|
||||
print("\nname: {}\noperands: {}".format(name, operands))
|
||||
raise TypeError from e
|
||||
@@ -878,6 +892,7 @@ class MachineModel(object):
|
||||
return True
|
||||
|
||||
def _is_x86_reg_type(self, i_reg, reg, consider_masking=False):
|
||||
from osaca.parser import ParserX86
|
||||
"""Check if register type match."""
|
||||
if reg is None:
|
||||
if i_reg is None:
|
||||
@@ -895,7 +910,7 @@ class MachineModel(object):
|
||||
if i_reg_name == self.WILDCARD or reg.name == self.WILDCARD:
|
||||
return True
|
||||
# differentiate between vector registers (mm, xmm, ymm, zmm) and others (gpr)
|
||||
parser_x86 = ParserX86ATT()
|
||||
parser_x86 = ParserX86()
|
||||
if parser_x86.is_vector_register(reg):
|
||||
if reg.name.rstrip(string.digits).lower() == i_reg_name:
|
||||
# Consider masking and zeroing for AVX512
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
from itertools import chain
|
||||
|
||||
from osaca import utils
|
||||
from osaca.parser import ParserAArch64, ParserX86ATT
|
||||
from osaca.parser.memory import MemoryOperand
|
||||
from osaca.parser.operand import Operand
|
||||
from osaca.parser.register import RegisterOperand
|
||||
@@ -26,20 +25,23 @@ class INSTR_FLAGS:
|
||||
|
||||
|
||||
class ISASemantics(object):
|
||||
GAS_SUFFIXES = "bswlqt"
|
||||
|
||||
def __init__(self, isa, path_to_yaml=None):
|
||||
self._isa = isa.lower()
|
||||
path = path_to_yaml or utils.find_datafile("isa/" + self._isa + ".yml")
|
||||
def __init__(self, parser, path_to_yaml=None):
|
||||
path = path_to_yaml or utils.find_datafile("isa/" + parser.isa() + ".yml")
|
||||
self._isa_model = MachineModel(path_to_yaml=path)
|
||||
if self._isa == "x86":
|
||||
self._parser = ParserX86ATT()
|
||||
elif self._isa == "aarch64":
|
||||
self._parser = ParserAArch64()
|
||||
self._parser = parser
|
||||
|
||||
@property
def parser(self):
    """Return the parser object stored in ``self._parser``."""
    return self._parser
|
||||
|
||||
@property
def isa_model(self):
    """Return the ISA model object stored in ``self._isa_model``."""
    return self._isa_model
|
||||
|
||||
def process(self, instruction_forms):
    """
    Process a list of instruction forms.

    For each instruction form, in order, ``check_normalized()`` is called on it first and
    then it is passed to ``assign_src_dst``.

    :param instruction_forms: iterable of instruction forms to process
    """
    for form in instruction_forms:
        form.check_normalized()
        self.assign_src_dst(form)
|
||||
|
||||
# get parser result and assign operands to
|
||||
@@ -48,6 +50,7 @@ class ISASemantics(object):
|
||||
# - source/destination
|
||||
def assign_src_dst(self, instruction_form):
|
||||
"""Update instruction form dictionary with source, destination and flag information."""
|
||||
instruction_form.check_normalized()
|
||||
# if the instruction form doesn't have operands or is None, there's nothing to do
|
||||
if instruction_form.operands is None or instruction_form.mnemonic is None:
|
||||
instruction_form.semantic_operands = {"source": [], "destination": [], "src_dst": []}
|
||||
@@ -57,21 +60,6 @@ class ISASemantics(object):
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic, instruction_form.operands
|
||||
)
|
||||
if (
|
||||
isa_data is None
|
||||
and self._isa == "x86"
|
||||
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
|
||||
):
|
||||
# Check for instruction without GAS suffix
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:-1], instruction_form.operands
|
||||
)
|
||||
if isa_data is None and self._isa == "aarch64" and "." in instruction_form.mnemonic:
|
||||
# Check for instruction without shape/cc suffix
|
||||
suffix_start = instruction_form.mnemonic.index(".")
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:suffix_start], instruction_form.operands
|
||||
)
|
||||
operands = instruction_form.operands
|
||||
op_dict = {}
|
||||
|
||||
@@ -88,36 +76,17 @@ class ISASemantics(object):
|
||||
isa_data_reg = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic, operands_reg
|
||||
)
|
||||
if (
|
||||
isa_data_reg is None
|
||||
and self._isa == "x86"
|
||||
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
|
||||
):
|
||||
# Check for instruction without GAS suffix
|
||||
isa_data_reg = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:-1], operands_reg
|
||||
)
|
||||
if (
|
||||
isa_data_reg is None
|
||||
and self._isa == "aarch64"
|
||||
and "." in instruction_form.mnemonic
|
||||
):
|
||||
# Check for instruction without shape/cc suffix
|
||||
suffix_start = instruction_form.mnemonic.index(".")
|
||||
isa_data_reg = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:suffix_start], operands_reg
|
||||
)
|
||||
if isa_data_reg:
|
||||
assign_default = False
|
||||
op_dict = self._apply_found_ISA_data(isa_data_reg, operands)
|
||||
|
||||
if assign_default:
|
||||
# no irregular operand structure, apply default
|
||||
op_dict["source"] = self._get_regular_source_operands(instruction_form)
|
||||
op_dict["destination"] = self._get_regular_destination_operands(instruction_form)
|
||||
op_dict["source"] = self._parser.get_regular_source_operands(instruction_form)
|
||||
op_dict["destination"] = self._parser.get_regular_destination_operands(instruction_form)
|
||||
op_dict["src_dst"] = []
|
||||
# post-process pre- and post-indexing for aarch64 memory operands
|
||||
if self._isa == "aarch64":
|
||||
if self._parser.isa() == "aarch64":
|
||||
for operand in [op for op in op_dict["source"] if isinstance(op, MemoryOperand)]:
|
||||
post_indexed = operand.post_indexed
|
||||
pre_indexed = operand.pre_indexed
|
||||
@@ -161,6 +130,7 @@ class ISASemantics(object):
|
||||
Empty dict if no changes of registers occurred. None for registers with unknown changes.
|
||||
If only_postindexed is True, only considers changes due to post_indexed memory references.
|
||||
"""
|
||||
instruction_form.check_normalized()
|
||||
if instruction_form.mnemonic is None:
|
||||
return {}
|
||||
dest_reg_names = [
|
||||
@@ -174,21 +144,6 @@ class ISASemantics(object):
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic, instruction_form.operands
|
||||
)
|
||||
if (
|
||||
isa_data is None
|
||||
and self._isa == "x86"
|
||||
and instruction_form.mnemonic[-1] in self.GAS_SUFFIXES
|
||||
):
|
||||
# Check for instruction without GAS suffix
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:-1], instruction_form.operands
|
||||
)
|
||||
if isa_data is None and self._isa == "aarch64" and "." in instruction_form.mnemonic:
|
||||
# Check for instruction without shape/cc suffix
|
||||
suffix_start = instruction_form.mnemonic.index(".")
|
||||
isa_data = self._isa_model.get_instruction(
|
||||
instruction_form.mnemonic[:suffix_start], instruction_form.operands
|
||||
)
|
||||
|
||||
if only_postindexed:
|
||||
for o in instruction_form.operands:
|
||||
@@ -301,6 +256,7 @@ class ISASemantics(object):
|
||||
|
||||
def _has_load(self, instruction_form):
|
||||
"""Check if instruction form performs a LOAD"""
|
||||
instruction_form.check_normalized()
|
||||
for operand in chain(
|
||||
instruction_form.semantic_operands["source"],
|
||||
instruction_form.semantic_operands["src_dst"],
|
||||
@@ -311,6 +267,7 @@ class ISASemantics(object):
|
||||
|
||||
def _has_store(self, instruction_form):
|
||||
"""Check if instruction form perfroms a STORE"""
|
||||
instruction_form.check_normalized()
|
||||
for operand in chain(
|
||||
instruction_form.semantic_operands["destination"],
|
||||
instruction_form.semantic_operands["src_dst"],
|
||||
@@ -319,33 +276,6 @@ class ISASemantics(object):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _get_regular_source_operands(self, instruction_form):
|
||||
"""Get source operand of given instruction form assuming regular src/dst behavior."""
|
||||
# if there is only one operand, assume it is a source operand
|
||||
if len(instruction_form.operands) == 1:
|
||||
return [instruction_form.operands[0]]
|
||||
if self._isa == "x86":
|
||||
# return all but last operand
|
||||
return [op for op in instruction_form.operands[0:-1]]
|
||||
elif self._isa == "aarch64":
|
||||
return [op for op in instruction_form.operands[1:]]
|
||||
else:
|
||||
raise ValueError("Unsupported ISA {}.".format(self._isa))
|
||||
|
||||
def _get_regular_destination_operands(self, instruction_form):
|
||||
"""Get destination operand of given instruction form assuming regular src/dst behavior."""
|
||||
# if there is only one operand, assume no destination
|
||||
if len(instruction_form.operands) == 1:
|
||||
return []
|
||||
if self._isa == "x86":
|
||||
# return last operand
|
||||
return instruction_form.operands[-1:]
|
||||
if self._isa == "aarch64":
|
||||
# return first operand
|
||||
return instruction_form.operands[:1]
|
||||
else:
|
||||
raise ValueError("Unsupported ISA {}.".format(self._isa))
|
||||
|
||||
def substitute_mem_address(self, operands):
|
||||
"""Create memory wildcard for all memory operands"""
|
||||
return [
|
||||
|
||||
@@ -38,7 +38,8 @@ class KernelDG(nx.DiGraph):
|
||||
self.kernel, timeout, flag_dependencies
|
||||
)
|
||||
|
||||
def _extend_path(self, dst_list, kernel, dg, offset):
|
||||
@classmethod
|
||||
def _extend_path(cls, dst_list, kernel, dg, offset):
|
||||
for instr in kernel:
|
||||
generator_path = nx.algorithms.simple_paths.all_simple_paths(
|
||||
dg, instr.line_number, instr.line_number + offset
|
||||
@@ -138,7 +139,7 @@ class KernelDG(nx.DiGraph):
|
||||
all_paths = manager.list()
|
||||
processes = [
|
||||
Process(
|
||||
target=self._extend_path,
|
||||
target=KernelDG._extend_path,
|
||||
args=(all_paths, instr_section, dg, offset),
|
||||
)
|
||||
for instr_section in instrs
|
||||
@@ -164,9 +165,7 @@ class KernelDG(nx.DiGraph):
|
||||
# terminate running processes
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
# Python 3.6 does not support Process.kill().
|
||||
# Can be changed to `p.kill()` after EoL (01/22) of Py3.6
|
||||
os.kill(p.pid, signal.SIGKILL)
|
||||
p.kill()
|
||||
p.join()
|
||||
all_paths = list(all_paths)
|
||||
else:
|
||||
@@ -186,11 +185,11 @@ class KernelDG(nx.DiGraph):
|
||||
for s, d in nx.utils.pairwise(path):
|
||||
edge_lat = dg.edges[s, d]["latency"]
|
||||
# map source node back to original line numbers
|
||||
if s >= offset:
|
||||
if s > offset:
|
||||
s -= offset
|
||||
lat_path.append((s, edge_lat))
|
||||
lat_sum += edge_lat
|
||||
if d >= offset:
|
||||
if d > offset:
|
||||
d -= offset
|
||||
lat_path.sort()
|
||||
|
||||
@@ -413,7 +412,7 @@ class KernelDG(nx.DiGraph):
|
||||
addr_change = 0
|
||||
if isinstance(src.offset, ImmediateOperand) and src.offset.value is not None:
|
||||
addr_change += src.offset.value
|
||||
if mem.offset:
|
||||
if isinstance(mem.offset, ImmediateOperand) and mem.offset.value is not None:
|
||||
addr_change -= mem.offset.value
|
||||
if mem.base and src.base:
|
||||
base_change = register_changes.get(
|
||||
|
||||
@@ -1,29 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
from collections import OrderedDict
|
||||
from enum import Enum
|
||||
from functools import partial
|
||||
|
||||
from osaca.parser import ParserAArch64, ParserX86ATT, get_parser
|
||||
from osaca.parser.register import RegisterOperand
|
||||
from osaca.parser.instruction_form import InstructionForm
|
||||
from osaca.parser.directive import DirectiveOperand
|
||||
from osaca.parser.identifier import IdentifierOperand
|
||||
from osaca.parser.immediate import ImmediateOperand
|
||||
from osaca.parser.memory import MemoryOperand
|
||||
from osaca.parser.register import RegisterOperand
|
||||
|
||||
COMMENT_MARKER = {"start": "OSACA-BEGIN", "end": "OSACA-END"}
|
||||
|
||||
# State of marker matching.
# No: we have determined that the code doesn't match the marker.
# Partial: so far the code matches the marker, but we have not reached the end of the marker yet.
# Full: the code matches all instructions in the marker.
class Matching(Enum):
    """Result of matching parsed code against a marker."""

    No = 0  # the code does not match the marker
    Partial = 1  # the code matches so far, but the end of the marker has not been reached
    Full = 2  # the code matches the whole marker
|
||||
|
||||
def reduce_to_section(kernel, isa):
|
||||
|
||||
def reduce_to_section(kernel, parser):
|
||||
"""
|
||||
Finds OSACA markers in given kernel and returns marked section
|
||||
|
||||
:param list kernel: kernel to check
|
||||
:param str isa: ISA of given kernel
|
||||
:param BaseParser parser: parser used to produce the kernel
|
||||
:returns: `list` -- marked section of kernel as list of instruction forms
|
||||
"""
|
||||
isa = isa.lower()
|
||||
if isa == "x86":
|
||||
start, end = find_marked_kernel_x86ATT(kernel)
|
||||
elif isa == "aarch64":
|
||||
start, end = find_marked_kernel_AArch64(kernel)
|
||||
else:
|
||||
raise ValueError("ISA not supported.")
|
||||
start, end = find_marked_section(kernel, parser, COMMENT_MARKER)
|
||||
if start == -1:
|
||||
start = 0
|
||||
if end == -1:
|
||||
@@ -31,109 +38,21 @@ def reduce_to_section(kernel, isa):
|
||||
return kernel[start:end]
|
||||
|
||||
|
||||
def find_marked_kernel_AArch64(lines):
|
||||
"""
|
||||
Find marked section for AArch64
|
||||
|
||||
:param list lines: kernel
|
||||
:returns: `tuple of int` -- start and end line of marked section
|
||||
"""
|
||||
nop_bytes = [213, 3, 32, 31]
|
||||
return find_marked_section(
|
||||
lines,
|
||||
ParserAArch64(),
|
||||
["mov"],
|
||||
"x1",
|
||||
[111, 222],
|
||||
nop_bytes,
|
||||
reverse=True,
|
||||
comments=COMMENT_MARKER,
|
||||
)
|
||||
|
||||
|
||||
def find_marked_kernel_x86ATT(lines):
|
||||
"""
|
||||
Find marked section for x86
|
||||
|
||||
:param list lines: kernel
|
||||
:returns: `tuple of int` -- start and end line of marked section
|
||||
"""
|
||||
nop_bytes = [100, 103, 144]
|
||||
return find_marked_section(
|
||||
lines,
|
||||
ParserX86ATT(),
|
||||
["mov", "movl"],
|
||||
"ebx",
|
||||
[111, 222],
|
||||
nop_bytes,
|
||||
comments=COMMENT_MARKER,
|
||||
)
|
||||
|
||||
|
||||
def get_marker(isa, comment=""):
|
||||
"""Return tuple of start and end marker lines."""
|
||||
isa = isa.lower()
|
||||
if isa == "x86":
|
||||
start_marker_raw = (
|
||||
"movl $111, %ebx # OSACA START MARKER\n"
|
||||
".byte 100 # OSACA START MARKER\n"
|
||||
".byte 103 # OSACA START MARKER\n"
|
||||
".byte 144 # OSACA START MARKER\n"
|
||||
)
|
||||
if comment:
|
||||
start_marker_raw += "# {}\n".format(comment)
|
||||
end_marker_raw = (
|
||||
"movl $222, %ebx # OSACA END MARKER\n"
|
||||
".byte 100 # OSACA END MARKER\n"
|
||||
".byte 103 # OSACA END MARKER\n"
|
||||
".byte 144 # OSACA END MARKER\n"
|
||||
)
|
||||
elif isa == "aarch64":
|
||||
start_marker_raw = (
|
||||
"mov x1, #111 // OSACA START MARKER\n"
|
||||
".byte 213,3,32,31 // OSACA START MARKER\n"
|
||||
)
|
||||
if comment:
|
||||
start_marker_raw += "// {}\n".format(comment)
|
||||
# After loop
|
||||
end_marker_raw = (
|
||||
"mov x1, #222 // OSACA END MARKER\n"
|
||||
".byte 213,3,32,31 // OSACA END MARKER\n"
|
||||
)
|
||||
|
||||
parser = get_parser(isa)
|
||||
start_marker = parser.parse_file(start_marker_raw)
|
||||
end_marker = parser.parse_file(end_marker_raw)
|
||||
|
||||
return start_marker, end_marker
|
||||
|
||||
|
||||
def find_marked_section(
|
||||
lines, parser, mov_instr, mov_reg, mov_vals, nop_bytes, reverse=False, comments=None
|
||||
):
|
||||
def find_marked_section(lines, parser, comments=None):
|
||||
"""
|
||||
Return indexes of marked section
|
||||
|
||||
:param list lines: kernel
|
||||
:param parser: parser to use for checking
|
||||
:type parser: :class:`~parser.BaseParser`
|
||||
:param mov_instr: all MOV instruction possible for the marker
|
||||
:type mov_instr: `list of str`
|
||||
:param mov_reg: register used for the marker
|
||||
:type mov_reg: `str`
|
||||
:param mov_vals: values needed to be moved to ``mov_reg`` for valid marker
|
||||
:type mov_vals: `list of int`
|
||||
:param nop_bytes: bytes representing opcode of NOP
|
||||
:type nop_bytes: `list of int`
|
||||
:param reverse: indicating if ISA syntax requires reverse operand order, defaults to `False`
|
||||
:type reverse: boolean, optional
|
||||
:param comments: dictionary with start and end markers in comment format, defaults to None
|
||||
:type comments: dict, optional
|
||||
:returns: `tuple of int` -- start and end line of marked section
|
||||
"""
|
||||
# TODO match to instructions returned by get_marker
|
||||
index_start = -1
|
||||
index_end = -1
|
||||
start_marker = parser.start_marker()
|
||||
end_marker = parser.end_marker()
|
||||
for i, line in enumerate(lines):
|
||||
try:
|
||||
if line.mnemonic is None and comments is not None and line.comment is not None:
|
||||
@@ -141,59 +60,151 @@ def find_marked_section(
|
||||
index_start = i + 1
|
||||
elif comments["end"] == line.comment:
|
||||
index_end = i
|
||||
elif (
|
||||
line.mnemonic in mov_instr
|
||||
and len(lines) > i + 1
|
||||
and lines[i + 1].directive is not None
|
||||
):
|
||||
source = line.operands[0 if not reverse else 1]
|
||||
destination = line.operands[1 if not reverse else 0]
|
||||
# instruction pair matches, check for operands
|
||||
if (
|
||||
isinstance(source, ImmediateOperand)
|
||||
and parser.normalize_imd(source) == mov_vals[0]
|
||||
and isinstance(destination, RegisterOperand)
|
||||
and parser.get_full_reg_name(destination) == mov_reg
|
||||
):
|
||||
# operands of first instruction match start, check for second one
|
||||
match, line_count = match_bytes(lines, i + 1, nop_bytes)
|
||||
if match:
|
||||
# return first line after the marker
|
||||
index_start = i + 1 + line_count
|
||||
elif (
|
||||
isinstance(source, ImmediateOperand)
|
||||
and parser.normalize_imd(source) == mov_vals[1]
|
||||
and isinstance(destination, RegisterOperand)
|
||||
and parser.get_full_reg_name(destination) == mov_reg
|
||||
):
|
||||
# operand of first instruction match end, check for second one
|
||||
match, line_count = match_bytes(lines, i + 1, nop_bytes)
|
||||
if match:
|
||||
# return line of the marker
|
||||
index_end = i
|
||||
except TypeError:
|
||||
print(i, line)
|
||||
if index_start == -1:
|
||||
matching_lines = match_lines(parser, lines[i:], start_marker)
|
||||
if matching_lines > 0:
|
||||
# Return the first line after the marker.
|
||||
index_start = i + matching_lines
|
||||
if index_end == -1:
|
||||
if match_lines(parser, lines[i:], end_marker):
|
||||
index_end = i
|
||||
except TypeError as e:
|
||||
print(i, e, line)
|
||||
if index_start != -1 and index_end != -1:
|
||||
break
|
||||
return index_start, index_end
|
||||
|
||||
|
||||
def match_bytes(lines, index, byte_list):
|
||||
"""Match bytes directives of markers"""
|
||||
# either all bytes are in one line or in separate ones
|
||||
extracted_bytes = []
|
||||
line_count = 0
|
||||
while (
|
||||
index < len(lines)
|
||||
and lines[index].directive is not None
|
||||
and lines[index].directive.name == "byte"
|
||||
# This function and the following ones traverse the syntactic tree produced by the parser and try to
|
||||
# match it to the marker. This is necessary because the IACA markers are significantly different on
|
||||
# MSVC x86 than on other ISA/compilers. Therefore, simple string matching is not sufficient. Also,
|
||||
# the syntax of numeric literals depends on the parser and should not be known to this class.
|
||||
# The matching only checks for a limited number of properties (and the marker doesn't specify the
|
||||
# rest).
|
||||
def match_lines(parser, lines, marker):
    """
    Match the beginning of `lines` against `marker` and return the length of the match.

    :param parser: parser that produced `lines`; it is forwarded to `match_line`.
    :param list of `InstructionForm` lines: parsed assembly code.
    :param list marker: pattern to match against the `lines`. Each element is either a single
                        marker line or a list of alternative marker lines, one of which must
                        match fully. Partially matched directive lines have their parameters
                        consumed by `match_line`.
    :return int: the length of the match in the parsed code, 0 if there is no match (including
                 the cases where `marker` is empty or `lines` ends before the marker does).
    """
    marker_iter = iter(marker)
    marker_line = next(marker_iter, None)
    if marker_line is None:
        # An empty marker cannot match anything.
        return 0
    for matched_lines, line in enumerate(lines):
        if isinstance(marker_line, list):
            # No support for partial matching in lists.
            for marker_alternative in marker_line:
                matching = match_line(parser, line, marker_alternative)
                if matching == Matching.Full:
                    break
            else:
                # None of the alternatives matched this line.
                return 0
            marker_line = next(marker_iter, None)
        else:
            matching = match_line(parser, line, marker_line)
            if matching == Matching.No:
                return 0
            elif matching == Matching.Partial:
                # Try the same marker line again. The call to `match_line` consumed some of the
                # directive parameters.
                pass
            elif matching == Matching.Full:
                # Move to the next marker line, the current one has been fully matched.
                marker_line = next(marker_iter, None)
        # If we have reached the last marker line, the parsed code matches the marker.
        if not marker_line:
            return matched_lines + 1
    # The parsed code ended before the whole marker was matched. Return 0 explicitly so that
    # callers comparing the result with an integer never receive `None`.
    return 0
|
||||
|
||||
def match_line(parser, line, marker_line):
|
||||
"""
|
||||
Returns whether `line` matches `marker_line`.
|
||||
|
||||
:param `InstructionForm` line: parsed assembly code.
|
||||
:param `InstructionForm` marker_line: pattern to match against `line`.
|
||||
:return: Matching. In case of partial match, `marker_line` is modified and should be reused for
|
||||
matching the next line in the parsed assembly code.
|
||||
"""
|
||||
if (
|
||||
line.mnemonic
|
||||
and marker_line.mnemonic
|
||||
and line.mnemonic == marker_line.mnemonic
|
||||
and match_operands(line.operands, marker_line.operands)
|
||||
):
|
||||
line_count += 1
|
||||
extracted_bytes += [int(x, 0) for x in lines[index].directive.parameters]
|
||||
index += 1
|
||||
if extracted_bytes[0 : len(byte_list)] == byte_list:
|
||||
return True, line_count
|
||||
return False, -1
|
||||
return Matching.Full
|
||||
if (
|
||||
line.directive
|
||||
and marker_line.directive
|
||||
and line.directive.name == marker_line.directive.name
|
||||
):
|
||||
return match_parameters(parser, line.directive.parameters, marker_line.directive.parameters)
|
||||
else:
|
||||
return Matching.No
|
||||
|
||||
def match_operands(line_operands, marker_line_operands):
    """
    Returns whether `line_operands` matches `marker_line_operands`.

    :param list line_operands: operands of an instruction in the parsed assembly code.
    :param list marker_line_operands: operands of an instruction in the marker.
    :return bool: True iff both lists have the same length and every pair of operands at the
                  same position matches according to `match_operand`.
    """
    if len(line_operands) == len(marker_line_operands):
        for line_operand, marker_operand in zip(line_operands, marker_line_operands):
            if not match_operand(line_operand, marker_operand):
                return False
        return True
    return False
|
||||
|
||||
def match_operand(line_operand, marker_line_operand):
    """
    Returns whether `line_operand` matches `marker_line_operand`.

    Two operands match when they are of the same kind and:
    - immediates have equal `value`;
    - registers have the same `name`, compared case-insensitively;
    - memory operands have a matching `base` and a matching `offset`.
    Operands of any other kind (including `None`) never match.

    :param line_operand: operand taken from the parsed assembly code.
    :param marker_line_operand: operand taken from the marker.
    :return bool: True iff the operands match.
    """
    if (
        isinstance(line_operand, ImmediateOperand)
        and isinstance(marker_line_operand, ImmediateOperand)
        and line_operand.value == marker_line_operand.value
    ):
        return True
    if (
        isinstance(line_operand, RegisterOperand)
        and isinstance(marker_line_operand, RegisterOperand)
        and line_operand.name.lower() == marker_line_operand.name.lower()
    ):
        return True
    if (
        isinstance(line_operand, MemoryOperand)
        and isinstance(marker_line_operand, MemoryOperand)
        and match_operand(line_operand.base, marker_line_operand.base)
        # Compare against the marker's offset; comparing the line's offset with itself would
        # accept any offset.
        and match_operand(line_operand.offset, marker_line_operand.offset)
    ):
        return True
    return False
|
||||
|
||||
def match_parameters(parser, line_parameters, marker_line_parameters):
    """
    Returns whether `line_parameters` matches `marker_line_parameters`.

    :param parser: parser forwarded to `match_parameter`.
    :param list of strings line_parameters: parameters of a directive in the parsed assembly code.
    :param list of strings marker_line_parameters: parameters of a directive in the marker.
    :return: Matching. In case of partial match, `marker_line_parameters` is modified and should be
             reused for matching the next line in the parsed assembly code.
    """
    # The elements of `marker_line_parameters` are consumed as they are matched.
    for line_parameter in line_parameters:
        if not marker_line_parameters:
            # The whole marker has been consumed; remaining line parameters are not checked.
            break
        if not match_parameter(parser, line_parameter, marker_line_parameters[0]):
            return Matching.No
        marker_line_parameters.pop(0)
    # Leftover marker parameters must be matched against the following line(s).
    if marker_line_parameters:
        return Matching.Partial
    else:
        return Matching.Full
|
||||
|
||||
def match_parameter(parser, line_parameter, marker_line_parameter):
    """
    Returns whether a single directive parameter matches the marker's parameter.

    :param parser: parser whose `normalize_imd` is used to compare immediate values.
    :param str line_parameter: parameter of a directive in the parsed assembly code.
    :param str marker_line_parameter: parameter of a directive in the marker.
    :return bool: True iff the parameters are equal ignoring case, or `parser.normalize_imd`
                  yields equal results for immediates built from them.
    """
    if line_parameter.lower() == marker_line_parameter.lower():
        return True
    # If the parameters don't match verbatim, check if they represent the same immediate value.
    immediates = [
        ImmediateOperand(value=text) for text in (line_parameter, marker_line_parameter)
    ]
    line_value, marker_value = [parser.normalize_imd(immediate) for immediate in immediates]
    return line_value == marker_value
|
||||
|
||||
|
||||
def find_jump_labels(lines):
|
||||
|
||||
Reference in New Issue
Block a user