Files
OSACA/osaca/parser/parser_x86intel.py

808 lines
34 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import pyparsing as pp
import unicodedata
from osaca.parser import ParserX86
from osaca.parser.directive import DirectiveOperand
from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.parser.instruction_form import InstructionForm
from osaca.parser.label import LabelOperand
from osaca.parser.memory import MemoryOperand
from osaca.parser.register import RegisterOperand
# We assume any non-ASCII characters except control characters and line terminators can be part of
# identifiers; this is based on the assumption that no assembler uses non-ASCII white space and
# syntax characters.
# This approach is described at the end of https://www.unicode.org/reports/tr55/#Whitespace-Syntax.
# It is appropriate for tools, such as this one, which process source code but do not fully validate
# it (in this case, that's the job of the assembler).
# Every code point from U+0080 up to U+10FFFF whose general category is not one of: control
# (Cc), line separator (Zl), paragraph separator (Zp), surrogate (Cs) or unassigned (Cn).
NON_ASCII_PRINTABLE_CHARACTERS = "".join(
    filter(
        lambda ch: unicodedata.category(ch) not in ("Cc", "Zl", "Zp", "Cs", "Cn"),
        map(chr, range(0x80, 0x110000)),
    )
)
# References:
# ASM386 Assembly Language Reference, document number 469165-003, https://mirror.math.princeton.edu/pub/oldlinux/Linux.old/Ref-docs/asm-ref.pdf.
# Microsoft Macro Assembler BNF Grammar, https://learn.microsoft.com/en-us/cpp/assembler/masm/masm-bnf-grammar?view=msvc-170.
# Intel Architecture Code Analyzer User's Guide, https://www.intel.com/content/dam/develop/external/us/en/documents/intel-architecture-code-analyzer-3-0-users-guide-157552.pdf.
class ParserX86Intel(ParserX86):
_instance = None
# Singleton pattern, as this is created very many times.
def __new__(cls):
    """Return the shared parser instance, creating it the first time it is requested."""
    instance = cls._instance
    if instance is None:
        # First request: allocate the object and remember it on the class.
        instance = super(ParserX86Intel, cls).__new__(cls)
        cls._instance = instance
    return instance
def __init__(self):
    """Run the base-class initialisation and start with an empty table of `=` definitions."""
    super().__init__()
    # Maps the symbol on the left of an "=" directive to the (unparsed) value on its right; it
    # is filled by `process_directive` and consulted by `process_identifier`.
    self._equ = dict()
# The IACA manual says: "For Microsoft* Visual C++ compiler, 64-bit version, use
# IACA_VC64_START and IACA_VC64_END, instead" (of IACA_START and IACA_END).
# TODO: Inconveniently, the code generated with optimization disabled (/Od) has two
# instructions. We should support both patterns, but then who runs OSACA with /Od?
def start_marker(self):
    """
    Return the instruction sequence recognised as the start-of-kernel marker.

    The marker is a single `mov` that stores the immediate 111 to a memory operand whose base
    is the GS register and whose offset is 111.

    :returns: list containing one `InstructionForm`
    """
    marker_value = 111
    marker_slot = MemoryOperand(
        base=RegisterOperand(name="GS"), offset=ImmediateOperand(value=marker_value)
    )
    store = InstructionForm(
        mnemonic="mov",
        operands=[marker_slot, ImmediateOperand(value=marker_value)],
    )
    return [store]
def end_marker(self):
    """
    Return the instruction sequence recognised as the end-of-kernel marker.

    The marker is a single `mov` that stores the immediate 222 to a memory operand whose base
    is the GS register and whose offset is 222.

    :returns: list containing one `InstructionForm`
    """
    marker_value = 222
    marker_slot = MemoryOperand(
        base=RegisterOperand(name="GS"), offset=ImmediateOperand(value=marker_value)
    )
    store = InstructionForm(
        mnemonic="mov",
        operands=[marker_slot, ImmediateOperand(value=marker_value)],
    )
    return [store]
def normalize_instruction_form(self, instruction_form, isa_model, arch_model):
    """
    Rewrite a freshly parsed Intel-syntax instruction form in place so that it can be looked up
    in the models: pick the VEX/non-VEX spelling the architecture model knows, put the operands
    in the order the models expect (destination last, as in AT&T syntax), and append a data-type
    suffix to the mnemonic when only the suffixed form is known.

    Does nothing if the form was already normalized; otherwise marks it as normalized.

    :param instruction_form: form to normalize; `mnemonic`, `operands` and `normalized` are
                             updated in place
    :param isa_model: model queried for source/destination information
    :param arch_model: model queried for the mnemonics known to the architecture
    """
    form = instruction_form
    if form.normalized:
        return
    form.normalized = True

    mnemonic = form.mnemonic
    if not mnemonic:
        return

    operands = form.operands
    arity = len(operands)

    # The model may only contain the VEX-encoded instruction and we may have the non-VEX-encoded
    # one, or vice-versa. Note that this doesn't work when the arguments differ between VEX-
    # encoded and non-VEX-encoded, e.g., for psubq.
    if not arch_model.get_instruction(mnemonic, arity):
        other_encoding = mnemonic[1:] if mnemonic[0] == "v" else "v" + mnemonic
        if arch_model.get_instruction(other_encoding, arity):
            mnemonic = other_encoding
    form.mnemonic = mnemonic

    # We cannot pass the operands because they may not match before the reordering. We just
    # pass the arity instead. Also, this must use the ISA model, because that's where the
    # source/destination information is found.
    model = isa_model.get_instruction(mnemonic, arity)
    has_destination = False
    destination_is_last = False
    if not model:
        # Unknown instruction: if there is only one operand, assume it is a source operand.
        destination_is_last = arity > 1
    else:
        for model_operand in model.operands:
            if has_destination:
                # Anything that is read or written after the first destination means that the
                # destination is not the single, final operand.
                if model_operand.source or model_operand.destination:
                    destination_is_last = False
            elif model_operand.destination:
                has_destination = True
                destination_is_last = True

    if destination_is_last:
        # It is important to reverse the operands, we cannot just move the first one last. This
        # makes a difference for instructions with 3 operands or more, such as roundsd: the
        # model files expect the rounding mode (an immediate) first but the Intel syntax has it
        # last.
        operands.reverse()

    # A hack to help with comparison instruction: if the instruction is in the model, and has
    # exactly two sources, swap its operands.
    two_sources_only = model and not has_destination and arity == 2
    if two_sources_only and not (
        isa_model.get_instruction(mnemonic, operands)
        or arch_model.get_instruction(mnemonic, operands)
    ):
        operands.reverse()

    # If the instruction has a well-known data type, append a suffix.
    suffix_for_data_type = {"DWORD": "d", "QWORD": "q"}
    for operand in operands:
        if not (isinstance(operand, MemoryOperand) and operand.data_type):
            continue
        suffix = suffix_for_data_type.get(operand.data_type, None)
        if not suffix:
            continue
        candidate = mnemonic + suffix
        if isa_model.get_instruction(candidate, arity) or arch_model.get_instruction(
            candidate, arity
        ):
            form.mnemonic = candidate
            break
def construct_parser(self):
    """
    Create the pyparsing grammar for x86 assembly in Intel syntax.

    The grammar elements needed by the other methods are stored on the instance as
    `self.comment`, `self.register`, `self.instruction_parser`, `self.label` and
    `self.directive`. Nothing is returned.
    """

    def keywords(*words):
        # Alternation matching any one of `words` as a whole, case-insensitive keyword.
        return pp.MatchFirst([pp.CaselessKeyword(word) for word in words])

    # Numeric literal.
    binary_number = pp.Combine(pp.Word("01") + pp.CaselessLiteral("B"))
    octal_number = pp.Combine(pp.Word("01234567") + pp.CaselessLiteral("O"))
    decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums))
    hex_number = pp.Combine(pp.Word(pp.hexnums) + pp.CaselessLiteral("H"))
    float_number = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Word(pp.nums) + pp.Word(".", pp.nums)
    ).setResultsName("value")
    integer_number = (
        binary_number ^ octal_number ^ decimal_number ^ hex_number
    ).setResultsName("value")

    # Comment.
    self.comment = pp.Word(";#", exact=1) + pp.Group(
        pp.ZeroOrMore(pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS))
    ).setResultsName(self.comment_id)

    # Types.
    data_type = keywords(
        "BYTE",
        "DWORD",
        "FWORD",
        "MMWORD",
        "OWORD",
        "QWORD",
        "REAL10",
        "REAL4",
        "REAL8",
        "SBYTE",
        "SDWORD",
        "SQWORD",
        "SWORD",
        "TBYTE",
        "WORD",
        "XMMWORD",
        "YMMWORD",
    ).setResultsName("data_type")

    # Identifier. Note that $ is not mentioned in the ASM386 Assembly Language Reference,
    # but it is mentioned in the MASM syntax. < and > apparently show up in C++ mangled names.
    # ICC allows ".", at least in labels.
    first = pp.Word(pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>", exact=1)
    rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    identifier = pp.Group(
        pp.Combine(first + pp.Optional(rest)).setResultsName("name")
    ).setResultsName("identifier")

    # Register.
    # This is based on the MASM grammar. The general-purpose and byte register lists name
    # every x86-64 register of that width, so that e.g. R10D or SIL is parsed as a register
    # and not as an identifier.
    special_register = keywords(
        "CR0",
        "CR2",
        "CR3",
        "DR0",
        "DR1",
        "DR2",
        "DR3",
        "DR6",
        "DR7",
        "TR3",
        "TR4",
        "TR5",
        "TR6",
        "TR7",
    ).setResultsName("name")
    gp_register = keywords(
        "AX",
        "EAX",
        "CX",
        "ECX",
        "DX",
        "EDX",
        "BX",
        "EBX",
        "DI",
        "EDI",
        "SI",
        "ESI",
        "BP",
        "EBP",
        "SP",
        "ESP",
        "R8W",
        "R8D",
        "R9W",
        "R9D",
        "R10W",
        "R10D",
        "R11W",
        "R11D",
        "R12W",
        "R12D",
        "R13W",
        "R13D",
        "R14W",
        "R14D",
        "R15W",
        "R15D",
    ).setResultsName("name")
    byte_register = keywords(
        "AL",
        "AH",
        "CL",
        "CH",
        "DL",
        "DH",
        "BL",
        "BH",
        "SIL",
        "DIL",
        "BPL",
        "SPL",
        "R8B",
        "R9B",
        "R10B",
        "R11B",
        "R12B",
        "R13B",
        "R14B",
        "R15B",
    ).setResultsName("name")
    qword_register = keywords(
        "RAX",
        "RCX",
        "RDX",
        "RBX",
        "RSP",
        "RBP",
        "RSI",
        "RDI",
        "R8",
        "R9",
        "R10",
        "R11",
        "R12",
        "R13",
        "R14",
        "R15",
    ).setResultsName("name")
    fpu_register = pp.Combine(
        pp.CaselessKeyword("ST")
        + pp.Optional(pp.Literal("(") + pp.Word("01234567") + pp.Literal(")"))
    ).setResultsName("name")
    # Any decimal register number is accepted after XMM/YMM; range checking is left to the
    # assembler.
    xmm_register = pp.Combine(pp.CaselessLiteral("XMM") + pp.Word(pp.nums))
    simd_register = (
        pp.Combine(pp.CaselessLiteral("MM") + pp.Word("01234567"))
        | xmm_register
        | pp.Combine(pp.CaselessLiteral("YMM") + pp.Word(pp.nums))
    ).setResultsName("name")
    segment_register = keywords("CS", "DS", "ES", "FS", "GS", "SS").setResultsName("name")
    self.register = pp.Group(
        special_register
        | gp_register
        | byte_register
        | qword_register
        | fpu_register
        | simd_register
        | segment_register
        # Like every other alternative, RIP must be stored under "name": that is the key
        # `process_register` reads.
        | pp.CaselessKeyword("RIP").setResultsName("name")
    ).setResultsName(self.register_id)

    # Register expressions.
    base_register = self.register
    index_register = self.register
    scale = pp.Word("1248", exact=1)
    base = base_register.setResultsName("base")
    displacement = pp.Group(
        pp.Group(integer_number ^ identifier).setResultsName(self.immediate_id)
    ).setResultsName("displacement")
    short_indexed = index_register.setResultsName("index")
    long_indexed = (
        index_register.setResultsName("index")
        + pp.Literal("*")
        + scale.setResultsName("scale")
    )
    indexed = pp.Group(short_indexed ^ long_indexed).setResultsName("indexed")
    operator = pp.Word("+-", exact=1)
    operator_index = pp.Word("+-", exact=1).setResultsName("operator_idx")
    operator_displacement = pp.Word("+-", exact=1).setResultsName("operator_disp")
    # Syntax:
    # `base` always precedes `indexed`.
    # `short_indexed` is only allowed if it follows `base`, not alone.
    # `displacement` can go anywhere.
    # It's easier to list all the alternatives than to represent these rules using complicated
    # `Optional` and what not.
    register_expression = pp.Group(
        pp.Literal("[")
        + (
            base
            ^ (base + operator_displacement + displacement)
            ^ (base + operator_displacement + displacement + operator_index + indexed)
            ^ (base + operator_index + indexed)
            ^ (base + operator_index + indexed + operator_displacement + displacement)
            ^ (displacement + operator + base)
            ^ (displacement + operator + base + operator_index + indexed)
            ^ (
                displacement
                + operator_index
                + pp.Group(long_indexed).setResultsName("indexed")
            )
            ^ pp.Group(long_indexed).setResultsName("indexed")
            ^ (
                pp.Group(long_indexed).setResultsName("indexed")
                + operator_displacement
                + displacement
            )
        )
        + pp.Literal("]")
    ).setResultsName("register_expression")

    # Immediate.
    immediate = pp.Group(integer_number | float_number | identifier).setResultsName(
        self.immediate_id
    )

    # Expressions.
    # The ASM86 manual has weird expressions on page 130 (displacement outside of the register
    # expression, multiple register expressions). Let's ignore those for now, but see
    # https://stackoverflow.com/questions/71540754/why-sometimes-use-offset-flatlabel-and-sometimes-not.
    address_expression = pp.Group(
        self.register.setResultsName("segment") + pp.Literal(":") + immediate
        ^ immediate + register_expression
        ^ register_expression
        ^ identifier + pp.Optional(operator + immediate)
    ).setResultsName("address_expression")
    offset_expression = pp.Group(
        pp.CaselessKeyword("OFFSET")
        + pp.Group(keywords("GROUP", "SEGMENT", "FLAT"))
        # The MASM grammar has the ":" immediately after "OFFSET", but that's not what MSVC
        # outputs.
        + pp.Literal(":")
        + identifier.setResultsName("identifier")
        + pp.Optional(pp.Literal("+") + immediate.setResultsName("displacement"))
    ).setResultsName("offset_expression")
    ptr_expression = pp.Group(
        data_type + pp.CaselessKeyword("PTR") + address_expression
    ).setResultsName("ptr_expression")
    short_expression = pp.Group(pp.CaselessKeyword("SHORT") + identifier).setResultsName(
        "short_expression"
    )

    # Instructions.
    mnemonic = pp.Word(pp.alphas, pp.alphanums).setResultsName("mnemonic")
    operand = pp.Group(
        self.register
        | pp.Group(
            offset_expression | ptr_expression | short_expression | address_expression
        ).setResultsName(self.memory_id)
        | immediate
    )
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand.setResultsName("operand1"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand2"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand3"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand.setResultsName("operand4"))
        + pp.Optional(self.comment)
    )

    # Label.
    self.label = pp.Group(
        identifier.setResultsName("name")
        + pp.Literal(":")
        + pp.Optional(self.instruction_parser)
        + pp.Optional(self.comment)
    ).setResultsName(self.label_id)

    # Directives.
    # The identifiers at the beginning of a directive cannot start with a "." otherwise we end
    # up with ambiguities.
    directive_first = pp.Word(pp.alphas + NON_ASCII_PRINTABLE_CHARACTERS + "$?@_<>", exact=1)
    directive_rest = pp.Word(pp.alphanums + NON_ASCII_PRINTABLE_CHARACTERS + ".$?@_<>")
    directive_identifier = pp.Group(
        pp.Combine(directive_first + pp.Optional(directive_rest)).setResultsName("name")
    ).setResultsName("identifier")
    # Parameter can be any quoted string or sequence of characters besides ';' (for comments)
    # or ',' (parameter delimiter). See ASM386 p. 38.
    directive_parameter = (
        pp.quotedString
        ^ (
            pp.Word(pp.printables + NON_ASCII_PRINTABLE_CHARACTERS, excludeChars=",;")
            + pp.Optional(pp.Suppress(pp.Literal(",")))
        )
        ^ pp.Suppress(pp.Literal(","))
    )
    # The directives that don't start with a "." are ambiguous with instructions, so we list
    # them explicitly.
    # TODO: The directives that are types introduce a nasty ambiguity with instructions. Skip
    # them for now, apparently the MSVC output uses the short D? directives. The skipped ones
    # are: BYTE, DWORD, FWORD, MMWORD, QWORD, REAL10, REAL4, REAL8, SBYTE, SDWORD, SWORD,
    # TBYTE, WORD, XMMWORD and YMMWORD.
    directive_keywords = keywords(
        "ALIAS",
        "ALIGN",
        "ASSUME",
        "CATSTR",
        "COMM",
        "COMMENT",
        "DB",
        "DD",
        "DF",
        "DQ",
        "DT",
        "DW",
        "ECHO",
        "END",
        "ENDP",
        "ENDS",
        "EQU",
        "EVEN",
        "EXTRN",
        "EXTERNDEF",
        "GROUP",
        "INCLUDE",
        "INCLUDELIB",
        "INSTR",
        "INVOKE",
        "LABEL",
        "OPTION",
        "ORG",
        "PAGE",
        "POPCONTEXT",
        "PROC",
        "PROTO",
        "PUBLIC",
        "PUSHCONTEXT",
        "RECORD",
        "SEGMENT",
        "SIZESTR",
        "STRUCT",
        "SUBSTR",
        "SUBTITLE",
        "TEXTEQU",
        "TITLE",
        "TYPEDEF",
        "UNION",
    )
    self.directive = pp.Group(
        pp.Optional(~directive_keywords + directive_identifier)
        + (
            pp.Combine(pp.Literal(".") + pp.Word(pp.alphanums + "_"))
            | pp.Literal("=")
            | directive_keywords
        ).setResultsName("name")
        + pp.ZeroOrMore(directive_parameter).setResultsName("parameters")
        + pp.Optional(self.comment)
    ).setResultsName(self.directive_id)
def parse_line(self, line, line_number=None):
    """
    Parse line and return instruction form.

    The line is tried, in this order, as a comment, as a label (optionally followed by an
    instruction on the same line), as a directive and finally as an instruction.

    :param str line: line of assembly code
    :param line_number: default None, identifier of instruction form
    :type line_number: int, optional
    :return: `InstructionForm` -- parsed asm line (comment, label, directive or instruction
             form)
    :raises ValueError: if the line matches none of the four forms
    """
    instruction_form = InstructionForm(line=line, line_number=line_number)
    result = None

    # 1. Parse comment.
    try:
        result = self.process_operand(self.comment.parseString(line, parseAll=True))
        instruction_form.comment = " ".join(result[self.comment_id])
    except pp.ParseException:
        pass

    # 2. Parse label.
    if not result:
        try:
            # Returns a tuple with the label operand and, if an instruction follows the label
            # on the same line, the instruction form built by `make_instruction` (else None).
            result = self.process_operand(self.label.parseString(line, parseAll=True))
            label_operand, instruction = result
            instruction_form.label = label_operand.name
            if instruction is not None:
                # The second element is an instruction form, not a list of comment words:
                # copy the same fields that step 4 copies for a plain instruction.
                instruction_form.mnemonic = instruction.mnemonic
                instruction_form.operands = instruction.operands
                instruction_form.comment = instruction.comment
        except pp.ParseException:
            pass

    # 3. Parse directive.
    if not result:
        try:
            # Returns tuple with directive operand and comment, if any.
            result = self.process_operand(self.directive.parseString(line, parseAll=True))
            instruction_form.directive = result[0]
            if result[1]:
                instruction_form.comment = " ".join(result[1])
        except pp.ParseException:
            pass

    # 4. Parse instruction.
    if not result:
        try:
            result = self.parse_instruction(line)
        except pp.ParseException as e:
            raise ValueError(
                "Could not parse instruction on line {}: {!r}".format(line_number, line)
            ) from e
        instruction_form.mnemonic = result.mnemonic
        instruction_form.operands = result.operands
        instruction_form.comment = result.comment
    return instruction_form
def make_instruction(self, parse_result):
    """
    Build an instruction form from the parse result of an instruction.

    :param parse_result: tuple resulting from calling `parseString` on the `instruction_parser`.
    :returns: `InstructionForm` holding the mnemonic, the processed operands (up to four, in
              source order) and the comment words joined by spaces, or None without comment
    """
    operands = [
        self.process_operand(parse_result[key])
        for key in ("operand1", "operand2", "operand3", "operand4")
        if key in parse_result
    ]
    comment = None
    if self.comment_id in parse_result:
        comment = " ".join(parse_result[self.comment_id])
    return InstructionForm(
        mnemonic=parse_result.mnemonic,
        operands=operands,
        label_id=None,
        comment_id=comment,
    )
def parse_instruction(self, instruction):
    """
    Parse a line that consists of a single instruction.

    :param str instruction: Assembly line string.
    :returns: whatever `make_instruction` builds from the parse result; the whole string
              must match the instruction grammar
    """
    parsed = self.instruction_parser.parseString(instruction, parseAll=True)
    return self.make_instruction(parsed)
def parse_register(self, register_string):
    """
    Parse a string that must consist of exactly one register.

    :param str register_string: text to parse
    :returns: the processed register operand, or None if the string is not a register
    """
    register = None
    try:
        parsed = self.register.parseString(register_string, parseAll=True)
        register = self.process_operand(parsed)
    except pp.ParseException:
        # Not a register: report that with None rather than with an exception.
        pass
    return register
def process_operand(self, operand):
    """
    Post-process a parse result by handing it to the handler of the first kind it contains.

    The kinds are probed in a fixed order: directive, identifier, immediate, label, memory,
    register. A parse result containing none of them is returned unchanged.
    """
    handlers = (
        (self.directive_id, self.process_directive),
        (self.identifier, self.process_identifier),
        (self.immediate_id, self.process_immediate),
        (self.label_id, self.process_label),
        (self.memory_id, self.process_memory_address),
        (self.register_id, self.process_register),
    )
    for key, handler in handlers:
        if key in operand:
            return handler(operand[key])
    return operand
def process_directive(self, directive):
    """
    Post-process a parsed directive.

    :returns: tuple of the `DirectiveOperand` and the directive's comment (None if absent)
    """
    name = directive.name
    # TODO: This is putting the identifier in the parameters. No idea if it's right.
    parameters = []
    if "identifier" in directive:
        parameters.append(directive.identifier.name)
    parameters += directive.parameters
    operand = DirectiveOperand(name=name, parameters=parameters or None)
    if name == "=":
        # Interpret the "=" directives because the generated assembly is full of symbols that
        # are defined there.
        symbol, value = parameters[0], parameters[1]
        self._equ[symbol] = value
    return operand, directive.get("comment")
def process_register(self, operand):
    """Turn a parsed register into a `RegisterOperand` carrying the parsed name."""
    register_name = operand.name
    return RegisterOperand(name=register_name)
def process_register_expression(self, register_expression):
    """
    Build a `MemoryOperand` from a parsed bracketed register expression.

    :param register_expression: parse result with the optional entries "base", "indexed"
                                (holding "index" and optionally "scale"), "displacement",
                                "operator_idx" and "operator_disp"
    :returns: `MemoryOperand` with base, index, scale and offset filled in; the scale
              defaults to 1 and is negated if the index is preceded by "-", and the
              displacement is negated if it is preceded by "-"
    """
    base = register_expression.get("base")
    displacement = register_expression.get("displacement")
    indexed = register_expression.get("indexed")
    index = None
    scale = 1
    if indexed:
        index = indexed.get("index")
        scale = int(indexed.get("scale", "1"), 0)
        # The grammar records the sign in front of the index under "operator_idx".
        if register_expression.get("operator_idx") == "-":
            scale *= -1
    displacement_op = self.process_immediate(displacement.immediate) if displacement else None
    # NOTE(review): this assumes a negated displacement is numeric; a symbolic displacement
    # may not carry a `value` -- confirm.
    if displacement_op and register_expression.get("operator_disp") == "-":
        displacement_op.value *= -1
    base_op = RegisterOperand(name=base.name) if base else None
    index_op = RegisterOperand(name=index.name) if index else None
    new_memory = MemoryOperand(
        offset=displacement_op, base=base_op, index=index_op, scale=scale
    )
    return new_memory
def process_address_expression(self, address_expression, data_type=None):
    """
    Build the operand described by a parsed address expression.

    :param address_expression: parse result that may contain "immediate",
                               "register_expression", "segment" and "identifier"
    :param data_type: data type from an enclosing PTR expression, if any
    :returns: a `MemoryOperand`, or the bare identifier operand when the expression is only
              an identifier without offset and without data type
    """

    def convert(key, handler):
        # Process the named part if the expression has it.
        return handler(address_expression[key]) if key in address_expression else None

    # TODO: It seems that we could have a prefix immediate operand, a displacement in the
    # brackets, and an offset. How all of this works together is somewhat mysterious.
    immediate_operand = convert("immediate", self.process_immediate)
    register_expression = convert("register_expression", self.process_register_expression)
    segment = convert("segment", self.process_register)
    identifier = convert("identifier", self.process_identifier)

    if register_expression:
        # A prefix immediate replaces whatever offset the brackets produced.
        if immediate_operand:
            register_expression.offset = immediate_operand
        if data_type:
            register_expression.data_type = data_type
        return register_expression

    if segment:
        return MemoryOperand(base=segment, offset=immediate_operand, data_type=data_type)

    if not identifier:
        return MemoryOperand(base=immediate_operand, data_type=data_type)

    if immediate_operand:
        identifier.offset = immediate_operand
    elif not data_type:
        # An address expression without a data type or an offset is just an identifier.
        # This matters for jumps.
        return identifier
    return MemoryOperand(offset=identifier, data_type=data_type)
def process_offset_expression(self, offset_expression):
    """
    Build a `MemoryOperand` from a parsed OFFSET expression.

    :param offset_expression: parse result holding "identifier" and optionally
                              "displacement"
    :returns: `MemoryOperand` whose offset is the processed identifier; the processed
              displacement (or None) is stored in the identifier's `offset`
    """
    # TODO: Record that this is an offset expression.
    displacement = (
        self.process_immediate(offset_expression.displacement)
        if "displacement" in offset_expression
        else None
    )
    # Look the sign up in the parse result, as is done for register expressions. The grammar
    # for OFFSET expressions currently only accepts "+", so this only takes effect once a
    # signed displacement is recorded under "operator_disp".
    if displacement and offset_expression.get("operator_disp") == "-":
        displacement.value *= -1
    identifier = self.process_identifier(offset_expression.identifier)
    identifier.offset = displacement
    return MemoryOperand(offset=identifier)
def process_ptr_expression(self, ptr_expression):
    """
    Process a `<type> PTR <address>` expression by processing its address expression with
    the parsed data type attached.
    """
    # TODO: Do something with the data_type.
    address = ptr_expression.address_expression
    data_type = ptr_expression.data_type
    return self.process_address_expression(address, data_type)
def process_short_expression(self, short_expression):
    """Turn a `SHORT <identifier>` expression into a `LabelOperand` named by the identifier."""
    # TODO: Do something with the fact that it is short.
    target = short_expression.identifier
    return LabelOperand(name=target.name)
def process_memory_address(self, memory_address):
    """
    Post-process memory address operand.

    The parse result is handed to the handler of the first expression kind it contains,
    probing in the order address, offset, ptr, short; a parse result containing none of
    them is returned unchanged.
    """
    handlers = (
        ("address_expression", self.process_address_expression),
        ("offset_expression", self.process_offset_expression),
        ("ptr_expression", self.process_ptr_expression),
        ("short_expression", self.process_short_expression),
    )
    for key, handler in handlers:
        if key in memory_address:
            return handler(memory_address[key])
    return memory_address
def process_label(self, label):
    """
    Post-process label asm line.

    :returns: tuple of the `LabelOperand` and, if the parse result contains a mnemonic, the
              instruction form built from it (None otherwise)
    """
    # Remove duplicated 'name' level due to identifier. Note that there is no place to put the
    # comment, if any.
    label["name"] = label["name"]["name"]
    label_operand = LabelOperand(name=label.name)
    instruction = None
    if "mnemonic" in label:
        instruction = self.make_instruction(label)
    return label_operand, instruction
def process_immediate(self, immediate):
    """
    Post-process immediate operand.

    An immediate that is actually an identifier is delegated to `process_identifier`;
    otherwise the literal (with its sign, if one was recorded) is normalized to a number.
    """
    if "identifier" in immediate:
        # Actually an identifier, change declaration.
        return self.process_identifier(immediate.identifier)
    literal = immediate.get("sign", "") + immediate.value
    operand = ImmediateOperand(value=literal)
    operand.value = self.normalize_imd(operand)
    return operand
def process_identifier(self, identifier):
    """
    Post-process an identifier.

    :returns: an `ImmediateOperand` with the normalized value if the name was defined by an
              "=" directive, an `IdentifierOperand` otherwise
    """
    name = identifier.name
    if name not in self._equ:
        return IdentifierOperand(name=name)
    # Actually an immediate, change declaration.
    operand = ImmediateOperand(identifier=name, value=self._equ[name])
    operand.value = self.normalize_imd(operand)
    return operand
def normalize_imd(self, imd):
    """
    Normalize immediate to decimal based representation.

    :param imd: object whose `value` is either already a number or a string: a float literal
                (contains "."), or an integer literal with an optional leading "+" or "-" and
                an optional radix suffix "B" (binary), "O" (octal) or "H" (hexadecimal), in
                either case
    :returns: `float` for float literals, `int` for integer literals, and `imd.value`
              unchanged if it is not a string
    :raises ValueError: if a digit is not valid for the radix
    """
    text = imd.value
    if not isinstance(text, str):
        return text
    if "." in text:
        return float(text)
    # Now parse depending on the base. The suffix is matched case-insensitively because
    # values coming from "=" directives are stored exactly as written in the source.
    base = {"B": 2, "O": 8, "H": 16}.get(text[-1].upper(), 10)
    negative = text[0] == "-"
    start = 1 if text[0] in "+-" else 0
    # Without a radix suffix the last character is a digit, otherwise it is dropped.
    stop = len(text) if base == 10 else -1
    value = 0
    for c in text[start:stop]:
        value = value * base + int(c, base)
    return -value if negative else value