black formatting

2026-01-04 18:20:09 +01:00 · 2025-03-05 10:20:47 +01:00
parent 02716e7b41
commit 9c2f559983
9 changed files with 104 additions and 156 deletions
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -243,9 +243,7 @@ def check_arguments(args, parser):
            "Microarchitecture not supported. Please see --help for all valid architecture codes."
        )
    if args.syntax and args.arch and MachineModel.get_isa_for_arch(args.arch) != "x86":
-        parser.error(
-            "Syntax can only be explicitly specified for an x86 microarchitecture"
-        )
+        parser.error("Syntax can only be explicitly specified for an x86 microarchitecture")
    if args.syntax and args.syntax.upper() not in SUPPORTED_SYNTAXES:
        parser.error(
            "Assembly syntax not supported. Please see --help for all valid assembly syntaxes."
--- a/osaca/parser/base_parser.py
+++ b/osaca/parser/base_parser.py
@@ -48,12 +48,7 @@ class BaseParser(object):
    # Performs all the normalization needed to match the instruction to the ISA/arch model.  This
    # method must set the `normalized` property of the instruction and must be idempotent.
-    def normalize_instruction_form(
-        self,
-        instruction_form,
-        isa_model,
-        arch_model
-    ):
+    def normalize_instruction_form(self, instruction_form, isa_model, arch_model):
        raise NotImplementedError
    @staticmethod
--- a/osaca/parser/parser_AArch64.py
+++ b/osaca/parser/parser_AArch64.py
@@ -34,30 +34,25 @@ class ParserAArch64(BaseParser):
        return [
            InstructionForm(
                mnemonic="mov",
-                operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=111)]
+                operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=111)],
            ),
            InstructionForm(
                directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
-            )
+            ),
        ]
    def end_marker(self):
        return [
            InstructionForm(
                mnemonic="mov",
-                operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=222)]
+                operands=[RegisterOperand(name="1", prefix="x"), ImmediateOperand(value=222)],
            ),
            InstructionForm(
                directive_id=DirectiveOperand(name="byte", parameters=["213", "3", "32", "31"])
-            )
+            ),
        ]
-    def normalize_instruction_form(
-        self,
-        instruction_form,
-        isa_model,
-        arch_model
-    ):
+    def normalize_instruction_form(self, instruction_form, isa_model, arch_model):
        """
        If the instruction doesn't exist in the machine model, normalize it by dropping the shape
        suffix.
--- a/osaca/parser/parser_x86att.py
+++ b/osaca/parser/parser_x86att.py
@@ -30,16 +30,16 @@ class ParserX86ATT(ParserX86):
            [
                InstructionForm(
                    mnemonic="mov",
-                    operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")]
+                    operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")],
                ),
                InstructionForm(
                    mnemonic="movl",
-                    operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")]
+                    operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")],
-                )
+                ),
            ],
            InstructionForm(
                directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
-            )
+            ),
        ]
    def end_marker(self):
@@ -47,24 +47,19 @@ class ParserX86ATT(ParserX86):
            [
                InstructionForm(
                    mnemonic="mov",
-                    operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")]
+                    operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")],
                ),
                InstructionForm(
                    mnemonic="movl",
-                    operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")]
+                    operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")],
-                )
+                ),
            ],
            InstructionForm(
                directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"])
-            )
+            ),
        ]
-    def normalize_instruction_form(
-        self,
-        instruction_form,
-        isa_model,
-        arch_model
-    ):
+    def normalize_instruction_form(self, instruction_form, isa_model, arch_model):
        """
        If the instruction doesn't exist in the machine model, normalize it by dropping the GAS
        suffix.
--- a/osaca/parser/parser_x86intel.py
+++ b/osaca/parser/parser_x86intel.py
@@ -19,7 +19,8 @@ from osaca.parser.register import RegisterOperand
 # It is appropriate for tools, such as this one, which process source code but do not fully validate
 # it (in this case, that’s the job of the assembler).
 NON_ASCII_PRINTABLE_CHARACTERS = "".join(
-    chr(cp) for cp in range(0x80, 0x10FFFF + 1)
+    chr(cp)
+    for cp in range(0x80, 0x10FFFF + 1)
    if unicodedata.category(chr(cp)) not in ("Cc", "Zl", "Zp", "Cs", "Cn")
 )
@@ -51,11 +52,10 @@ class ParserX86Intel(ParserX86):
                mnemonic="mov",
                operands=[
                    MemoryOperand(
-                        base=RegisterOperand(name="GS"),
-                        offset=ImmediateOperand(value=111)
+                        base=RegisterOperand(name="GS"), offset=ImmediateOperand(value=111)
                    ),
-                    ImmediateOperand(value=111)
+                    ImmediateOperand(value=111),
-                ]
+                ],
            ),
        ]
@@ -65,20 +65,14 @@ class ParserX86Intel(ParserX86):
                mnemonic="mov",
                operands=[
                    MemoryOperand(
-                        base=RegisterOperand(name="GS"),
-                        offset=ImmediateOperand(value=222)
+                        base=RegisterOperand(name="GS"), offset=ImmediateOperand(value=222)
                    ),
-                    ImmediateOperand(value=222)
+                    ImmediateOperand(value=222),
-                ]
+                ],
            ),
        ]
-    def normalize_instruction_form(
-        self,
-        instruction_form,
-        isa_model,
-        arch_model
-    ):
+    def normalize_instruction_form(self, instruction_form, isa_model, arch_model):
        """
        If the model indicates that this instruction has a single destination that is the last
        operand, move the first operand to the last position.  This effectively converts the Intel
@@ -95,23 +89,14 @@ class ParserX86Intel(ParserX86):
        # The model may only contain the VEX-encoded instruction and we may have the non-VEX-encoded
        # one, or vice-versa.  Note that this doesn't work when the arguments differ between VEX-
        # encoded and non-VEX-encoded, e.g., for psubq.
-        if not arch_model.get_instruction(
-            mnemonic,
-            len(instruction_form.operands)
-        ):
-            if mnemonic[0] == 'v':
+        if not arch_model.get_instruction(mnemonic, len(instruction_form.operands)):
+            if mnemonic[0] == "v":
                unvexed_mnemonic = mnemonic[1:]
-                if arch_model.get_instruction(
-                    unvexed_mnemonic,
-                    len(instruction_form.operands)
-                ):
+                if arch_model.get_instruction(unvexed_mnemonic, len(instruction_form.operands)):
                    mnemonic = unvexed_mnemonic
            else:
-                vexed_mnemonic = 'v' + mnemonic
+                vexed_mnemonic = "v" + mnemonic
-                if arch_model.get_instruction(
-                    vexed_mnemonic,
-                    len(instruction_form.operands)
-                ):
+                if arch_model.get_instruction(vexed_mnemonic, len(instruction_form.operands)):
                    mnemonic = vexed_mnemonic
            instruction_form.mnemonic = mnemonic
@@ -149,13 +134,8 @@ class ParserX86Intel(ParserX86):
            model
            and not has_destination
            and len(instruction_form.operands) == 2
-            and not isa_model.get_instruction(
-                mnemonic,
-                instruction_form.operands
-            ) and not arch_model.get_instruction(
-                mnemonic,
-                instruction_form.operands
-            )
+            and not isa_model.get_instruction(mnemonic, instruction_form.operands)
+            and not arch_model.get_instruction(mnemonic, instruction_form.operands)
        ):
            instruction_form.operands.reverse()
@@ -167,11 +147,9 @@ class ParserX86Intel(ParserX86):
                if suffix:
                    suffixed_mnemonic = mnemonic + suffix
                    if isa_model.get_instruction(
-                        suffixed_mnemonic,
-                        len(instruction_form.operands)
+                        suffixed_mnemonic, len(instruction_form.operands)
                    ) or arch_model.get_instruction(
-                        suffixed_mnemonic,
-                        len(instruction_form.operands)
+                        suffixed_mnemonic, len(instruction_form.operands)
                    ):
                        instruction_form.mnemonic = suffixed_mnemonic
                        break
@@ -179,18 +157,10 @@ class ParserX86Intel(ParserX86):
    def construct_parser(self):
        """Create parser for x86 Intel ISA."""
        # Numeric literal.
-        binary_number = pp.Combine(
-            pp.Word("01") + pp.CaselessLiteral("B")
-        )
-        octal_number = pp.Combine(
-            pp.Word("01234567") + pp.CaselessLiteral("O")
-        )
-        decimal_number = pp.Combine(
-            pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)
-        )
-        hex_number = pp.Combine(
-            pp.Word(pp.hexnums) + pp.CaselessLiteral("H")
-        )
+        binary_number = pp.Combine(pp.Word("01") + pp.CaselessLiteral("B"))
+        octal_number = pp.Combine(pp.Word("01234567") + pp.CaselessLiteral("O"))
+        decimal_number = pp.Combine(pp.Optional(pp.Literal("-")) + pp.Word(pp.nums))
+        hex_number = pp.Combine(pp.Word(pp.hexnums) + pp.CaselessLiteral("H"))
        float_number = pp.Combine(
            pp.Optional(pp.Literal("-")) + pp.Word(pp.nums) + pp.Word(".", pp.nums)
        ).setResultsName("value")
@@ -316,9 +286,8 @@ class ParserX86Intel(ParserX86):
            pp.CaselessKeyword("ST")
            + pp.Optional(pp.Literal("(") + pp.Word("01234567") + pp.Literal(")"))
        ).setResultsName("name")
-        xmm_register = (
-            pp.Combine(pp.CaselessLiteral("XMM") + pp.Word(pp.nums))
-            | pp.Combine(pp.CaselessLiteral("XMM1") + pp.Word("012345"))
+        xmm_register = pp.Combine(pp.CaselessLiteral("XMM") + pp.Word(pp.nums)) | pp.Combine(
+            pp.CaselessLiteral("XMM1") + pp.Word("012345")
        )
        simd_register = (
            pp.Combine(pp.CaselessLiteral("MM") + pp.Word("01234567"))
@@ -350,18 +319,15 @@ class ParserX86Intel(ParserX86):
        index_register = self.register
        scale = pp.Word("1248", exact=1)
        post_displacement = pp.Group(
-            (pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign")
+            (pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign") + integer_number
-            + integer_number | identifier
+            | identifier
        ).setResultsName(self.immediate_id)
-        pre_displacement = pp.Group(
-            integer_number + pp.Literal("+")
-        ).setResultsName(self.immediate_id)
+        pre_displacement = pp.Group(integer_number + pp.Literal("+")).setResultsName(
+            self.immediate_id
+        )
        indexed = pp.Group(
            index_register.setResultsName("index")
-            + pp.Optional(
-                pp.Literal("*")
-                + scale.setResultsName("scale")
-            )
+            + pp.Optional(pp.Literal("*") + scale.setResultsName("scale"))
        ).setResultsName("indexed")
        register_expression = pp.Group(
            pp.Literal("[")
@@ -369,9 +335,8 @@ class ParserX86Intel(ParserX86):
            + pp.Group(
                base_register.setResultsName("base")
                ^ pp.Group(
-                    base_register.setResultsName("base")
-                    + pp.Literal("+")
-                    + indexed).setResultsName("base_and_indexed")
+                    base_register.setResultsName("base") + pp.Literal("+") + indexed
+                ).setResultsName("base_and_indexed")
                ^ indexed
            ).setResultsName("non_displacement")
            + pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement"))
@@ -379,9 +344,9 @@ class ParserX86Intel(ParserX86):
        ).setResultsName("register_expression")
        # Immediate.
-        immediate = pp.Group(
+        immediate = pp.Group(integer_number | float_number | identifier).setResultsName(
-            integer_number | float_number | identifier
+            self.immediate_id
-        ).setResultsName(self.immediate_id)
+        )
        # Expressions.
        # The ASM86 manual has weird expressions on page 130 (displacement outside of the register
@@ -410,21 +375,16 @@ class ParserX86Intel(ParserX86):
        ptr_expression = pp.Group(
            data_type + pp.CaselessKeyword("PTR") + address_expression
        ).setResultsName("ptr_expression")
-        short_expression = pp.Group(
+        short_expression = pp.Group(pp.CaselessKeyword("SHORT") + identifier).setResultsName(
-            pp.CaselessKeyword("SHORT") + identifier
+            "short_expression"
-        ).setResultsName("short_expression")
+        )
        # Instructions.
-        mnemonic = pp.Word(
-            pp.alphas, pp.alphanums
-        ).setResultsName("mnemonic")
+        mnemonic = pp.Word(pp.alphas, pp.alphanums).setResultsName("mnemonic")
        operand = pp.Group(
            self.register
            | pp.Group(
-                offset_expression
-                | ptr_expression
-                | short_expression
-                | address_expression
+                offset_expression | ptr_expression | short_expression | address_expression
            ).setResultsName(self.memory_id)
            | immediate
        )
@@ -622,8 +582,11 @@ class ParserX86Intel(ParserX86):
            mnemonic=parse_result.mnemonic,
            operands=operands,
            label_id=None,
-            comment_id=" ".join(parse_result[self.comment_id])
-                       if self.comment_id in parse_result else None,
+            comment_id=(
+                " ".join(parse_result[self.comment_id])
+                if self.comment_id in parse_result
+                else None
+            ),
        )
        return return_dict
@@ -642,9 +605,7 @@ class ParserX86Intel(ParserX86):
    def parse_register(self, register_string):
        """Parse register string"""
        try:
-            return self.process_operand(
-                self.register.parseString(register_string, parseAll=True)
-            )
+            return self.process_operand(self.register.parseString(register_string, parseAll=True))
        except pp.ParseException:
            return None
@@ -668,10 +629,7 @@ class ParserX86Intel(ParserX86):
        # TODO: This is putting the identifier in the parameters.  No idea if it's right.
        parameters = [directive.identifier.name] if "identifier" in directive else []
        parameters.extend(directive.parameters)
-        directive_new = DirectiveOperand(
-            name=directive.name,
-            parameters=parameters or None
-        )
+        directive_new = DirectiveOperand(name=directive.name, parameters=parameters or None)
        # Interpret the "=" directives because the generated assembly is full of symbols that are
        # defined there.
        if directive.name == "=":
@@ -707,11 +665,14 @@ class ParserX86Intel(ParserX86):
        )
        displacement_op = (
            self.process_immediate(post_displacement.immediate)
-            if post_displacement else displacement_op
+            if post_displacement
+            else displacement_op
        )
        base_op = RegisterOperand(name=base.name) if base else None
        index_op = RegisterOperand(name=index.name) if index else None
-        new_memory = MemoryOperand(offset=displacement_op, base=base_op, index=index_op, scale=scale)
+        new_memory = MemoryOperand(
+            offset=displacement_op, base=base_op, index=index_op, scale=scale
+        )
        return new_memory
    def process_address_expression(self, address_expression, data_type=None):
@@ -719,19 +680,23 @@ class ParserX86Intel(ParserX86):
        # brackets, and an offset.  How all of this works together is somewhat mysterious.
        immediate_operand = (
            self.process_immediate(address_expression.immediate)
-            if "immediate" in address_expression else None
+            if "immediate" in address_expression
+            else None
        )
        register_expression = (
            self.process_register_expression(address_expression.register_expression)
-            if "register_expression" in address_expression else None
+            if "register_expression" in address_expression
+            else None
        )
        segment = (
            self.process_register(address_expression.segment)
-            if "segment" in address_expression else None
+            if "segment" in address_expression
+            else None
        )
        identifier = (
            self.process_identifier(address_expression.identifier)
-            if "identifier" in address_expression else None
+            if "identifier" in address_expression
+            else None
        )
        if register_expression:
            if immediate_operand:
@@ -756,7 +721,8 @@ class ParserX86Intel(ParserX86):
        # TODO: Record that this is an offset expression.
        displacement = (
            self.process_immediate(offset_expression.displacement)
-            if "displacement" in offset_expression else None
+            if "displacement" in offset_expression
+            else None
        )
        identifier = self.process_identifier(offset_expression.identifier)
        identifier.offset = displacement
@@ -765,8 +731,7 @@ class ParserX86Intel(ParserX86):
    def process_ptr_expression(self, ptr_expression):
        # TODO: Do something with the data_type.
        return self.process_address_expression(
-            ptr_expression.address_expression,
-            ptr_expression.data_type
+            ptr_expression.address_expression, ptr_expression.data_type
        )
    def process_short_expression(self, short_expression):
@@ -790,8 +755,10 @@ class ParserX86Intel(ParserX86):
        # Remove duplicated 'name' level due to identifier.  Note that there is no place to put the
        # comment, if any.
        label["name"] = label["name"]["name"]
-        return (LabelOperand(name=label.name),
-                self.make_instruction(label) if "mnemonic" in label else None)
+        return (
+            LabelOperand(name=label.name),
+            self.make_instruction(label) if "mnemonic" in label else None,
+        )
    def process_immediate(self, immediate):
        """Post-process immediate operand"""
@@ -806,8 +773,7 @@ class ParserX86Intel(ParserX86):
        if identifier.name in self._equ:
            # Actually an immediate, change declaration.
            new_immediate = ImmediateOperand(
-                identifier=identifier.name,
-                value=self._equ[identifier.name]
+                identifier=identifier.name, value=self._equ[identifier.name]
            )
            new_immediate.value = self.normalize_imd(new_immediate)
            return new_immediate
@@ -816,13 +782,13 @@ class ParserX86Intel(ParserX86):
    def normalize_imd(self, imd):
        """Normalize immediate to decimal based representation"""
        if isinstance(imd.value, str):
-            if '.' in imd.value:
+            if "." in imd.value:
                return float(imd.value)
            # Now parse depending on the base.
-            base = {'B': 2, 'O': 8, 'H': 16}.get(imd.value[-1], 10)
+            base = {"B": 2, "O": 8, "H": 16}.get(imd.value[-1], 10)
            value = 0
-            negative = imd.value[0] == '-'
+            negative = imd.value[0] == "-"
-            positive = imd.value[0] == '+'
+            positive = imd.value[0] == "+"
            start = +(negative or positive)
            stop = len(imd.value) if base == 10 else -1
            for c in imd.value[start:stop]:
--- a/osaca/semantics/arch_semantics.py
+++ b/osaca/semantics/arch_semantics.py
@@ -19,9 +19,7 @@ class ArchSemantics(ISASemantics):
    def normalize_instruction_form(self, instruction_form):
        self.parser.normalize_instruction_form(
-            instruction_form,
-            self.isa_model,
-            self._machine_model
+            instruction_form, self.isa_model, self._machine_model
        )
    def normalize_instruction_forms(self, instruction_forms):
--- a/osaca/semantics/hw_model.py
+++ b/osaca/semantics/hw_model.py
@@ -303,19 +303,16 @@ class MachineModel(object):
                        for instruction_form in name_matched_iforms
                        if len(instruction_form.operands) == arity
                    ),
-                    None
+                    None,
                )
            else:
                return next(
                    (
                        instruction_form
                        for instruction_form in name_matched_iforms
-                        if self._match_operands(
-                            instruction_form.operands,
-                            operands
-                        )
+                        if self._match_operands(instruction_form.operands, operands)
                    ),
-                    None
+                    None,
                )
        except TypeError as e:
            print("\nname: {}\noperands: {}".format(name, operands))
@@ -893,6 +890,7 @@ class MachineModel(object):
    def _is_x86_reg_type(self, i_reg, reg, consider_masking=False):
        from osaca.parser import ParserX86
        """Check if register type match."""
        if reg is None:
            if i_reg is None:
--- a/osaca/semantics/isa_semantics.py
+++ b/osaca/semantics/isa_semantics.py
@@ -83,7 +83,9 @@ class ISASemantics(object):
        if assign_default:
            # no irregular operand structure, apply default
            op_dict["source"] = self._parser.get_regular_source_operands(instruction_form)
-            op_dict["destination"] = self._parser.get_regular_destination_operands(instruction_form)
+            op_dict["destination"] = self._parser.get_regular_destination_operands(
+                instruction_form
+            )
            op_dict["src_dst"] = []
        # post-process pre- and post-indexing for aarch64 memory operands
        if self._parser.isa() == "aarch64":
--- a/osaca/semantics/marker_utils.py
+++ b/osaca/semantics/marker_utils.py
@@ -193,7 +193,9 @@ def match_line(parser, line, marker_line):
        and marker_line.directive
        and line.directive.name == marker_line.directive.name
    ):
-        return match_parameters(parser, line.directive.parameters, marker_line.directive.parameters)
+        return match_parameters(
+            parser, line.directive.parameters, marker_line.directive.parameters
+        )
    else:
        return Matching.No
@@ -203,8 +205,7 @@ def match_operands(line_operands, marker_line_operands):
        return False
    return all(
        match_operand(line_operand, marker_line_operand)
-        for line_operand, marker_line_operand in
-        zip(line_operands, marker_line_operands)
+        for line_operand, marker_line_operand in zip(line_operands, marker_line_operands)
    )
@@ -222,10 +223,10 @@ def match_operand(line_operand, marker_line_operand):
    ):
        return True
    if (
-            isinstance(line_operand, MemoryOperand)
+        isinstance(line_operand, MemoryOperand)
-            and isinstance(marker_line_operand, MemoryOperand)
+        and isinstance(marker_line_operand, MemoryOperand)
-            and match_operand(line_operand.base, marker_line_operand.base)
+        and match_operand(line_operand.base, marker_line_operand.base)
-            and match_operand(line_operand.offset, line_operand.offset)
+        and match_operand(line_operand.offset, line_operand.offset)
    ):
        return True
    return False