diff --git a/osaca/osaca.py b/osaca/osaca.py index 12677e8..1029e17 100644 --- a/osaca/osaca.py +++ b/osaca/osaca.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML from osaca.db_interface import import_benchmark_output, sanity_check from osaca.frontend import Frontend -from osaca.parser import BaseParser, ParserAArch64, ParserX86, ParserX86ATT, ParserX86Intel +from osaca.parser import BaseParser, ParserAArch64, ParserX86ATT, ParserX86Intel from osaca.semantics import ( INSTR_FLAGS, ArchSemantics, @@ -355,7 +355,7 @@ def inspect(args, output_file=sys.stdout): (arch, syntax) for arch in archs_to_try for syntax in syntaxes_to_try - if (syntax != None) == (MachineModel.get_isa_for_arch(arch) == "x86") + if (syntax is not None) == (MachineModel.get_isa_for_arch(arch) == "x86") ] # Parse file. diff --git a/osaca/parser/parser_x86att.py b/osaca/parser/parser_x86att.py index 216a428..5cbccb0 100644 --- a/osaca/parser/parser_x86att.py +++ b/osaca/parser/parser_x86att.py @@ -1,8 +1,5 @@ #!/usr/bin/env python3 -import string -import re - import pyparsing as pp from osaca.parser import ParserX86 @@ -34,11 +31,11 @@ class ParserX86ATT(ParserX86): InstructionForm( mnemonic="mov", operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")] - ), - InstructionForm( - mnemonic="movl", - operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")] - ) + ), + InstructionForm( + mnemonic="movl", + operands=[ImmediateOperand(value=111), RegisterOperand(name="ebx")] + ) ], InstructionForm( directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"]) @@ -51,11 +48,11 @@ class ParserX86ATT(ParserX86): InstructionForm( mnemonic="mov", operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")] - ), - InstructionForm( - mnemonic="movl", - operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")] - ) + ), + InstructionForm( + mnemonic="movl", + operands=[ImmediateOperand(value=222), RegisterOperand(name="ebx")] + ) ], InstructionForm( directive_id=DirectiveOperand(name="byte", parameters=["100", "103", "144"]) diff --git a/osaca/parser/parser_x86intel.py b/osaca/parser/parser_x86intel.py index df231d0..dc1d898 100644 --- a/osaca/parser/parser_x86intel.py +++ b/osaca/parser/parser_x86intel.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 import pyparsing as pp -import re -import string import unicodedata from osaca.parser import ParserX86 @@ -25,6 +23,7 @@ NON_ASCII_PRINTABLE_CHARACTERS = "".join( if unicodedata.category(chr(cp)) not in ("Cc", "Zl", "Zp", "Cs", "Cn") ) + # References: # ASM386 Assembly Language Reference, document number 469165-003, https://mirror.math.princeton.edu/pub/oldlinux/Linux.old/Ref-docs/asm-ref.pdf. # Microsoft Macro Assembler BNF Grammar, https://learn.microsoft.com/en-us/cpp/assembler/masm/masm-bnf-grammar?view=msvc-170. @@ -146,16 +145,18 @@ class ParserX86Intel(ParserX86): # A hack to help with comparison instruction: if the instruction is in the model, and has # exactly two sources, swap its operands. - if (model and - not has_destination and - len(instruction_form.operands) == 2 + if ( + model + and not has_destination + and len(instruction_form.operands) == 2 and not isa_model.get_instruction( mnemonic, instruction_form.operands ) and not arch_model.get_instruction( mnemonic, instruction_form.operands - )): + ) + ): instruction_form.operands.reverse() # If the instruction has a well-known data type, append a suffix. @@ -175,7 +176,6 @@ class ParserX86Intel(ParserX86): instruction_form.mnemonic = suffixed_mnemonic break - def construct_parser(self): """Create parser for x86 Intel ISA.""" # Numeric literal. @@ -353,12 +353,15 @@ class ParserX86Intel(ParserX86): (pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign") + integer_number | identifier ).setResultsName(self.immediate_id) - pre_displacement = pp.Group(integer_number + pp.Literal("+") + pre_displacement = pp.Group( + integer_number + pp.Literal("+") ).setResultsName(self.immediate_id) indexed = pp.Group( index_register.setResultsName("index") - + pp.Optional(pp.Literal("*") - + scale.setResultsName("scale")) + + pp.Optional( + pp.Literal("*") + + scale.setResultsName("scale") + ) ).setResultsName("indexed") register_expression = pp.Group( pp.Literal("[") @@ -370,7 +373,7 @@ class ParserX86Intel(ParserX86): + pp.Literal("+") + indexed).setResultsName("base_and_indexed") ^ indexed - ).setResultsName("non_displacement") + ).setResultsName("non_displacement") + pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement")) + pp.Literal("]") ).setResultsName("register_expression") @@ -472,7 +475,7 @@ class ParserX86Intel(ParserX86): pp.CaselessKeyword("ALIAS") | pp.CaselessKeyword("ALIGN") | pp.CaselessKeyword("ASSUME") - #| pp.CaselessKeyword("BYTE") + # | pp.CaselessKeyword("BYTE") | pp.CaselessKeyword("CATSTR") | pp.CaselessKeyword("COMM") | pp.CaselessKeyword("COMMENT") @@ -482,7 +485,7 @@ class ParserX86Intel(ParserX86): | pp.CaselessKeyword("DQ") | pp.CaselessKeyword("DT") | pp.CaselessKeyword("DW") - #| pp.CaselessKeyword("DWORD") + # | pp.CaselessKeyword("DWORD") | pp.CaselessKeyword("ECHO") | pp.CaselessKeyword("END") | pp.CaselessKeyword("ENDP") @@ -491,14 +494,14 @@ class ParserX86Intel(ParserX86): | pp.CaselessKeyword("EVEN") | pp.CaselessKeyword("EXTRN") | pp.CaselessKeyword("EXTERNDEF") - #| pp.CaselessKeyword("FWORD") + # | pp.CaselessKeyword("FWORD") | pp.CaselessKeyword("GROUP") | pp.CaselessKeyword("INCLUDE") | pp.CaselessKeyword("INCLUDELIB") | pp.CaselessKeyword("INSTR") | pp.CaselessKeyword("INVOKE") | pp.CaselessKeyword("LABEL") - #| pp.CaselessKeyword("MMWORD") + # | pp.CaselessKeyword("MMWORD") | pp.CaselessKeyword("OPTION") | pp.CaselessKeyword("ORG") | pp.CaselessKeyword("PAGE") @@ -507,27 +510,27 @@ class ParserX86Intel(ParserX86): | pp.CaselessKeyword("PROTO") | pp.CaselessKeyword("PUBLIC") | pp.CaselessKeyword("PUSHCONTEXT") - #| pp.CaselessKeyword("QWORD") - #| pp.CaselessKeyword("REAL10") - #| pp.CaselessKeyword("REAL4") - #| pp.CaselessKeyword("REAL8") + # | pp.CaselessKeyword("QWORD") + # | pp.CaselessKeyword("REAL10") + # | pp.CaselessKeyword("REAL4") + # | pp.CaselessKeyword("REAL8") | pp.CaselessKeyword("RECORD") - #| pp.CaselessKeyword("SBYTE") - #| pp.CaselessKeyword("SDWORD") + # | pp.CaselessKeyword("SBYTE") + # | pp.CaselessKeyword("SDWORD") | pp.CaselessKeyword("SEGMENT") | pp.CaselessKeyword("SIZESTR") | pp.CaselessKeyword("STRUCT") | pp.CaselessKeyword("SUBSTR") | pp.CaselessKeyword("SUBTITLE") - #| pp.CaselessKeyword("SWORD") - #| pp.CaselessKeyword("TBYTE") + # | pp.CaselessKeyword("SWORD") + # | pp.CaselessKeyword("TBYTE") | pp.CaselessKeyword("TEXTEQU") | pp.CaselessKeyword("TITLE") | pp.CaselessKeyword("TYPEDEF") | pp.CaselessKeyword("UNION") - #| pp.CaselessKeyword("WORD") - #| pp.CaselessKeyword("XMMWORD") - #| pp.CaselessKeyword("YMMWORD") + # | pp.CaselessKeyword("WORD") + # | pp.CaselessKeyword("XMMWORD") + # | pp.CaselessKeyword("YMMWORD") ) self.directive = pp.Group( pp.Optional(~directive_keywords + directive_identifier) diff --git a/osaca/semantics/arch_semantics.py b/osaca/semantics/arch_semantics.py index 5e485d2..f952cbb 100644 --- a/osaca/semantics/arch_semantics.py +++ b/osaca/semantics/arch_semantics.py @@ -1,7 +1,5 @@ #!/usr/bin/env python3 """Semantics opbject responsible for architecture specific semantic operations""" - -from dis import Instruction import sys import warnings from itertools import chain diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py index d8d48ac..7d3eb46 100644 --- a/osaca/semantics/kernel_dg.py +++ b/osaca/semantics/kernel_dg.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 import copy -import os -import signal import time from itertools import chain from multiprocessing import Manager, Process, cpu_count diff --git a/osaca/semantics/marker_utils.py b/osaca/semantics/marker_utils.py index 86c59c5..5b60ef9 100644 --- a/osaca/semantics/marker_utils.py +++ b/osaca/semantics/marker_utils.py @@ -1,11 +1,8 @@ #!/usr/bin/env python3 from collections import OrderedDict from enum import Enum -from functools import partial from osaca.parser import get_parser -from osaca.parser.instruction_form import InstructionForm -from osaca.parser.directive import DirectiveOperand from osaca.parser.identifier import IdentifierOperand from osaca.parser.immediate import ImmediateOperand from osaca.parser.memory import MemoryOperand @@ -13,6 +10,7 @@ from osaca.parser.register import RegisterOperand COMMENT_MARKER = {"start": "OSACA-BEGIN", "end": "OSACA-END"} + # State of marker matching. # No: we have determined that the code doesn't match the marker. # Partial: so far the code matches the marker, but we have not reached the end of the marker yet. @@ -173,6 +171,7 @@ def get_marker(isa, syntax="ATT", comment=""): return start_marker, end_marker + def match_line(parser, line, marker_line): """ Returns whether `line` matches `marker_line`. @@ -198,6 +197,7 @@ def match_line(parser, line, marker_line): else: return Matching.No + def match_operands(line_operands, marker_line_operands): if len(line_operands) != len(marker_line_operands): return False @@ -207,6 +207,7 @@ def match_operands(line_operands, marker_line_operands): zip(line_operands, marker_line_operands) ) + def match_operand(line_operand, marker_line_operand): if ( isinstance(line_operand, ImmediateOperand) @@ -221,14 +222,15 @@ def match_operand(line_operand, marker_line_operand): ): return True if ( - isinstance(line_operand, MemoryOperand) - and isinstance(marker_line_operand, MemoryOperand) - and match_operand(line_operand.base, marker_line_operand.base) - and match_operand(line_operand.offset, line_operand.offset) - ): + isinstance(line_operand, MemoryOperand) + and isinstance(marker_line_operand, MemoryOperand) + and match_operand(line_operand.base, marker_line_operand.base) + and match_operand(line_operand.offset, line_operand.offset) + ): return True return False + def match_parameters(parser, line_parameters, marker_line_parameters): """ Returns whether `line_parameters` matches `marker_line_parameters`. @@ -238,13 +240,10 @@ def match_parameters(parser, line_parameters, marker_line_parameters): :return: Matching. In case of partial match, `marker_line_parameters` is modified and should be reused for matching the next line in the parsed assembly code. """ - line_parameter_count = len(line_parameters) - marker_line_parameter_count = len(marker_line_parameters) - # The elements of `marker_line_parameters` are consumed as they are matched. for line_parameter in line_parameters: if not marker_line_parameters: - break; + break marker_line_parameter = marker_line_parameters[0] if not match_parameter(parser, line_parameter, marker_line_parameter): return Matching.No @@ -254,6 +253,7 @@ def match_parameters(parser, line_parameters, marker_line_parameters): else: return Matching.Full + def match_parameter(parser, line_parameter, marker_line_parameter): if line_parameter.lower() == marker_line_parameter.lower(): return True diff --git a/tests/test_parser_x86intel.py b/tests/test_parser_x86intel.py index c95a070..380efa8 100755 --- a/tests/test_parser_x86intel.py +++ b/tests/test_parser_x86intel.py @@ -6,8 +6,6 @@ Unit tests for x86 Intel assembly parser import os import unittest -from pyparsing import ParseException - from osaca.parser import ParserX86Intel, InstructionForm from osaca.parser.directive import DirectiveOperand from osaca.parser.identifier import IdentifierOperand @@ -134,13 +132,13 @@ class TestParserX86Intel(unittest.TestCase): self.assertEqual(parsed_4.mnemonic, "mov") self.assertEqual(parsed_4.operands[0], RegisterOperand(name="EAX")) - self.assertEqual(parsed_4.operands[1], - MemoryOperand(offset=ImmediateOperand( - identifier="cur_elements$", - value=104 - ), - base=RegisterOperand(name="RBP"))) - + self.assertEqual( + parsed_4.operands[1], + MemoryOperand( + offset=ImmediateOperand(identifier="cur_elements$", value=104), + base=RegisterOperand(name="RBP") + ) + ) self.assertEqual(parsed_5.mnemonic, "mov") self.assertEqual(parsed_5.operands[0], MemoryOperand(offset=ImmediateOperand(value=24), @@ -252,18 +250,21 @@ class TestParserX86Intel(unittest.TestCase): self.assertEqual(parsed[0].line_number, 1) # Check specifically that the values of the symbols defined by "=" were correctly # propagated. - self.assertEqual(parsed[69], - InstructionForm(mnemonic="mov", - operands=[MemoryOperand( - base=RegisterOperand("RBP"), - offset=ImmediateOperand( - value=4, - identifier="r$1" - ) - ), - ImmediateOperand(value=0)], - line="\tmov\tDWORD PTR r$1[rbp], 0", - line_number=73)) + self.assertEqual( + parsed[69], + InstructionForm( + mnemonic="mov", + operands=[ + MemoryOperand( + base=RegisterOperand("RBP"), + offset=ImmediateOperand(value=4, identifier="r$1") + ), + ImmediateOperand(value=0) + ], + line="\tmov\tDWORD PTR r$1[rbp], 0", + line_number=73 + ) + ) # Check a few lines to make sure that we produced something reasonable. self.assertEqual(parsed[60], InstructionForm(mnemonic="mov", diff --git a/tests/test_semantics.py b/tests/test_semantics.py index 46d38b4..b02e669 100755 --- a/tests/test_semantics.py +++ b/tests/test_semantics.py @@ -314,11 +314,11 @@ class TestSemanticTools(unittest.TestCase): def test_src_dst_assignment_x86_intel(self): for instruction_form in self.kernel_x86_intel: - with self.subTest(instruction_form=instruction_form): - if instruction_form.semantic_operands is not None: - self.assertTrue("source" in instruction_form.semantic_operands) - self.assertTrue("destination" in instruction_form.semantic_operands) - self.assertTrue("src_dst" in instruction_form.semantic_operands) + with self.subTest(instruction_form=instruction_form): + if instruction_form.semantic_operands is not None: + self.assertTrue("source" in instruction_form.semantic_operands) + self.assertTrue("destination" in instruction_form.semantic_operands) + self.assertTrue("src_dst" in instruction_form.semantic_operands) def test_src_dst_assignment_AArch64(self): for instruction_form in self.kernel_AArch64: