#!/usr/bin/env python3
import string
import re

import pyparsing as pp

from osaca.parser import BaseParser
from osaca.parser.instruction_form import instructionForm
from osaca.parser.operand import Operand
from osaca.parser.directive import DirectiveOperand
from osaca.parser.memory import MemoryOperand
from osaca.parser.label import LabelOperand
from osaca.parser.register import RegisterOperand
from osaca.parser.identifier import IdentifierOperand
from osaca.parser.immediate import ImmediateOperand
from osaca.parser.operand import Operand


class ParserX86ATT(BaseParser):
    """Parser for x86 assembly written in AT&T syntax.

    A line is classified as a comment, label, directive or instruction and
    turned into an ``instructionForm``; operands are post-processed into the
    operand classes imported at the top of this module.
    """

    _instance = None

    # Singleton pattern, as this is created very many times
    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Initialise the base parser and mark the ISA as ``"x86"``."""
        super().__init__()
        self.isa = "x86"

    def construct_parser(self):
        """Create parser for x86 AT&T ISA.

        Builds the pyparsing grammars and stores them on the instance as
        ``self.comment``, ``self.label``, ``self.register``,
        ``self.directive`` and ``self.instruction_parser``.
        """
        decimal_number = pp.Combine(
            pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)
        ).setResultsName("value")
        hex_number = pp.Combine(
            pp.Optional(pp.Literal("-")) + pp.Literal("0x") + pp.Word(pp.hexnums)
        ).setResultsName("value")
        # Comment - either '#' or '//' (icc)
        self.comment = (pp.Literal("#") | pp.Literal("//")) + pp.Group(
            pp.ZeroOrMore(pp.Word(pp.printables))
        ).setResultsName(self.comment_id)
        # Define x86 assembly identifier
        relocation = pp.Combine(pp.Literal("@") + pp.Word(pp.alphas))
        id_offset = pp.Word(pp.nums) + pp.Suppress(pp.Literal("+"))
        first = pp.Word(pp.alphas + "-_.", exact=1)
        rest = pp.Word(pp.alphanums + "$_.+-")
        identifier = pp.Group(
            pp.Optional(id_offset).setResultsName("offset")
            + pp.Combine(
                pp.delimitedList(pp.Combine(first + pp.Optional(rest)), delim="::"),
                joinString="::",
            ).setResultsName("name")
            + pp.Optional(relocation).setResultsName("relocation")
        ).setResultsName("identifier")
        # Label
        label_rest = pp.Word(pp.alphanums + "$_.+-()")
        label_identifier = pp.Group(
            pp.Optional(id_offset).setResultsName("offset")
            + pp.Combine(
                pp.delimitedList(pp.Combine(first + pp.Optional(label_rest)), delim="::"),
                joinString="::",
            ).setResultsName("name")
            + pp.Optional(relocation).setResultsName("relocation")
        ).setResultsName("identifier")
        numeric_identifier = pp.Group(
            pp.Word(pp.nums).setResultsName("name")
            + pp.Optional(pp.oneOf("b f", caseless=True).setResultsName("suffix"))
        ).setResultsName("identifier")
        self.label = pp.Group(
            (label_identifier | numeric_identifier).setResultsName("name")
            + pp.Literal(":")
            + pp.Optional(self.comment)
        ).setResultsName(self.label_id)
        # Register: pp.Regex('^%[0-9a-zA-Z]+{}{z},?')
        self.register = pp.Group(
            pp.Literal("%")
            + pp.Word(pp.alphanums).setResultsName("name")
            + pp.Optional(pp.Literal("(") + pp.Word(pp.nums) + pp.Literal(")"))
            + pp.Optional(
                pp.Literal("{")
                + pp.Optional(pp.Suppress(pp.Literal("%")))
                + pp.Word(pp.alphanums).setResultsName("mask")
                + pp.Literal("}")
                + pp.Optional(
                    pp.Suppress(pp.Literal("{"))
                    + pp.Literal("z").setResultsName("zeroing")
                    + pp.Suppress(pp.Literal("}"))
                )
            )
        ).setResultsName(self.REGISTER_ID)
        # Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
        symbol_immediate = "$"
        immediate = pp.Group(
            pp.Literal(symbol_immediate) + (hex_number | decimal_number | identifier)
        ).setResultsName(self.IMMEDIATE_ID)

        # Memory preparations
        offset = pp.Group(hex_number | decimal_number | identifier).setResultsName(
            self.IMMEDIATE_ID
        )
        scale = pp.Word("1248", exact=1)
        # Segment register extension
        segment_extension = (
            hex_number
            ^ pp.Word(pp.nums)
            ^ pp.Group(
                pp.Optional(offset.setResultsName("offset"))
                + pp.Literal("(")
                + pp.Optional(self.register.setResultsName("base"))
                + pp.Optional(pp.Suppress(pp.Literal(",")))
                + pp.Optional(self.register.setResultsName("index"))
                + pp.Optional(pp.Suppress(pp.Literal(",")))
                + pp.Optional(scale.setResultsName("scale"))
                + pp.Literal(")")
            )
        )
        memory_segmentation = (
            pp.Optional(pp.Suppress(pp.Literal("*")))
            + self.register.setResultsName("base")
            + pp.Literal(":")
            + segment_extension.setResultsName(self.segment_ext_id)
        )
        # Memory: offset | seg:seg_ext | offset(base, index, scale){mask}
        memory_abs = pp.Suppress(pp.Literal("*")) + (offset | self.register).setResultsName(
            "offset"
        )
        memory = pp.Group(
            (
                pp.Optional(pp.Suppress(pp.Literal("*")))
                + pp.Optional(offset.setResultsName("offset"))
                + pp.Literal("(")
                + pp.Optional(self.register.setResultsName("base"))
                + pp.Optional(pp.Suppress(pp.Literal(",")))
                + pp.Optional(self.register.setResultsName("index"))
                + pp.Optional(pp.Suppress(pp.Literal(",")))
                + pp.Optional(scale.setResultsName("scale"))
                + pp.Literal(")")
                + pp.Optional(
                    pp.Literal("{")
                    + pp.Optional(pp.Suppress(pp.Literal("%")))
                    + pp.Word(pp.alphanums).setResultsName("mask")
                    + pp.Literal("}")
                )
            )
            | memory_abs
            | memory_segmentation
            | (hex_number | pp.Word(pp.nums)).setResultsName("offset")
        ).setResultsName(self.MEMORY_ID)

        # Directive
        # parameter can be any quoted string or sequence of characters besides '#' (for comments)
        # or ',' (parameter delimiter)
        directive_parameter = (
            pp.quotedString
            ^ (
                pp.Word(pp.printables, excludeChars=",#")
                + pp.Optional(pp.Suppress(pp.Literal(",")))
            )
            ^ pp.Suppress(pp.Literal(","))
        )
        self.directive = pp.Group(
            pp.Literal(".")
            + pp.Word(pp.alphanums + "_").setResultsName("name")
            + pp.ZeroOrMore(directive_parameter).setResultsName("parameters")
            + pp.Optional(self.comment)
        ).setResultsName(self.directive_id)

        # Instructions
        # Mnemonic
        mnemonic = pp.ZeroOrMore(pp.Literal("data16") | pp.Literal("data32")) + pp.Word(
            pp.alphanums + ","
        ).setResultsName("mnemonic")
        # Combine to instruction form
        operand_first = pp.Group(
            self.register ^ immediate ^ memory ^ identifier ^ numeric_identifier
        )
        operand_rest = pp.Group(self.register ^ immediate ^ memory)
        self.instruction_parser = (
            mnemonic
            + pp.Optional(operand_first.setResultsName("operand1"))
            + pp.Optional(pp.Suppress(pp.Literal(",")))
            + pp.Optional(operand_rest.setResultsName("operand2"))
            + pp.Optional(pp.Suppress(pp.Literal(",")))
            + pp.Optional(operand_rest.setResultsName("operand3"))
            + pp.Optional(pp.Suppress(pp.Literal(",")))
            + pp.Optional(operand_rest.setResultsName("operand4"))
            + pp.Optional(self.comment)
        )

    def parse_register(self, register_string):
        """Parse a register string.

        :param str register_string: text that should consist of one register only
        :returns: the post-processed register operand, or ``None`` if the
            string does not match the register grammar
        """
        try:
            return self.process_operand(
                self.register.parseString(register_string, parseAll=True).asDict()
            )
        except pp.ParseException:
            return None

    def parse_line(self, line, line_number=None):
        """
        Parse line and return instruction form.

        The line is tried, in order, as a comment, a label, a directive and
        finally an instruction; the first grammar that matches wins.

        :param str line: line of assembly code
        :param line_number: default None, identifier of instruction form
        :type line_number: int, optional
        :return: ``instructionForm`` -- parsed asm line (comment, label, directive or
            instruction form)
        :raises ValueError: if the line matches none of the four grammars
        """
        instruction_form = instructionForm(line=line, line_number=line_number)
        result = None

        # 1. Parse comment
        try:
            result = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
            instruction_form.comment = " ".join(result[self.comment_id])
        except pp.ParseException:
            pass

        # 2. Parse label
        if result is None:
            try:
                result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
                instruction_form.label = result.name
                if result.comment is not None:
                    instruction_form.comment = " ".join(result.comment)
            except pp.ParseException:
                pass

        # 3. Parse directive
        if result is None:
            try:
                result = self.process_operand(
                    self.directive.parseString(line, parseAll=True).asDict()
                )
                instruction_form.directive = DirectiveOperand(
                    name_id=result.name,
                    parameter_id=result.parameters,
                )
                if result.comment is not None:
                    instruction_form.comment = " ".join(result.comment)
            except pp.ParseException:
                pass

        # 4. Parse instruction
        if result is None:
            try:
                result = self.parse_instruction(line)
            except pp.ParseException as err:
                # Keep the pyparsing failure attached as the cause for debugging.
                raise ValueError(
                    "Could not parse instruction on line {}: {!r}".format(line_number, line)
                ) from err
            instruction_form.instruction = result.instruction
            instruction_form.operands = result.operands
            instruction_form.comment = result.comment

        return instruction_form

    def parse_instruction(self, instruction):
        """
        Parse instruction in asm line.

        :param str instruction: Assembly line string.
        :returns: ``instructionForm`` -- parsed instruction form
        :raises pyparsing.ParseException: if the line is not a valid instruction
        """
        result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
        # Collect the operands that are present, preserving their order
        operands = [
            self.process_operand(result[key])
            for key in ("operand1", "operand2", "operand3", "operand4")
            if key in result
        ]
        return_dict = instructionForm(
            # Only the part of the mnemonic before the first ',' is kept
            instruction_id=result["mnemonic"].split(",")[0],
            operands_id=operands,
            comment_id=" ".join(result[self.comment_id]) if self.comment_id in result else None,
        )
        return return_dict

    def process_operand(self, operand):
        """Post-process operand.

        Dispatches on the result key found in ``operand`` (memory, immediate,
        label, directive, register, identifier) and returns the matching
        operand object; anything else is returned unchanged.
        """
        if self.MEMORY_ID in operand:
            return self.process_memory_address(operand[self.MEMORY_ID])
        if self.IMMEDIATE_ID in operand:
            return self.process_immediate(operand[self.IMMEDIATE_ID])
        if self.label_id in operand:
            return self.process_label(operand[self.label_id])
        if self.directive_id in operand:
            return self.process_directive(operand[self.directive_id])
        if self.REGISTER_ID in operand:
            # Index with the same key that was just tested for membership
            register = operand[self.REGISTER_ID]
            return RegisterOperand(
                prefix_id=register["prefix"] if "prefix" in register else None,
                name_id=register["name"],
                shape=register["shape"] if "shape" in register else None,
                lanes=register["lanes"] if "lanes" in register else None,
                index=register["index"] if "index" in register else None,
                predication=register["predication"] if "predication" in register else None,
            )
        if self.IDENTIFIER_ID in operand:
            return IdentifierOperand(name=operand[self.IDENTIFIER_ID]["name"])
        return operand

    def process_directive(self, directive):
        """Build a ``DirectiveOperand`` from a parsed directive dictionary.

        ``parameters`` and ``comment`` are copied over only when present.
        """
        directive_new = DirectiveOperand(name_id=directive["name"], parameter_id=[])
        if "parameters" in directive:
            directive_new.parameters = directive["parameters"]
        if "comment" in directive:
            directive_new.comment = directive["comment"]
        return directive_new

    def process_memory_address(self, memory_address):
        """Post-process memory address operand.

        :param dict memory_address: parsed memory operand
        :returns: ``MemoryOperand`` built from offset, base, index and scale
            (scale defaults to 1); the segment extension is attached if present
        """
        # Remove unnecessarily created dictionary entries during memory address parsing
        offset = memory_address.get("offset", None)
        base = memory_address.get("base", None)
        baseOp = None
        indexOp = None
        index = memory_address.get("index", None)
        scale = 1 if "scale" not in memory_address else int(memory_address["scale"], 0)
        if isinstance(offset, str) and base is None and index is None:
            try:
                offset = ImmediateOperand(value_id=int(offset, 0))
            except ValueError:
                # Not a number: keep the raw string as the immediate's value
                offset = ImmediateOperand(value_id=offset)
        elif isinstance(offset, dict) and "value" in offset:
            # isinstance guard: on a str, `in` would be a substring test and
            # the subscript below would raise TypeError
            offset = ImmediateOperand(value_id=int(offset["value"], 0))
        if base is not None:
            baseOp = RegisterOperand(
                name_id=base["name"], prefix_id=base["prefix"] if "prefix" in base else None
            )
        if index is not None:
            indexOp = RegisterOperand(
                name_id=index["name"], prefix_id=index["prefix"] if "prefix" in index else None
            )
        if isinstance(offset, dict) and "identifier" in offset:
            offset = IdentifierOperand(name=offset["identifier"]["name"])
        new_dict = MemoryOperand(
            offset_ID=offset, base_id=baseOp, index_id=indexOp, scale_id=scale
        )
        # Add segmentation extension if existing
        if self.segment_ext_id in memory_address:
            new_dict.segment_ext_id = memory_address[self.segment_ext_id]
        return new_dict

    def process_label(self, label):
        """Post-process label asm line.

        Note: ``label["name"]`` is flattened in place on the passed dictionary.
        """
        # remove duplicated 'name' level due to identifier
        label["name"] = label["name"][0]["name"]
        new_label = LabelOperand(
            name_id=label["name"], comment_id=label["comment"] if "comment" in label else None
        )
        return new_label

    def process_immediate(self, immediate):
        """Post-process immediate operand.

        :returns: the input unchanged if it holds an identifier, otherwise an
            ``ImmediateOperand`` whose value is parsed with ``int(value, 0)``
        """
        if "identifier" in immediate:
            # actually an identifier, change declaration
            return immediate
        # otherwise just make sure the immediate is a decimal
        new_immediate = ImmediateOperand(value_id=int(immediate["value"], 0))
        return new_immediate

    def get_full_reg_name(self, register):
        """Return one register name string including all attributes"""
        # nothing to do
        return register.name

    def normalize_imd(self, imd):
        """Normalize immediate to decimal based representation.

        :returns: ``imd`` itself for identifiers or when it has no value; the
            value converted with ``int(value, 0)`` if it is a string; the
            value unchanged otherwise
        """
        if isinstance(imd, IdentifierOperand):
            return imd
        if imd.value is not None:
            if isinstance(imd.value, str):
                # return decimal
                return int(imd.value, 0)
            return imd.value
        # identifier
        return imd

    def is_flag_dependend_of(self, flag_a, flag_b):
        """Check if ``flag_a`` is dependent on ``flag_b``"""
        # we assume flags are independent of each other, e.g., CF can be read while ZF gets written
        # TODO validate this assumption
        if flag_a.name == flag_b.name:
            return True
        return False

    def is_reg_dependend_of(self, reg_a, reg_b):
        """Check if ``reg_a`` is dependent on ``reg_b``"""
        reg_a_name = reg_a.name.upper()
        reg_b_name = reg_b.name.upper()

        # Check if they are the same registers
        if reg_a_name == reg_b_name:
            return True
        # Check vector registers first
        if self.is_vector_register(reg_a):
            if self.is_vector_register(reg_b):
                # Compare the names with their first character dropped
                if reg_a_name[1:] == reg_b_name[1:]:
                    # Registers in the same vector space
                    return True
            return False
        # Check basic GPRs
        gpr_groups = {
            "A": ["RAX", "EAX", "AX", "AH", "AL"],
            "B": ["RBX", "EBX", "BX", "BH", "BL"],
            "C": ["RCX", "ECX", "CX", "CH", "CL"],
            "D": ["RDX", "EDX", "DX", "DH", "DL"],
            "SP": ["RSP", "ESP", "SP", "SPL"],
            "SRC": ["RSI", "ESI", "SI", "SIL"],
            "DST": ["RDI", "EDI", "DI", "DIL"],
        }
        if self.is_basic_gpr(reg_a):
            if self.is_basic_gpr(reg_b):
                for dep_group in gpr_groups.values():
                    if reg_a_name in dep_group:
                        if reg_b_name in dep_group:
                            return True
            return False

        # Check other GPRs
        ma = re.match(r"R([0-9]+)[DWB]?", reg_a_name)
        mb = re.match(r"R([0-9]+)[DWB]?", reg_b_name)
        if ma and mb and ma.group(1) == mb.group(1):
            return True

        # No dependencies
        return False

    def is_basic_gpr(self, register):
        """Check if register is a basic general purpose register (ebx, rax, ...)"""
        # Names with a digit, or starting with a vector prefix, are excluded
        if any(char.isdigit() for char in register.name) or any(
            register.name.lower().startswith(x) for x in ["mm", "xmm", "ymm", "zmm"]
        ):
            return False
        return True

    def is_gpr(self, register):
        """Check if register is a general purpose register.

        :returns: ``bool`` -- ``False`` for ``None``
        """
        if register is None:
            return False
        if self.is_basic_gpr(register):
            return True
        # Return a real bool instead of leaking the match object
        return re.match(r"R([0-9]+)[DWB]?", register.name, re.IGNORECASE) is not None

    def is_vector_register(self, register):
        """Check if register is a vector register"""
        if register is None:
            return False
        # Strip trailing digits and compare the remaining prefix
        if register.name.rstrip(string.digits).lower() in [
            "mm",
            "xmm",
            "ymm",
            "zmm",
        ]:
            return True
        return False

    def get_reg_type(self, register):
        """Get register type.

        :returns: ``False`` for ``None``, ``"gpr"`` for general purpose
            registers, otherwise the lower-case vector prefix
        :raises ValueError: if the register is neither a GPR nor a vector register
        """
        if register is None:
            return False
        if self.is_gpr(register):
            return "gpr"
        elif self.is_vector_register(register):
            return register.name.rstrip(string.digits).lower()
        raise ValueError("Unknown register type for register {!r}".format(register.name))