OSACA/osaca/parser/parser_x86att.py

#!/usr/bin/env python3

import pyparsing as pp

from .base_parser import BaseParser
from .attr_dict import AttrDict


class ParserX86ATT(BaseParser):
    def __init__(self):
        """Initialize the parser by delegating to the ``BaseParser`` constructor."""
        # NOTE(review): presumably the base constructor calls construct_parser() to build
        # the grammars used by parse_line() -- confirm in base_parser.py.
        super().__init__()

    def construct_parser(self):
        """
        Build the pyparsing grammars for x86 assembly lines.

        Sets ``self.comment``, ``self.label``, ``self.directive`` and
        ``self.instruction_parser`` on the instance; nothing is returned.
        """
        # Separator that may or may not be present between operands / address parts
        optional_comma = pp.Optional(pp.Suppress(pp.Literal(',')))

        # Comment: '#' followed by any number of printable words
        comment_words = pp.Group(pp.ZeroOrMore(pp.Word(pp.printables)))
        self.comment = pp.Literal('#') + comment_words.setResultsName(self.COMMENT_ID)
        trailing_comment = pp.Optional(self.comment)

        # Identifier: one letter, '_' or '.', optionally followed by alphanumerics, '_' or '.'
        leading_char = pp.Word(pp.alphas + '_.', exact=1)
        trailing_chars = pp.Word(pp.alphanums + '_.')
        identifier_name = pp.Combine(leading_char + pp.Optional(trailing_chars))
        identifier = pp.Group(identifier_name.setResultsName('name')).setResultsName('identifier')

        # Label: identifier followed by ':' and an optional comment
        label_body = identifier.setResultsName('name') + pp.Literal(':') + trailing_comment
        self.label = pp.Group(label_body).setResultsName(self.LABEL_ID)

        # Numbers (both stored under the results name 'value')
        minus_sign = pp.Optional(pp.Literal('-'))
        decimal_number = pp.Combine(minus_sign + pp.Word(pp.nums)).setResultsName('value')
        hex_prefix = pp.Literal('0x')
        hex_number = pp.Combine(hex_prefix + pp.Word(pp.hexnums)).setResultsName('value')

        # Directive: '.' + name + comma separated parameters + optional comment
        option_prefix = pp.Word('#@.', exact=1)
        option_body = pp.Word(pp.printables, excludeChars=',')
        directive_option = pp.Combine(option_prefix + option_body)
        directive_parameter = (
            pp.quotedString | directive_option | identifier | hex_number | decimal_number
        )
        parameter_list = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
        directive_name = pp.Word(pp.alphanums + '_').setResultsName('name')
        directive_body = (
            pp.Literal('.')
            + directive_name
            + parameter_list.setResultsName('parameters')
            + trailing_comment
        )
        self.directive = pp.Group(directive_body).setResultsName(self.DIRECTIVE_ID)

        ##############################
        # Instructions
        # Mnemonic, optionally preceded by any number of 'data16'/'data32' prefixes
        mnemonic_prefix = pp.Literal('data16') | pp.Literal('data32')
        mnemonic = pp.ZeroOrMore(mnemonic_prefix) + pp.Word(pp.alphanums).setResultsName(
            'mnemonic'
        )

        # Register: '%' + name, optionally followed by a mask of the form '{%name}'
        register_mask = (
            pp.Literal('{')
            + pp.Literal('%')
            + pp.Word(pp.alphanums).setResultsName('mask')
            + pp.Literal('}')
        )
        register_body = (
            pp.Literal('%')
            + pp.Word(pp.alphanums).setResultsName('name')
            + pp.Optional(register_mask)
        )
        register = pp.Group(register_body).setResultsName(self.REGISTER_ID)

        # Immediate: '$' followed by a hex number, a decimal number or an identifier
        immediate_value = hex_number | decimal_number | identifier
        immediate = pp.Group(pp.Literal('$') + immediate_value).setResultsName(self.IMMEDIATE_ID)

        # Memory: offset(base, index, scale)
        offset_value = identifier | hex_number | decimal_number
        offset = pp.Group(offset_value).setResultsName(self.IMMEDIATE_ID)
        scale = pp.Word('1248', exact=1)
        memory_body = (
            pp.Optional(offset.setResultsName('offset'))
            + pp.Literal('(')
            + pp.Optional(register.setResultsName('base'))
            + optional_comma
            + pp.Optional(register.setResultsName('index'))
            + optional_comma
            + pp.Optional(scale.setResultsName('scale'))
            + pp.Literal(')')
        )
        memory = pp.Group(memory_body).setResultsName(self.MEMORY_ID)

        # Combine to instruction form; only the first operand may be a bare identifier
        operand_first = pp.Group(register ^ immediate ^ memory ^ identifier)
        operand_rest = pp.Group(register ^ immediate ^ memory)
        self.instruction_parser = (
            mnemonic
            + pp.Optional(operand_first.setResultsName('operand1'))
            + optional_comma
            + pp.Optional(operand_rest.setResultsName('operand2'))
            + optional_comma
            + pp.Optional(operand_rest.setResultsName('operand3'))
            + trailing_comment
        )

    def parse_line(self, line, line_number=None):
        """
        Parse line and return instruction form.

        The line is matched, in this order, against the comment, label, directive and
        instruction grammars. The first grammar that accepts the whole line decides which
        fields of the returned form are filled; fields that do not apply stay ``None``.

        :param str line: line of assembly code
        :param int line_number: default None, identifier of instruction form
        :return: parsed instruction form
        :raises ValueError: if the line matches none of the grammars
        """
        instruction_form = AttrDict({
            self.INSTRUCTION_ID: None,
            self.OPERANDS_ID: None,
            self.DIRECTIVE_ID: None,
            self.COMMENT_ID: None,
            self.LABEL_ID: None,
            'line_number': line_number,
        })
        result = None

        # 1. Parse comment
        try:
            result = self._process_operand(self.comment.parseString(line, parseAll=True).asDict())
            result = AttrDict.convert_dict(result)
            instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
        except pp.ParseException:
            # not a pure comment line, try the next grammar
            pass

        # 2. Parse label
        if result is None:
            try:
                result = self._process_operand(
                    self.label.parseString(line, parseAll=True).asDict()
                )
                result = AttrDict.convert_dict(result)
                instruction_form[self.LABEL_ID] = result[self.LABEL_ID]['name']
                if self.COMMENT_ID in result[self.LABEL_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.LABEL_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                # not a label, try the next grammar
                pass

        # 3. Parse directive
        if result is None:
            try:
                result = self._process_operand(
                    self.directive.parseString(line, parseAll=True).asDict()
                )
                result = AttrDict.convert_dict(result)
                instruction_form[self.DIRECTIVE_ID] = {
                    'name': result[self.DIRECTIVE_ID]['name'],
                    'parameters': result[self.DIRECTIVE_ID]['parameters'],
                }
                if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.DIRECTIVE_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                # not a directive, fall back to the instruction grammar
                pass

        # 4. Parse instruction
        if result is None:
            try:
                result = self.parse_instruction(line)
            except pp.ParseException as err:
                # No grammar matched. Fail with a descriptive error here: continuing would
                # subscript ``result`` while it is still None and raise an unrelated
                # TypeError.
                raise ValueError(
                    'Could not parse instruction on line {}: {!r}'.format(line_number, line)
                ) from err
            instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
            instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
            instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]

        return instruction_form

    def parse_instruction(self, instruction):
        """
        Parse a single instruction line into mnemonic, operands and comment.

        Operands are assigned from right to left: the right-most operand present becomes
        the destination, every operand to its left becomes a source (kept in their
        original left-to-right order).

        :param str instruction: line of assembly code holding an instruction
        :return: dict with the mnemonic, the source/destination operands and the comment
                 (``None`` if the line has no comment)
        :raises pyparsing.ParseException: if the line does not match the instruction grammar
        """
        parsed = AttrDict.convert_dict(
            self.instruction_parser.parseString(instruction, parseAll=True).asDict()
        )
        operands = AttrDict({'source': [], 'destination': []})
        # Walk the operands from right to left
        for key in ('operand3', 'operand2', 'operand1'):
            if key not in parsed:
                continue
            if operands['destination']:
                # a destination was already found further right, so this one is a source
                operands['source'].insert(0, self._process_operand(parsed[key]))
            else:
                operands['destination'].append(self._process_operand(parsed[key]))
        if self.COMMENT_ID in parsed:
            comment = ' '.join(parsed[self.COMMENT_ID])
        else:
            comment = None
        return AttrDict({
            self.INSTRUCTION_ID: parsed['mnemonic'],
            self.OPERANDS_ID: operands,
            self.COMMENT_ID: comment,
        })

    def _process_operand(self, operand):
        # For the moment, only used to structure memory addresses
        if self.MEMORY_ID in operand:
            return self.substitute_memory_address(operand[self.MEMORY_ID])
        if self.IMMEDIATE_ID in operand:
            return self.substitue_immediate(operand[self.IMMEDIATE_ID])
        if self.LABEL_ID in operand:
            return self.substitute_label(operand[self.LABEL_ID])
        return operand

    def substitute_memory_address(self, memory_address):
        """
        Reduce a parsed memory address to its four components.

        Only ``offset``, ``base``, ``index`` and ``scale`` are kept; missing components
        default to ``None``, except ``scale`` which defaults to ``'1'``.

        :param dict memory_address: parsed memory address
        :return: dict holding the cleaned-up address under the memory key
        """
        # Component name -> value used when the component was not parsed
        defaults = (('offset', None), ('base', None), ('index', None), ('scale', '1'))
        components = {
            key: memory_address[key] if key in memory_address else fallback
            for key, fallback in defaults
        }
        cleaned = AttrDict(components)
        return AttrDict({self.MEMORY_ID: cleaned})

    def substitute_label(self, label):
        """
        Flatten the nested ``name`` entry of a parsed label.

        ``label`` is modified in place: ``label['name']`` is replaced by
        ``label['name']['name']``.

        :param dict label: parsed label
        :return: dict holding the flattened label under the label key
        """
        # The identifier grammar adds an extra 'name' level; unwrap it
        flat_name = label['name']['name']
        label['name'] = flat_name
        wrapped = {self.LABEL_ID: label}
        return AttrDict(wrapped)

    def substitue_immediate(self, immediate):
        if 'identifier' in immediate:
            # actually an identifier, change declaration
            return immediate
        # otherwise nothing to do
        return AttrDict({self.IMMEDIATE_ID: immediate})

    def get_full_reg_name(self, register):
        # nothing to do
        return register['name']

    def normalize_imd(self, imd):
        if 'value' in imd:
            if imd['value'].lower().startswith('0x'):
                # hex, return decimal
                return int(imd['value'], 16)
            return int(imd['value'], 10)
        # identifier
        return imd