Files
OSACA/osaca/parser/parser_x86att.py
2019-06-04 12:55:32 +02:00

245 lines
9.9 KiB
Python
Executable File

#!/usr/bin/env python3
import pyparsing as pp
from .base_parser import BaseParser
from .attr_dict import AttrDict
class ParserX86ATT(BaseParser):
    """Line-based parser for x86 assembly written in AT&T syntax.

    The grammar is built with pyparsing in :meth:`construct_parser`.
    :meth:`parse_line` classifies a single line as a comment, a label, a
    directive or an instruction and returns the result as an ``AttrDict``.
    """

    def __init__(self):
        # NOTE(review): construct_parser() is presumably invoked by
        # BaseParser.__init__ -- confirm against base_parser.py.
        super().__init__()

    def construct_parser(self):
        """Build the pyparsing grammars used by this parser.

        Sets ``self.comment``, ``self.label``, ``self.directive`` and
        ``self.instruction_parser``.
        """
        # Comment
        symbol_comment = '#'
        self.comment = pp.Literal(symbol_comment) + pp.Group(
            pp.ZeroOrMore(pp.Word(pp.printables))
        ).setResultsName(self.COMMENT_ID)
        # Define x86 assembly identifier
        first = pp.Word(pp.alphas + '_.', exact=1)
        rest = pp.Word(pp.alphanums + '_.')
        identifier = pp.Group(
            pp.Combine(first + pp.Optional(rest)).setResultsName('name')
        ).setResultsName('identifier')
        # Label
        self.label = pp.Group(
            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
        ).setResultsName(self.LABEL_ID)
        # Directive
        decimal_number = pp.Combine(
            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
        ).setResultsName('value')
        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
        directive_option = pp.Combine(
            pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
        )
        directive_parameter = (
            pp.quotedString | directive_option | identifier | hex_number | decimal_number
        )
        commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
        self.directive = pp.Group(
            pp.Literal('.')
            + pp.Word(pp.alphanums + '_').setResultsName('name')
            + commaSeparatedList.setResultsName('parameters')
            + pp.Optional(self.comment)
        ).setResultsName(self.DIRECTIVE_ID)

        ##############################
        # Instructions
        # Mnemonic (optionally preceded by 'data16'/'data32' prefixes)
        mnemonic = pp.ZeroOrMore(pp.Literal('data16') | pp.Literal('data32')) + pp.Word(
            pp.alphanums
        ).setResultsName('mnemonic')
        # Register: pp.Regex('^%[0-9a-zA-Z]+,?')
        register = pp.Group(
            pp.Literal('%')
            + pp.Word(pp.alphanums).setResultsName('name')
            + pp.Optional(
                pp.Literal('{')
                + pp.Literal('%')
                + pp.Word(pp.alphanums).setResultsName('mask')
                + pp.Literal('}')
            )
        ).setResultsName(self.REGISTER_ID)
        # Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
        symbol_immediate = '$'
        immediate = pp.Group(
            pp.Literal(symbol_immediate) + (hex_number | decimal_number | identifier)
        ).setResultsName(self.IMMEDIATE_ID)
        # Memory: offset(base, index, scale)
        offset = pp.Group(identifier | hex_number | decimal_number).setResultsName(
            self.IMMEDIATE_ID
        )
        scale = pp.Word('1248', exact=1)
        memory = pp.Group(
            pp.Optional(offset.setResultsName('offset'))
            + pp.Literal('(')
            + pp.Optional(register.setResultsName('base'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(register.setResultsName('index'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(scale.setResultsName('scale'))
            + pp.Literal(')')
        ).setResultsName(self.MEMORY_ID)

        # Combine to instruction form; only the first operand may be a bare
        # identifier.
        operand_first = pp.Group(register ^ immediate ^ memory ^ identifier)
        operand_rest = pp.Group(register ^ immediate ^ memory)
        self.instruction_parser = (
            mnemonic
            + pp.Optional(operand_first.setResultsName('operand1'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(operand_rest.setResultsName('operand2'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(operand_rest.setResultsName('operand3'))
            + pp.Optional(self.comment)
        )

    def parse_line(self, line, line_number=None):
        """
        Parse line and return instruction form.

        The line is tried, in order, as a comment, a label, a directive and
        finally as an instruction; the first grammar that matches the whole
        line wins. If the line cannot be parsed as an instruction either, a
        diagnostic is printed and the returned form has every field except
        ``'line_number'`` left as ``None``.

        :param str line: line of assembly code
        :param int line_number: default None, identifier of instruction form
        :return: parsed instruction form
        """
        instruction_form = AttrDict({
            self.INSTRUCTION_ID: None,
            self.OPERANDS_ID: None,
            self.DIRECTIVE_ID: None,
            self.COMMENT_ID: None,
            self.LABEL_ID: None,
            'line_number': line_number,
        })
        result = None

        # 1. Parse comment
        try:
            result = self._process_operand(self.comment.parseString(line, parseAll=True).asDict())
            result = AttrDict.convert_dict(result)
            instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
        except pp.ParseException:
            pass

        # 2. Parse label
        if result is None:
            try:
                result = self._process_operand(
                    self.label.parseString(line, parseAll=True).asDict()
                )
                result = AttrDict.convert_dict(result)
                instruction_form[self.LABEL_ID] = result[self.LABEL_ID]['name']
                if self.COMMENT_ID in result[self.LABEL_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.LABEL_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                pass

        # 3. Parse directive
        if result is None:
            try:
                result = self._process_operand(
                    self.directive.parseString(line, parseAll=True).asDict()
                )
                result = AttrDict.convert_dict(result)
                instruction_form[self.DIRECTIVE_ID] = {
                    'name': result[self.DIRECTIVE_ID]['name'],
                    'parameters': result[self.DIRECTIVE_ID]['parameters'],
                }
                if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.DIRECTIVE_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                pass

        # 4. Parse instruction
        if result is None:
            try:
                result = self.parse_instruction(line)
            except pp.ParseException:
                # Report the offending line. ``result`` is still None here, so
                # the instruction fields must not be read from it.
                print(
                    '\n\n*-*-*-*-*-*-*-*-*-*-\n{}: {}\n*-*-*-*-*-*-*-*-*-*-\n\n'.format(
                        line_number, line
                    )
                )
            else:
                instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
                instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
                instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]

        return instruction_form

    def parse_instruction(self, instruction):
        """
        Parse a single instruction line into mnemonic, operands and comment.

        The last operand present is stored as the destination, all operands
        before it as sources (in their written order).

        :param str instruction: line of assembly code holding an instruction
        :return: ``AttrDict`` with the mnemonic, the operands (``'source'`` and
                 ``'destination'`` lists) and the comment (``None`` if absent)
        :raises pyparsing.ParseException: if the line does not match the
                 instruction grammar
        """
        result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
        result = AttrDict.convert_dict(result)
        operands = AttrDict({'source': [], 'destination': []})

        # Check from right to left: the right-most operand found becomes the
        # destination, everything left of it is prepended to the sources.
        # Check third operand
        if 'operand3' in result:
            operands['destination'].append(self._process_operand(result['operand3']))
        # Check second operand
        if 'operand2' in result:
            if operands['destination']:
                operands['source'].insert(0, self._process_operand(result['operand2']))
            else:
                operands['destination'].append(self._process_operand(result['operand2']))
        # Check first operand
        if 'operand1' in result:
            if operands['destination']:
                operands['source'].insert(0, self._process_operand(result['operand1']))
            else:
                operands['destination'].append(self._process_operand(result['operand1']))

        return_dict = AttrDict({
            self.INSTRUCTION_ID: result['mnemonic'],
            self.OPERANDS_ID: operands,
            self.COMMENT_ID: ' '.join(result[self.COMMENT_ID])
            if self.COMMENT_ID in result
            else None,
        })
        return return_dict

    def _process_operand(self, operand):
        """
        Post-process a parsed operand (or top-level parse result).

        Memory, immediate and label entries are passed to their respective
        ``substitute_*`` method; anything else is returned unchanged.

        :param dict operand: parse result to post-process
        :return: the restructured entry, or ``operand`` itself
        """
        # For the moment, only used to structure memory addresses
        if self.MEMORY_ID in operand:
            return self.substitute_memory_address(operand[self.MEMORY_ID])
        if self.IMMEDIATE_ID in operand:
            return self.substitue_immediate(operand[self.IMMEDIATE_ID])
        if self.LABEL_ID in operand:
            return self.substitute_label(operand[self.LABEL_ID])
        return operand

    def substitute_memory_address(self, memory_address):
        """
        Normalise a parsed memory address to a fixed set of keys.

        Missing ``'offset'``, ``'base'`` and ``'index'`` become ``None``; a
        missing ``'scale'`` becomes the string ``'1'``.

        :param dict memory_address: parsed memory operand
        :return: ``AttrDict`` mapping the memory ID to the normalised address
        """
        # Remove unnecessarily created dictionary entries during memory address parsing
        new_dict = AttrDict({
            'offset': memory_address.get('offset'),
            'base': memory_address.get('base'),
            'index': memory_address.get('index'),
            'scale': memory_address.get('scale', '1'),
        })
        return AttrDict({self.MEMORY_ID: new_dict})

    def substitute_label(self, label):
        """
        Flatten the nested ``'name'`` entry of a parsed label.

        ``label`` is modified in place.

        :param dict label: parsed label
        :return: ``AttrDict`` mapping the label ID to ``label``
        """
        # remove duplicated 'name' level due to identifier
        label['name'] = label['name']['name']
        return AttrDict({self.LABEL_ID: label})

    def substitue_immediate(self, immediate):
        """
        Wrap a parsed immediate under the immediate ID.

        An immediate that is actually an identifier is returned as is.

        :param dict immediate: parsed immediate
        :return: ``immediate`` itself if it holds an identifier, otherwise an
                 ``AttrDict`` mapping the immediate ID to it
        """
        if 'identifier' in immediate:
            # actually an identifier, change declaration
            return immediate
        # otherwise nothing to do
        return AttrDict({self.IMMEDIATE_ID: immediate})

    def get_full_reg_name(self, register):
        """
        Return the name of a parsed register.

        :param dict register: parsed register
        :return: value stored under ``'name'``
        """
        # nothing to do
        return register['name']

    def normalize_imd(self, imd):
        """
        Convert the value of a parsed immediate to an integer.

        :param dict imd: parsed immediate
        :return: the integer value (hexadecimal strings starting with ``0x``
                 are converted to decimal), or ``imd`` unchanged if it has no
                 ``'value'`` entry (i.e. it is an identifier)
        """
        if 'value' in imd:
            if imd['value'].lower().startswith('0x'):
                # hex, return decimal
                return int(imd['value'], 16)
            return int(imd['value'], 10)
        # identifier
        return imd