diff --git a/osaca/parser/base_parser.py b/osaca/parser/base_parser.py
index 540e34f..ebe695f 100755
--- a/osaca/parser/base_parser.py
+++ b/osaca/parser/base_parser.py
@@ -3,6 +3,7 @@
 
 class BaseParser(object):
     # Identifiers for operand types
+    COMMENT_ID = 'comment'
     DIRECTIVE_ID = 'directive'
     IMMEDIATE_ID = 'immediate'
     LABEL_ID = 'label'
diff --git a/osaca/parser/parser_x86att.py b/osaca/parser/parser_x86att.py
index 8ffa2d9..7ba05da 100755
--- a/osaca/parser/parser_x86att.py
+++ b/osaca/parser/parser_x86att.py
@@ -2,7 +2,7 @@
 
 import pyparsing as pp
 
-from .parser import BaseParser
+from .base_parser import BaseParser
 
 
 class ParserX86ATT(BaseParser):
@@ -14,7 +14,7 @@ class ParserX86ATT(BaseParser):
         symbol_comment = '#'
         self.comment = pp.Literal(symbol_comment) + pp.Group(
             pp.ZeroOrMore(pp.Word(pp.printables))
-        ).setResultsName('comment')
+        ).setResultsName(self.COMMENT_ID)
         # Define x86 assembly identifier
         first = pp.Word(pp.alphas + '_.', exact=1)
         rest = pp.Word(pp.alphanums + '_.')
@@ -24,15 +24,13 @@ class ParserX86ATT(BaseParser):
             identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
         ).setResultsName(self.LABEL_ID)
         # Directive
-        commaSeparatedList = pp.delimitedList(
-            pp.Optional(pp.quotedString | pp.Word(pp.alphanums)), delim=','
-        )
+        commaSeparatedList = pp.delimitedList(pp.Optional(pp.quotedString | identifier), delim=',')
         self.directive = pp.Group(
             pp.Literal('.')
             + pp.Word(pp.alphanums + '_').setResultsName('name')
             + commaSeparatedList.setResultsName('parameters')
             + pp.Optional(self.comment)
-        ).setResultsName(self.DIRECTIVE_LABEL)
+        ).setResultsName(self.DIRECTIVE_ID)
 
         ##############################
         # Instructions
@@ -77,7 +75,7 @@ class ParserX86ATT(BaseParser):
             + pp.Optional(self.comment)
         ).setResultsName(self.MEMORY_ID)
         # Combine to instruction form
-        operand1 = pp.Group(register ^ immediate ^ memory ^ self.label).setResultsName('operand1')
+        operand1 = pp.Group(register ^ immediate ^ memory ^ identifier).setResultsName('operand1')
         operand2 = pp.Group(register ^ immediate ^ memory).setResultsName('operand2')
         operand3 = pp.Group(register ^ immediate ^ memory).setResultsName('operand3')
         self.instruction_parser = (
@@ -99,69 +97,89 @@ class ParserX86ATT(BaseParser):
         instruction_form = {
             'instruction': None,
             'operands': None,
+            'directive': None,
             'comment': None,
-            'label_name': None,
-            'id': line_number,
+            'label': None,
+            'line_number': line_number,
         }
         result = None
 
         # 1. Parse comment
         try:
-            result = self.comment.parseString(line, parseAll=True)
-            instruction_form['comment'] = result['comment'].join(' ')
+            result = self.comment.parseString(line, parseAll=True).asDict()
+            instruction_form['comment'] = ' '.join(result[self.COMMENT_ID])
         except pp.ParseException:
             pass
 
         # 2. Parse label
         if result is None:
             try:
-                result = self.label.parseString(line, parseAll=True)
-                instruction_form['comment'] = result['comment'].join(' ')
-                instruction_form['label_name'] = result['label_name']
+                result = self.label.parseString(line, parseAll=True).asDict()
+                instruction_form['label'] = result[self.LABEL_ID]['name']
+                if self.COMMENT_ID in result[self.LABEL_ID]:
+                    instruction_form['comment'] = ' '.join(result[self.LABEL_ID][self.COMMENT_ID])
             except pp.ParseException:
                 pass
 
         # 3. Parse directive
-        # TODO
+        if result is None:
+            try:
+                result = self.directive.parseString(line, parseAll=True).asDict()
+                instruction_form['directive'] = {
+                    'name': result[self.DIRECTIVE_ID]['name'],
+                    'parameters': result[self.DIRECTIVE_ID]['parameters'],
+                }
+                if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
+                    instruction_form['comment'] = ' '.join(
+                        result[self.DIRECTIVE_ID][self.COMMENT_ID]
+                    )
+            except pp.ParseException:
+                pass
 
         # 4. Parse instruction
         if result is None:
             result = self.parse_instruction(line)
-            # TODO
             instruction_form['instruction'] = result['instruction']
             instruction_form['operands'] = result['operands']
-            instruction_form['comment'] = result['comment'].join(' ')
+            instruction_form['comment'] = result['comment']
 
         return instruction_form
 
     def parse_instruction(self, instruction):
-        result = self.instruction_parser.parseString(instruction, parseAll=True)
-        # Check first operand
-        # Check for register
-        if self.REGISTER_ID in result['operand1']:
-            # TODO
-            pass
-        # Check for immediate
-        elif self.IMMEDIATE_ID in result['operand1']:
-            # TODO
-            pass
-        # Check for memory address
-        elif self.MEMORY_ID in result['operand1']:
-            # TODO
-            pass
-        # Check for label
-        elif self.LABEL_ID in result['operand1']:
-            # TODO
-            pass
-
-        # Check second operand
-        if 'operand2' in result:
-            # if('reg' in op2): ...
-            # TODO
-            pass
-
+        """
+        Parse a single instruction line into mnemonic, operands and comment.
+
+        Operands are assigned from right to left: the last operand becomes the
+        destination and all operands before it are collected as sources.
+
+        :param instruction: assembly line containing one instruction
+        :returns: dict with the keys 'instruction', 'operands' and 'comment'
+        :raises pp.ParseException: if the line does not match the instruction grammar
+        """
+        result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
+        operands = {'sources': []}
+        # Check from right to left
         # Check third operand
         if 'operand3' in result:
-            # TODO
-            pass
-        return result
+            operands['destination'] = result['operand3']
+        # Check second operand
+        if 'operand2' in result:
+            if 'destination' in operands:
+                operands['sources'].insert(0, result['operand2'])
+            else:
+                operands['destination'] = result['operand2']
+        # Add first operand
+        if 'destination' in operands:
+            operands['sources'].insert(0, result['operand1'])
+        else:
+            operands['destination'] = result['operand1']
+        return_dict = {
+            'instruction': result['mnemonic'],
+            'operands': operands,
+            'comment': ' '.join(result['comment']) if 'comment' in result else None,
+        }
+        return return_dict
+
+    def substitute_memory_address(self, memory_address):
+        # remove unnecessarily created dictionary entries
+        raise NotImplementedError
diff --git a/tests/all_tests.py b/tests/all_tests.py
index 9b42fed..33f5c42 100755
--- a/tests/all_tests.py
+++ b/tests/all_tests.py
@@ -6,6 +6,7 @@ import unittest
 sys.path[0:0] = ['.', '..']
 suite = unittest.TestLoader().loadTestsFromNames(
     [
+        'test_parser_x86att'
     ]
 )
 
diff --git a/tests/test_parser_x86att.py b/tests/test_parser_x86att.py
new file mode 100644
index 0000000..fb4e21b
--- /dev/null
+++ b/tests/test_parser_x86att.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+Unit tests for x86 AT&T assembly parser
+"""
+
+import unittest
+
+from pyparsing import ParseException
+
+from osaca.parser import ParserX86ATT
+
+
+class TestParserX86ATT(unittest.TestCase):
+    def setUp(self):
+        self.parser = ParserX86ATT()
+
+    ##################
+    # Test
+    ##################
+
+    def test_comment_parser(self):
+        self.assertEqual(self.get_comment('# some comments'), 'some comments')
+        self.assertEqual(self.get_comment('\t\t#AA BB CC \t end \t'), 'AA BB CC end')
+        self.assertEqual(self.get_comment('\t## comment ## comment'), '# comment ## comment')
+
+    def test_label_parser(self):
+        self.assertEqual(self.get_label('main:')['name'], 'main')
+        self.assertEqual(self.get_label('..B1.10:')['name'], '.B1.10')
+        self.assertEqual(self.get_label('.2.3_2_pack.3:')['name'], '.2.3_2_pack.3')
+        self.assertEqual(self.get_label('.L1:\t\t\t#label1')['name'], '.L1')
+        self.assertEqual(self.get_label('.L1:\t\t\t#label1')['comment'], 'label1')
+        with self.assertRaises(ParseException):
+            self.get_label('\t.cfi_startproc')
+
+    def test_directive_parser(self):
+        self.assertEqual(self.get_directive('\t.text')['name'], 'text')
+        self.assertEqual(len(self.get_directive('\t.text')['parameters']), 0)
+        self.assertEqual(self.get_directive('\t.align\t16,0x90')['name'], 'align')
+        self.assertEqual(len(self.get_directive('\t.align\t16,0x90')['parameters']), 2)
+        self.assertEqual(self.get_directive('\t.align\t16,0x90')['parameters'][1], '0x90')
+        self.assertEqual(
+            self.get_directive(' .byte 100,103,144 #IACA START')['name'], 'byte'
+        )
+        self.assertEqual(
+            self.get_directive(' .byte 100,103,144 #IACA START')['parameters'][2],
+            '144',
+        )
+        self.assertEqual(
+            self.get_directive(' .byte 100,103,144 #IACA START')['comment'],
+            'IACA START',
+        )
+
+    def test_parse_instruction(self):
+        instr1 = '\t\tvcvtsi2ss %edx, %xmm2, %xmm2\t\t\t#12.27'
+        instr2 = 'jb ..B1.4 \t'
+        instr3 = ' movl $222,%ebx #IACA END'
+        instr4 = 'vmovss %xmm4, -4(%rsp,%rax,8) #12.9'
+
+        parsed_1 = self.parser.parse_instruction(instr1)
+        parsed_2 = self.parser.parse_instruction(instr2)
+        parsed_3 = self.parser.parse_instruction(instr3)
+        parsed_4 = self.parser.parse_instruction(instr4)
+
+        self.assertEqual(parsed_1['instruction'], 'vcvtsi2ss')
+        self.assertEqual(parsed_1['operands']['destination']['register']['name'], 'xmm2')
+        self.assertEqual(parsed_1['operands']['sources'][0]['register']['name'], 'edx')
+        self.assertEqual(parsed_1['comment'], '12.27')
+
+        self.assertEqual(parsed_2['instruction'], 'jb')
+        self.assertEqual(parsed_2['operands']['destination'], '..B1.4')
+        self.assertEqual(len(parsed_2['operands']['sources']), 0)
+        self.assertIsNone(parsed_2['comment'])
+
+        self.assertEqual(parsed_3['instruction'], 'movl')
+        self.assertEqual(parsed_3['operands']['destination']['register']['name'], 'ebx')
+        self.assertEqual(parsed_3['operands']['sources'][0]['immediate']['value'], '222')
+        self.assertEqual(parsed_3['comment'], 'IACA END')
+
+        self.assertEqual(parsed_4['instruction'], 'vmovss')
+        self.assertEqual(parsed_4['operands']['destination']['memory']['offset'], '-4')
+        self.assertEqual(parsed_4['operands']['destination']['memory']['base'], 'rsp')
+        self.assertEqual(parsed_4['operands']['destination']['memory']['index'], 'rax')
+        self.assertEqual(parsed_4['operands']['destination']['memory']['scale'], '8')
+        self.assertEqual(parsed_4['operands']['sources'][0]['register']['name'], 'xmm4')
+        self.assertEqual(parsed_4['comment'], '12.9')
+
+    def test_parse_line(self):
+        line_comment = '# -- Begin main'
+        line_label = '..B1.7: # Preds ..B1.6'
+        line_directive = '\t\t.quad .2.3_2__kmpc_loc_pack.2 #qed'
+        # line_instruction = '\t\tlea 2(%rax,%rax), %ecx #12.9'
+
+        instruction_form_1 = {
+            'instruction': None,
+            'operands': None,
+            'directive': None,
+            'comment': '-- Begin main',
+            'label': None,
+            'line_number': 1,
+        }
+        instruction_form_2 = {
+            'instruction': None,
+            'operands': None,
+            'directive': None,
+            'comment': 'Preds ..B1.6',
+            'label': '..B1.7',
+            'line_number': 2,
+        }
+        instruction_form_3 = {
+            'instruction': None,
+            'operands': None,
+            'directive': {'name': 'quad', 'parameters': ['.2.3_2__kmpc_loc_pack.2']},
+            'comment': 'qed',
+            'label': None,
+            'line_number': 3,
+        }
+        # TODO
+        # instruction_form_4 = {
+        #     'instruction': 'lea',
+        #     'operands': {'sources': {'memory': {'offset': '2', 'base': {'name': rax}, ''}}},
+        #     'directive': None,
+        #     'comment': '-- Begin main',
+        #     'label': None,
+        #     'line_number': 1,
+        # }
+
+        parsed_1 = self.parser.parse_line(line_comment, 1)
+        parsed_2 = self.parser.parse_line(line_label, 2)
+        parsed_3 = self.parser.parse_line(line_directive, 3)
+        # TODO parsed_4
+        # parsed_4 = self.parser.parse_line(line_instruction, 4)
+
+        self.assertEqual(parsed_1, instruction_form_1)
+        self.assertEqual(parsed_2, instruction_form_2)
+        self.assertEqual(parsed_3, instruction_form_3)
+        # self.assertEqual(parsed_4, instruction_form_4)
+
+    ##################
+    # Helper functions
+    ##################
+    def get_comment(self, comment):
+        return ' '.join(
+            self.parser.comment.parseString(comment, parseAll=True).asDict()['comment']
+        )
+
+    def get_label(self, label):
+        return self.parser.label.parseString(label, parseAll=True).asDict()
+
+    def get_directive(self, directive):
+        return self.parser.directive.parseString(directive, parseAll=True).asDict()