Rewrite the parsing of register expressions. GCC, for reasons unknown, puts the displacement in the middle (e.g. `[rdx+8+rax*8]`).

I am completely restructuring the parser definitions so that they are more explicit.  They are more verbose too, but at least I understand what they do.
This commit is contained in:
pleroy
2025-03-12 22:26:38 +01:00
parent 91da9a311a
commit d61330404b
3 changed files with 178 additions and 40 deletions

View File

@@ -318,28 +318,45 @@ class ParserX86Intel(ParserX86):
base_register = self.register
index_register = self.register
scale = pp.Word("1248", exact=1)
post_displacement = pp.Group(
(pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign") + integer_number
| identifier
).setResultsName(self.immediate_id)
pre_displacement = pp.Group(integer_number + pp.Literal("+")).setResultsName(
self.immediate_id
base = base_register.setResultsName("base")
displacement = pp.Group(
pp.Group(
integer_number ^ identifier
).setResultsName(self.immediate_id)
).setResultsName("displacement")
short_indexed = index_register.setResultsName("index")
long_indexed = (
index_register.setResultsName("index")
+ pp.Literal("*")
+ scale.setResultsName("scale")
)
indexed = pp.Group(
index_register.setResultsName("index")
+ pp.Optional(pp.Literal("*") + scale.setResultsName("scale"))
short_indexed
^ long_indexed
).setResultsName("indexed")
operator = pp.Word("+-", exact=1)
# Syntax:
# `base` always precedes `indexed`.
# `short_indexed` is only allowed if it follows `base`, not alone.
# `displacement` can go anywhere.
# It's easier to list all the alternatives than to represent these rules using complicated
# `Optional` and what not.
register_expression = pp.Group(
pp.Literal("[")
+ pp.Optional(pp.Group(pre_displacement).setResultsName("pre_displacement"))
+ pp.Group(
base_register.setResultsName("base")
^ pp.Group(
base_register.setResultsName("base") + pp.Literal("+") + indexed
).setResultsName("base_and_indexed")
^ indexed
).setResultsName("non_displacement")
+ pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement"))
+ (
base
^ (base + operator + displacement)
^ (base + operator + displacement + operator + indexed)
^ (base + operator + indexed)
^ (base + operator + indexed + operator + displacement)
^ (displacement + operator + base)
^ (displacement + operator + base + operator + indexed)
^ (displacement + operator + pp.Group(long_indexed).setResultsName("indexed"))
^ pp.Group(long_indexed).setResultsName("indexed")
^ (pp.Group(long_indexed).setResultsName("indexed") + operator + displacement)
)
+ pp.Literal("]")
).setResultsName("register_expression")
@@ -640,33 +657,16 @@ class ParserX86Intel(ParserX86):
return RegisterOperand(name=operand.name)
def process_register_expression(self, register_expression):
pre_displacement = register_expression.get("pre_displacement")
post_displacement = register_expression.get("post_displacement")
non_displacement = register_expression.get("non_displacement")
base = None
indexed = None
if non_displacement:
base_and_indexed = non_displacement.get("base_and_indexed")
if base_and_indexed:
base = base_and_indexed.get("base")
indexed = base_and_indexed.get("indexed")
else:
base = non_displacement.get("base")
if not base:
indexed = non_displacement.get("indexed")
base = register_expression.get("base")
displacement = register_expression.get("displacement")
indexed = register_expression.get("indexed")
index = None
scale = 1
if indexed:
index = indexed.get("index")
scale = int(indexed.get("scale", "1"), 0)
else:
index = None
scale = 1
displacement_op = (
self.process_immediate(pre_displacement.immediate) if pre_displacement else None
)
displacement_op = (
self.process_immediate(post_displacement.immediate)
if post_displacement
else displacement_op
self.process_immediate(displacement.immediate) if displacement else None
)
base_op = RegisterOperand(name=base.name) if base else None
index_op = RegisterOperand(name=index.name) if index else None

View File

@@ -0,0 +1,102 @@
# Produced with gcc 14.2 with -O3 -march=sapphirerapids -fopenmp-simd -mprefer-vector-width=512, https://godbolt.org/z/drE47x1b4.
.LC3:
.string "%f\n"
main:
push r14
xor edi, edi
push r13
push r12
push rbp
push rbx
call time
mov edi, eax
call srand
mov edi, 1600
call malloc
mov r12, rax
mov rbp, rax
lea r13, [rax+1600]
mov rbx, rax
.L2:
mov edi, 1600
add rbx, 8
call malloc
mov QWORD PTR [rbx-8], rax
cmp r13, rbx
jne .L2
lea rbx, [r12+8]
lea r13, [r12+1592]
.L5:
mov r14d, 8
.L4:
call rand
vxorpd xmm2, xmm2, xmm2
mov rcx, QWORD PTR [rbx]
movsx rdx, eax
mov esi, eax
imul rdx, rdx, 351843721
sar esi, 31
sar rdx, 45
sub edx, esi
imul edx, edx, 100000
sub eax, edx
vcvtsi2sd xmm0, xmm2, eax
vdivsd xmm0, xmm0, QWORD PTR .LC0[rip]
vmovsd QWORD PTR [rcx+r14], xmm0
add r14, 8
cmp r14, 1592
jne .L4
add rbx, 8
cmp r13, rbx
jne .L5
vmovsd xmm1, QWORD PTR .LC1[rip]
lea rdi, [r12+1584]
.L6:
mov rdx, QWORD PTR [rbp+8]
mov rcx, QWORD PTR [rbp+16]
mov eax, 1
mov rsi, QWORD PTR [rbp+0]
vmovsd xmm0, QWORD PTR [rdx]
.L7:
vaddsd xmm0, xmm0, QWORD PTR [rcx+rax*8]
vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]
vaddsd xmm0, xmm0, QWORD PTR [rsi+rax*8]
vmulsd xmm0, xmm0, xmm1
vmovsd QWORD PTR [rdx+rax*8], xmm0
inc rax
cmp rax, 199
jne .L7
vmovsd xmm0, QWORD PTR [rdx+1592]
add rbp, 8
vmovsd QWORD PTR [rcx+8], xmm0
cmp rdi, rbp
jne .L6
mov rax, QWORD PTR [r12+1584]
vmovsd xmm0, QWORD PTR .LC2[rip]
vucomisd xmm0, QWORD PTR [rax+1584]
jp .L9
je .L19
.L9:
pop rbx
xor eax, eax
pop rbp
pop r12
pop r13
pop r14
ret
.L19:
mov rax, QWORD PTR [r12]
mov edi, OFFSET FLAT:.LC3
vmovsd xmm0, QWORD PTR [rax]
mov eax, 1
call printf
jmp .L9
.LC0:
.long 0
.long 1083129856
.LC1:
.long 2061584302
.long 1072934420
.LC2:
.long -57724360
.long 1072939201

View File

@@ -25,6 +25,8 @@ class TestParserX86Intel(unittest.TestCase):
self.triad_iaca_code = f.read()
with open(self._find_file("gs_x86_icc.s")) as f:
self.gs_icc_code = f.read()
with open(self._find_file("gs_x86_gcc.s")) as f:
self.gs_gcc_code = f.read()
##################
# Test
@@ -100,6 +102,7 @@ class TestParserX86Intel(unittest.TestCase):
instr11 = "\tlea\trcx, OFFSET FLAT:??_R0N@8+8"
instr12 = "\tvfmadd213sd xmm0, xmm1, QWORD PTR __real@bfc5555555555555"
instr13 = "\tjmp\t$LN18@operator"
instr14 = "vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]"
parsed_1 = self.parser.parse_instruction(instr1)
parsed_2 = self.parser.parse_instruction(instr2)
@@ -114,6 +117,7 @@ class TestParserX86Intel(unittest.TestCase):
parsed_11 = self.parser.parse_instruction(instr11)
parsed_12 = self.parser.parse_instruction(instr12)
parsed_13 = self.parser.parse_instruction(instr13)
parsed_14 = self.parser.parse_instruction(instr14)
self.assertEqual(parsed_1.mnemonic, "sub")
self.assertEqual(parsed_1.operands[0], RegisterOperand(name="RSP"))
@@ -204,6 +208,17 @@ class TestParserX86Intel(unittest.TestCase):
self.assertEqual(parsed_13.mnemonic, "jmp")
self.assertEqual(parsed_13.operands[0], IdentifierOperand(name="$LN18@operator"))
self.assertEqual(parsed_14.mnemonic, "vaddsd")
self.assertEqual(parsed_14.operands[0],
RegisterOperand(name="XMM0"))
self.assertEqual(parsed_14.operands[1],
RegisterOperand(name="XMM0"))
self.assertEqual(parsed_14.operands[2],
MemoryOperand(base=RegisterOperand(name="RDX"),
offset=ImmediateOperand(value=8),
index=RegisterOperand(name="RAX"),
scale=8))
def test_parse_line(self):
line_comment = "; -- Begin main"
line_instruction = "\tret\t0"
@@ -344,6 +359,27 @@ class TestParserX86Intel(unittest.TestCase):
)
self.assertEqual(len(parsed), 227)
def test_parse_file4(self):
    """Parse the ``gs_gcc_code`` fixture and spot-check a few resulting forms.

    Verifies the first line number, one instruction whose memory operand has
    the displacement between base and index, the final directive, and the
    total number of parsed lines.
    """
    parsed = self.parser.parse_file(self.gs_gcc_code)
    self.assertEqual(parsed[0].line_number, 1)

    # Check a few lines to make sure that we produced something reasonable.
    actual_vaddsd = parsed[61]
    expected_memory = MemoryOperand(
        base=RegisterOperand("RDX"),
        index=RegisterOperand("RAX"),
        scale=8,
        offset=ImmediateOperand(value=8),
    )
    expected_vaddsd = InstructionForm(
        mnemonic="vaddsd",
        operands=[
            RegisterOperand("XMM0"),
            RegisterOperand("XMM0"),
            expected_memory,
        ],
        line=" vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]",
        line_number=62,
    )
    self.assertEqual(actual_vaddsd, expected_vaddsd)

    actual_last = parsed[101]
    expected_directive = DirectiveOperand(
        name=".long",
        parameters=["1072939201"],
    )
    expected_last = InstructionForm(
        directive_id=expected_directive,
        line=" .long 1072939201",
        line_number=102,
    )
    self.assertEqual(actual_last, expected_last)

    self.assertEqual(len(parsed), 102)
def test_normalize_imd(self):
imd_binary = ImmediateOperand(value="1001111B")
imd_octal = ImmediateOperand(value="117O")