Compare commits

...

6 Commits

Author SHA1 Message Date
JanLJL
2a231bf20b version bump 2025-03-17 10:28:06 +01:00
JanLJL
fb8c8ec7db remove AT&T limitation 2025-03-17 10:27:41 +01:00
Jan
2f069000e9 add syntax flag in README 2025-03-17 10:26:50 +01:00
Jan
bdbcb18817 Merge pull request #112 from pleroy/Intel
Add support for the Intel syntax produced by MSVC and ICC
2025-03-17 10:20:40 +01:00
JanLJL
7930e4d704 take +- operator of offset/index in mem-addr into account 2025-03-14 18:46:12 +01:00
pleroy
d61330404b Rewrite the parsing of register expressions. GCC, for reasons unknown, put the displacement in the middle.
I am completely restructuring the parser definition so that they are more explicit.  They are more verbose too, but at least I understand what they do.
2025-03-12 22:26:38 +01:00
6 changed files with 220 additions and 60 deletions

View File

@@ -34,7 +34,7 @@ Getting started
===============
OSACA is a Python module with a command line interface.
OSACA is also integrated into the `Compiler Explorer at godbolt.org <https://godbolt.org>`_, which allows using OSACA from a browser without any installation. To analyze an assembly snippet, go to https://godbolt.org change language to "Analysis", insert an AArch64 or AT&T(!) x86 assembly code and make sure OSACA is selected in the corresponding analysis panel, e.g., https://godbolt.org/z/shK4f8. When analyzing a high-level language code, use the "Add tool..." menu in the compiler output panel to add OSACA analysis, e.g. https://godbolt.org/z/hbMoPn. To change the micro architecture model, add ``--arch`` and µarch shortname (e.g., ``SKX`` for Skylake, ``ZEN2``, ``N1`` for ARM Neoverse) to the "Compiler options..." (when using "Analysis" mode) or "Arguments" (when analyzing compiler output of a high-level code).
OSACA is also integrated into the `Compiler Explorer at godbolt.org <https://godbolt.org>`_, which allows using OSACA from a browser without any installation. To analyze an assembly snippet, go to https://godbolt.org, change the language to "Analysis", insert AArch64 or x86 assembly code, and make sure OSACA is selected in the corresponding analysis panel, e.g., https://godbolt.org/z/shK4f8. When analyzing high-level language code, use the "Add tool..." menu in the compiler output panel to add OSACA analysis, e.g., https://godbolt.org/z/hbMoPn. To change the micro architecture model, add ``--arch`` and the µarch short name (e.g., ``SKX`` for Skylake, ``ZEN2``, ``N1`` for ARM Neoverse) to the "Compiler options..." (when using "Analysis" mode) or "Arguments" (when analyzing compiler output of high-level code).
Installation
------------
@@ -100,9 +100,9 @@ The usage of OSACA can be listed as:
shows the program's version number.
--arch ARCH
needs to be replaced with the target architecture abbreviation.
Possible options are ``SNB``, ``IVB``, ``HSW``, ``BDW``, ``SKX``, ``CSX``, ``ICL`` (Client), ``ICX`` (Server), ``SPR`` for the latest Intel micro architectures starting from Intel Sandy Bridge and ``ZEN[1-4]`` for AMD Zen architectures.
Furthermore, ``TX2`` for Marvell`s ARM-based ThunderX2 , ``N1`` for ARM's Neoverse, ``A72`` for ARM Cortex-A72, ``TSV110`` for the HiSilicon TaiShan v110, ``A64FX`` for Fujitsu's HPC ARM architecture, ``M1`` for the Apple M1-Firestorm performance core, and ``V2`` for the Neoverse V2 (used in NVIDIA's Grace CPU) are available.
If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
See `the table of supported microarchitectures below <https://github.com/RRZE-HPC/OSACA?tab=readme-ov-file#supported-microarchitectures>`__ for all possible options. If no micro-architecture is given, OSACA assumes a default architecture for x86/AArch64.
--syntax SYNTAX
Define the assembly syntax (ATT, Intel) for x86. If no syntax is given, OSACA tries to automatically determine which syntax to use.
--fixed
Run the throughput analysis with fixed port utilization for all suitable ports per instruction.
Otherwise, OSACA will print out the optimal port utilization for the kernel.

View File

@@ -1,7 +1,7 @@
"""Open Source Architecture Code Analyzer"""
name = "osaca"
__version__ = "0.6.1"
__version__ = "0.7.0"
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__

View File

@@ -318,28 +318,50 @@ class ParserX86Intel(ParserX86):
base_register = self.register
index_register = self.register
scale = pp.Word("1248", exact=1)
post_displacement = pp.Group(
(pp.Literal("+") ^ pp.Literal("-")).setResultsName("sign") + integer_number
| identifier
).setResultsName(self.immediate_id)
pre_displacement = pp.Group(integer_number + pp.Literal("+")).setResultsName(
self.immediate_id
)
indexed = pp.Group(
base = base_register.setResultsName("base")
displacement = pp.Group(
pp.Group(integer_number ^ identifier).setResultsName(self.immediate_id)
).setResultsName("displacement")
short_indexed = index_register.setResultsName("index")
long_indexed = (
index_register.setResultsName("index")
+ pp.Optional(pp.Literal("*") + scale.setResultsName("scale"))
).setResultsName("indexed")
+ pp.Literal("*")
+ scale.setResultsName("scale")
)
indexed = pp.Group(short_indexed ^ long_indexed).setResultsName("indexed")
operator = pp.Word("+-", exact=1)
operator_index = pp.Word("+-", exact=1).setResultsName("operator_idx")
operator_displacement = pp.Word("+-", exact=1).setResultsName("operator_disp")
# Syntax:
# `base` always precedes `indexed`.
# `short_indexed` is only allowed if it follows `base`, not alone.
# `displacement` can go anywhere.
# It's easier to list all the alternatives than to represent these rules using complicated
# `Optional` and what not.
register_expression = pp.Group(
pp.Literal("[")
+ pp.Optional(pp.Group(pre_displacement).setResultsName("pre_displacement"))
+ pp.Group(
base_register.setResultsName("base")
^ pp.Group(
base_register.setResultsName("base") + pp.Literal("+") + indexed
).setResultsName("base_and_indexed")
^ indexed
).setResultsName("non_displacement")
+ pp.Optional(pp.Group(post_displacement).setResultsName("post_displacement"))
+ (
base
^ (base + operator_displacement + displacement)
^ (base + operator_displacement + displacement + operator_index + indexed)
^ (base + operator_index + indexed)
^ (base + operator_index + indexed + operator_displacement + displacement)
^ (displacement + operator + base)
^ (displacement + operator + base + operator_index + indexed)
^ (
displacement
+ operator_index
+ pp.Group(long_indexed).setResultsName("indexed")
)
^ pp.Group(long_indexed).setResultsName("indexed")
^ (
pp.Group(long_indexed).setResultsName("indexed")
+ operator_displacement
+ displacement
)
)
+ pp.Literal("]")
).setResultsName("register_expression")
@@ -356,7 +378,7 @@ class ParserX86Intel(ParserX86):
self.register.setResultsName("segment") + pp.Literal(":") + immediate
^ immediate + register_expression
^ register_expression
^ identifier + pp.Optional(pp.Literal("+") + immediate)
^ identifier + pp.Optional(operator + immediate)
).setResultsName("address_expression")
offset_expression = pp.Group(
@@ -640,34 +662,19 @@ class ParserX86Intel(ParserX86):
return RegisterOperand(name=operand.name)
def process_register_expression(self, register_expression):
pre_displacement = register_expression.get("pre_displacement")
post_displacement = register_expression.get("post_displacement")
non_displacement = register_expression.get("non_displacement")
base = None
indexed = None
if non_displacement:
base_and_indexed = non_displacement.get("base_and_indexed")
if base_and_indexed:
base = base_and_indexed.get("base")
indexed = base_and_indexed.get("indexed")
else:
base = non_displacement.get("base")
if not base:
indexed = non_displacement.get("indexed")
base = register_expression.get("base")
displacement = register_expression.get("displacement")
indexed = register_expression.get("indexed")
index = None
scale = 1
if indexed:
index = indexed.get("index")
scale = int(indexed.get("scale", "1"), 0)
else:
index = None
scale = 1
displacement_op = (
self.process_immediate(pre_displacement.immediate) if pre_displacement else None
)
displacement_op = (
self.process_immediate(post_displacement.immediate)
if post_displacement
else displacement_op
)
if register_expression.get("operator_index") == "-":
scale *= -1
displacement_op = self.process_immediate(displacement.immediate) if displacement else None
if displacement_op and register_expression.get("operator_disp") == "-":
displacement_op.value *= -1
base_op = RegisterOperand(name=base.name) if base else None
index_op = RegisterOperand(name=index.name) if index else None
new_memory = MemoryOperand(
@@ -724,6 +731,8 @@ class ParserX86Intel(ParserX86):
if "displacement" in offset_expression
else None
)
if displacement and "operator_disp" == "-":
displacement.value *= -1
identifier = self.process_identifier(offset_expression.identifier)
identifier.offset = displacement
return MemoryOperand(offset=identifier)

View File

@@ -0,0 +1,102 @@
# Produced with gcc 14.2 with -O3 -march=sapphirerapids -fopenmp-simd -mprefer-vector-width=512, https://godbolt.org/z/drE47x1b4.
.LC3:
.string "%f\n"
main:
push r14
xor edi, edi
push r13
push r12
push rbp
push rbx
call time
mov edi, eax
call srand
mov edi, 1600
call malloc
mov r12, rax
mov rbp, rax
lea r13, [rax+1600]
mov rbx, rax
.L2:
mov edi, 1600
add rbx, 8
call malloc
mov QWORD PTR [rbx-8], rax
cmp r13, rbx
jne .L2
lea rbx, [r12+8]
lea r13, [r12+1592]
.L5:
mov r14d, 8
.L4:
call rand
vxorpd xmm2, xmm2, xmm2
mov rcx, QWORD PTR [rbx]
movsx rdx, eax
mov esi, eax
imul rdx, rdx, 351843721
sar esi, 31
sar rdx, 45
sub edx, esi
imul edx, edx, 100000
sub eax, edx
vcvtsi2sd xmm0, xmm2, eax
vdivsd xmm0, xmm0, QWORD PTR .LC0[rip]
vmovsd QWORD PTR [rcx+r14], xmm0
add r14, 8
cmp r14, 1592
jne .L4
add rbx, 8
cmp r13, rbx
jne .L5
vmovsd xmm1, QWORD PTR .LC1[rip]
lea rdi, [r12+1584]
.L6:
mov rdx, QWORD PTR [rbp+8]
mov rcx, QWORD PTR [rbp+16]
mov eax, 1
mov rsi, QWORD PTR [rbp+0]
vmovsd xmm0, QWORD PTR [rdx]
.L7:
vaddsd xmm0, xmm0, QWORD PTR [rcx+rax*8]
vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]
vaddsd xmm0, xmm0, QWORD PTR [rsi+rax*8]
vmulsd xmm0, xmm0, xmm1
vmovsd QWORD PTR [rdx+rax*8], xmm0
inc rax
cmp rax, 199
jne .L7
vmovsd xmm0, QWORD PTR [rdx+1592]
add rbp, 8
vmovsd QWORD PTR [rcx+8], xmm0
cmp rdi, rbp
jne .L6
mov rax, QWORD PTR [r12+1584]
vmovsd xmm0, QWORD PTR .LC2[rip]
vucomisd xmm0, QWORD PTR [rax+1584]
jp .L9
je .L19
.L9:
pop rbx
xor eax, eax
pop rbp
pop r12
pop r13
pop r14
ret
.L19:
mov rax, QWORD PTR [r12]
mov edi, OFFSET FLAT:.LC3
vmovsd xmm0, QWORD PTR [rax]
mov eax, 1
call printf
jmp .L9
.LC0:
.long 0
.long 1083129856
.LC1:
.long 2061584302
.long 1072934420
.LC2:
.long -57724360
.long 1072939201

View File

@@ -1,15 +1,15 @@
; Translated from kernel_x86_memdep.s
L4:
vmovsd [rax+8], xmm0
add rax, 8
vmovsd [rax+rcx*8+8], xmm0
vaddsd xmm0, xmm0, [rax]
sub rax, -8
vaddsd xmm0, xmm0, [rax-8]
dec rcx
vaddsd xmm0, xmm0, [rax+rcx*8+8]
mov rdx, rcx
vaddsd xmm0, xmm0, [rax+rdx*8+8]
vmovsd [rax+8], xmm0 # line 3 <---------------------------------+
add rax, 8 # rax=rax_orig+8 |
vmovsd [rax+rcx*8+8], xmm0 # line 5 <------------------------------------------+
vaddsd xmm0, xmm0, [rax] # depends on line 3, rax+8;[rax] == [rax+8] --------+ |
sub rax, -8 # rax=rax_orig+16 | |
vaddsd xmm0, xmm0, [rax-8] # depends on line 3, rax+16;[rax-8] == [rax+8] -----+ |
dec rcx # rcx=rcx_orig-1 |
vaddsd xmm0, xmm0, [rax+rcx*8+8] # depends on line 5, [(rax+8)+(rcx-1)*8+8] == [rax+rcx*8+8] -+
mov rdx, rcx # |
vaddsd xmm0, xmm0, [rax+rdx*8+8] # depends on line 5, rcx == rdx -----------------------------+
vmulsd xmm0, xmm0, xmm1
add rax, 8
cmp rsi, rax

View File

@@ -25,6 +25,8 @@ class TestParserX86Intel(unittest.TestCase):
self.triad_iaca_code = f.read()
with open(self._find_file("gs_x86_icc.s")) as f:
self.gs_icc_code = f.read()
with open(self._find_file("gs_x86_gcc.s")) as f:
self.gs_gcc_code = f.read()
##################
# Test
@@ -100,6 +102,7 @@ class TestParserX86Intel(unittest.TestCase):
instr11 = "\tlea\trcx, OFFSET FLAT:??_R0N@8+8"
instr12 = "\tvfmadd213sd xmm0, xmm1, QWORD PTR __real@bfc5555555555555"
instr13 = "\tjmp\t$LN18@operator"
instr14 = "vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]"
parsed_1 = self.parser.parse_instruction(instr1)
parsed_2 = self.parser.parse_instruction(instr2)
@@ -114,6 +117,7 @@ class TestParserX86Intel(unittest.TestCase):
parsed_11 = self.parser.parse_instruction(instr11)
parsed_12 = self.parser.parse_instruction(instr12)
parsed_13 = self.parser.parse_instruction(instr13)
parsed_14 = self.parser.parse_instruction(instr14)
self.assertEqual(parsed_1.mnemonic, "sub")
self.assertEqual(parsed_1.operands[0], RegisterOperand(name="RSP"))
@@ -204,6 +208,19 @@ class TestParserX86Intel(unittest.TestCase):
self.assertEqual(parsed_13.mnemonic, "jmp")
self.assertEqual(parsed_13.operands[0], IdentifierOperand(name="$LN18@operator"))
self.assertEqual(parsed_14.mnemonic, "vaddsd")
self.assertEqual(parsed_14.operands[0], RegisterOperand(name="XMM0"))
self.assertEqual(parsed_14.operands[1], RegisterOperand(name="XMM0"))
self.assertEqual(
parsed_14.operands[2],
MemoryOperand(
base=RegisterOperand(name="RDX"),
offset=ImmediateOperand(value=8),
index=RegisterOperand(name="RAX"),
scale=8,
),
)
def test_parse_line(self):
line_comment = "; -- Begin main"
line_instruction = "\tret\t0"
@@ -344,6 +361,38 @@ class TestParserX86Intel(unittest.TestCase):
)
self.assertEqual(len(parsed), 227)
def test_parse_file4(self):
parsed = self.parser.parse_file(self.gs_gcc_code)
self.assertEqual(parsed[0].line_number, 1)
# Check a few lines to make sure that we produced something reasonable.
self.assertEqual(
parsed[61],
InstructionForm(
mnemonic="vaddsd",
operands=[
RegisterOperand("XMM0"),
RegisterOperand("XMM0"),
MemoryOperand(
base=RegisterOperand("RDX"),
index=RegisterOperand("RAX"),
scale=8,
offset=ImmediateOperand(value=8),
),
],
line=" vaddsd xmm0, xmm0, QWORD PTR [rdx+8+rax*8]",
line_number=62,
),
)
self.assertEqual(
parsed[101],
InstructionForm(
directive_id=DirectiveOperand(name=".long", parameters=["1072939201"]),
line=" .long 1072939201",
line_number=102,
),
)
self.assertEqual(len(parsed), 102)
def test_normalize_imd(self):
imd_binary = ImmediateOperand(value="1001111B")
imd_octal = ImmediateOperand(value="117O")