Compare commits

...

25 Commits

Author SHA1 Message Date
JanLJL
bce837dec9 version bump 2021-06-01 00:13:38 +02:00
JanLJL
090c24ade1 fixed parsing of reg ranges and lists 2021-06-01 00:10:05 +02:00
JanLJL
03a2a1da33 version bump 2021-05-10 12:56:35 +02:00
JanLJL
d59b100fa8 changed immediate type from str to int 2021-05-10 01:12:30 +02:00
JanLJL
5c741a8a2d version bump 2021-05-05 11:16:43 +02:00
JanLJL
2f4849f44e added tests for timeout in LCD analyis 2021-05-02 22:48:22 +02:00
JanLJL
f13a97e5b5 fixed bug in case of no uarch in CLI 2021-05-02 22:39:07 +02:00
JanLJL
66282b0eef fix #73 2021-05-02 22:22:30 +02:00
Julian Hammer
9ec7c161ab added missing testfile for sve instructions 2021-05-02 21:44:17 +02:00
Julian Hammer
8d8eaa8e4f addd LD2 and ST2 instructions to a64fx 2021-04-23 13:33:32 +02:00
Julian Hammer
88d5094bf1 Merge branch 'master' of github.com:RRZE-HPC/OSACA 2021-04-23 13:18:23 +02:00
Julian Hammer
1f32252f91 improved register range and list support on AArch64 2021-04-23 13:12:18 +02:00
JanLJL
1de644cd62 fixed incompatibilty to py3.6 2021-04-20 13:59:56 +02:00
JanLJL
3d1c6aae8d set min requirement to py3.6 2021-04-20 13:59:32 +02:00
JanLJL
dafec70e6e added wheel to pypi publishing 2021-04-19 11:33:29 +02:00
JanLJL
6d85fbe9e4 fixed duplicate hyperlink tags 2021-04-19 10:58:11 +02:00
JanLJL
3f31235f8a added no timeout option 2021-04-19 10:57:51 +02:00
JanLJL
cfc061e5e3 version bump 2021-04-19 10:14:26 +02:00
JanLJL
5eb3e07ad6 Merge branch 'master' of https://github.com/RRZE-HPC/OSACA 2021-04-19 00:34:32 +02:00
JanLJL
a82a0e24a3 bugfixed CLX as uarch flag 2021-04-19 00:34:21 +02:00
Jan
6db08c7e8e added lcd-timeout flag, citations and updated credits 2021-04-19 00:27:24 +02:00
JanLJL
e6a54ee131 added CLX as synonym for CSX uarch 2021-04-19 00:05:53 +02:00
JanLJL
152360bad2 enhanced LCD analysis by making it parallel and added timeout flag 2021-04-19 00:04:03 +02:00
JanLJL
607d459569 keep dependency paths as generators instead of lists 2021-04-17 12:46:44 +02:00
JanLJL
b033b3b7aa allow different base with prefix for offset values 2021-04-17 11:06:39 +02:00
18 changed files with 1397 additions and 159 deletions

View File

@@ -31,11 +31,11 @@ jobs:
- uses: codecov/codecov-action@v1
- name: Build package
run: |
python setup.py build sdist
python setup.py build sdist bdist_wheel
- name: Publish to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
with:
skip_existing: true
user: __token__
password: ${{ secrets.pypi_password }}
password: ${{ secrets.pypi_password }}

View File

@@ -82,10 +82,10 @@ The usage of OSACA can be listed as:
.. code:: bash
osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES] [--db-check]
[--import MICROBENCH] [--insert-marker]
[--export-graph GRAPHNAME] [--ignore-unknown] [--verbose]
[--out OUT]
osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES]
[--ignore-unknown] [--lcd-timeout SECONDS]
[--db-check] [--import MICROBENCH] [--insert-marker]
[--export-graph GRAPHNAME] [--out OUT] [--verbose]
FILEPATH
-h, --help
@@ -118,6 +118,9 @@ The usage of OSACA can be listed as:
--ignore-unknown
Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
If not specified, a warning will be printed instead if one or more instruction forms are unknown to OSACA.
--lcd-timeout SECONDS
Set timeout in seconds for LCD analysis. After timeout, OSACA will continue its analysis with the dependency paths found up to this point.
Defaults to `10`. Set to `-1` for no timeout.
-v, --verbose
Increases verbosity level
-o OUT, --out OUT
@@ -370,9 +373,16 @@ In the bottom, all loop-carried dependencies are shown, each with a list of line
You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
Citations
=========
If you use OSACA for scientific work you can cite us as (for the Bibtex, see the `Wiki <https://github.com/RRZE-HPC/OSACA/wiki#acknowledgement>`_):
* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print PMBS18 <https://arxiv.org/abs/1809.00912>`_)
* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print PMBS19 <https://arxiv.org/abs/1910.00214>`_)
Credits
=======
Implementation: Jan Laukemann
Implementation: Jan Laukemann, Julian Hammer
License
=======

View File

@@ -1,6 +1,6 @@
"""Open Source Architecture Code Analyzer"""
name = "osaca"
__version__ = "0.4.0"
__version__ = "0.4.4"
# To trigger travis deployment to pypi, do the following:
# 1. Increment __version__

View File

@@ -1132,6 +1132,27 @@ instruction_forms:
throughput: 2.0
latency: 11.0 # 1*p0+1*p3+4*p56+1*p5D6D
port_pressure: [[1, '0'],[1, '3'],[4, '56'], [4, ['5D', '6D']]] # not sure if we also have 4 data accesses
- name: ld2d
operands:
- class: register
prefix: 'z'
shape: 'd'
- class: register
prefix: 'z'
shape: 'd'
- class: register
prefix: p
predication: '*'
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 2.0
latency: 11.0 # 1*p0+1*p3+4*p56+1*p5D6D
port_pressure: [[2, '56'], [4, ['5D', '6D']]]
- name: ldp
operands:
- class: register
@@ -1414,6 +1435,22 @@ instruction_forms:
throughput: 0.0
latency: 0.0
port_pressure: []
- name: ld2
operands:
- class: register
prefix: v
- class: register
prefix: v
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
throughput: 1.0
latency: 11.0 # 1*p56+2*p5D6D
port_pressure: [[1, '56'], [2, ['5D','6D']]]
- name: lsl
operands:
- class: register
@@ -1980,6 +2017,43 @@ instruction_forms:
throughput: 1.0
latency: 0 # 1*p5+1*p6+1*p0
port_pressure: [[1, '5'], [1, '6'], [1, '0']]
- name: st2d
operands:
- class: register
prefix: 'z'
shape: 'd'
- class: register
prefix: 'z'
shape: 'd'
- class: register
prefix: p
predication: '*'
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
pre-indexed: false
post-indexed: false
throughput: 1.0
latency: 0 # 1*p5+1*p6+1*p0
port_pressure: [[1, '5'], [1, '6'], [1, '0']]
- name: st2
operands:
- class: register
prefix: v
- class: register
prefix: v
- class: memory
base: x
offset: '*'
index: '*'
scale: '*'
post-indexed: false
pre-indexed: false
throughput: 1.0
latency: 11.0 # 1*p56+2*p5D6D
port_pressure: [[1, '5'], [1, ['6']], [1, '0']]
- name: sub
operands:
- class: register

View File

@@ -163,6 +163,7 @@ class Frontend(object):
ignore_unknown=False,
arch_warning=False,
length_warning=False,
lcd_warning=False,
verbose=False,
):
"""
@@ -176,17 +177,19 @@ class Frontend(object):
:param ignore_unknown: flag for ignore warning if performance data is missing, defaults to
`False`
:type ignore_unknown: boolean, optional
:param print_arch_warning: flag for additional user warning to specify micro-arch
:type print_arch_warning: boolean, optional
:param print_length_warning: flag for additional user warning to specify kernel length with
:param arch_warning: flag for additional user warning to specify micro-arch
:type arch_warning: boolean, optional
:param length_warning: flag for additional user warning to specify kernel length with
--lines
:type print_length_warning: boolean, optional
:type length_warning: boolean, optional
:param lcd_warning: flag for additional user warning due to LCD analysis timed out
:type lcd_warning: boolean, optional
:param verbose: flag for verbosity level, defaults to False
:type verbose: boolean, optional
"""
return (
self._header_report()
+ self._user_warnings(arch_warning, length_warning)
+ self._user_warnings_header(arch_warning, length_warning)
+ self._symbol_map()
+ self.combined_view(
kernel,
@@ -194,6 +197,7 @@ class Frontend(object):
kernel_dg.get_loopcarried_dependencies(),
ignore_unknown,
)
+ self._user_warnings_footer(lcd_warning)
+ self.loopcarried_dependencies(kernel_dg.get_loopcarried_dependencies())
)
@@ -236,8 +240,9 @@ class Frontend(object):
if dep_dict:
longest_lcd = max(dep_dict, key=lambda ln: dep_dict[ln]['latency'])
lcd_sum = dep_dict[longest_lcd]['latency']
lcd_lines = {instr["line_number"]: lat
for instr, lat in dep_dict[longest_lcd]["dependencies"]}
lcd_lines = {
instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"]
}
s += headline_str.format(headline) + "\n"
s += (
@@ -311,18 +316,24 @@ class Frontend(object):
).format(amount, "-" * len(str(amount)))
return s
def _user_warnings(self, arch_warning, length_warning):
def _user_warnings_header(self, arch_warning, length_warning):
"""Returns warning texts for giving the user more insight in what he is doing."""
dashed_line = (
"-------------------------------------------------------------------------"
"------------------------\n"
)
arch_text = (
"WARNING: No micro-architecture was specified and a default uarch was used.\n"
" Specify the uarch with --arch. See --help for more information.\n"
"-------------------------- WARNING: No micro-architecture was specified "
"-------------------------\n"
" A default uarch for this particular ISA was used. Specify "
"the uarch with --arch.\n See --help for more information.\n" + dashed_line
)
length_text = (
"WARNING: You are analyzing a large amount of instruction forms. Analysis "
"across loops/block boundaries often do not make much sense.\n"
" Specify the kernel length with --length. See --help for more "
"information.\n"
" If this is intentional, you can safely ignore this message.\n"
"----------------- WARNING: You are analyzing a large amount of instruction forms "
"----------------\n Analysis across loops/block boundaries often do not make"
" much sense.\n Specify the kernel length with --length. See --help for more "
"information.\n If this is intentional, you can safely ignore this message.\n"
+ dashed_line
)
warnings = ""
@@ -331,6 +342,24 @@ class Frontend(object):
warnings += "\n"
return warnings
def _user_warnings_footer(self, lcd_warning):
"""Returns warning texts for giving the user more insight in what he is doing."""
dashed_line = (
"-------------------------------------------------------------------------"
"------------------------\n"
)
lcd_text = (
"-------------------------------- WARNING: LCD analysis timed out "
"-------------------------------\n While searching for all dependency chains"
" the analysis timed out and might be\n incomplete. Decrease the number of "
"instructions or set the timeout threshold\n with --lcd-timeout. See --help"
" for more information.\n" + dashed_line
)
warnings = "\n"
warnings += lcd_text if lcd_warning else ""
warnings += "\n"
return warnings
def _get_separator_list(self, separator, separator_2=" "):
"""Creates column view for seperators in the TP/combined view."""
separator_list = []

View File

@@ -146,6 +146,16 @@ def create_parser(parser=None):
action="store_true",
help="Ignore if instructions cannot be found in the data file and print analysis anyway.",
)
parser.add_argument(
"--lcd-timeout",
dest="lcd_timeout",
metavar="SECONDS",
type=int,
default=10,
help="Set timeout in seconds for LCD analysis. After timeout, OSACA will continue"
" its analysis with the dependency paths found up to this point. Defaults to 10."
" Set to -1 for no timeout.",
)
parser.add_argument(
"--verbose", "-v", action="count", default=0, help="Increases verbosity level."
)
@@ -172,6 +182,9 @@ def check_arguments(args, parser):
"""
supported_import_files = ["ibench", "asmbench"]
# manually set CLX to CSX to support both abbreviations
if args.arch and args.arch.upper() == "CLX":
args.arch = "CSX"
if args.arch is None and (args.check_db or "import_data" in args):
parser.error(
"DB check and data import cannot work with a default microarchitecture. "
@@ -303,7 +316,7 @@ def inspect(args, output_file=sys.stdout):
semantics.assign_optimal_throughput(kernel)
# Create DiGraphs
kernel_graph = KernelDG(kernel, parser, machine_model, semantics)
kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
if args.dotpath is not None:
kernel_graph.export_graph(args.dotpath if args.dotpath != "." else None)
# Print analysis
@@ -315,6 +328,7 @@ def inspect(args, output_file=sys.stdout):
ignore_unknown=ignore_unknown,
arch_warning=print_arch_warning,
length_warning=print_length_warning,
lcd_warning=kernel_graph.timed_out,
verbose=verbose,
),
file=output_file,

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3
from copy import deepcopy
import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
@@ -257,7 +256,9 @@ class ParserAArch64(BaseParser):
# 2. Parse label
if result is None:
try:
result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
result = self.process_operand(
self.label.parseString(line, parseAll=True).asDict()
)
result = AttrDict.convert_dict(result)
instruction_form[self.LABEL_ID] = result[self.LABEL_ID].name
if self.COMMENT_ID in result[self.LABEL_ID]:
@@ -292,7 +293,6 @@ class ParserAArch64(BaseParser):
try:
result = self.parse_instruction(line)
except (pp.ParseException, KeyError) as e:
raise e
raise ValueError("Unable to parse {!r} on line {}".format(line, line_number)) from e
instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
@@ -313,19 +313,24 @@ class ParserAArch64(BaseParser):
# Add operands to list
# Check first operand
if "operand1" in result:
operands.append(self.process_operand(result["operand1"]))
operand = self.process_operand(result["operand1"])
operands.extend(operand) if isinstance(operand, list) else operands.append(operand)
# Check second operand
if "operand2" in result:
operands.append(self.process_operand(result["operand2"]))
operand = self.process_operand(result["operand2"])
operands.extend(operand) if isinstance(operand, list) else operands.append(operand)
# Check third operand
if "operand3" in result:
operands.append(self.process_operand(result["operand3"]))
operand = self.process_operand(result["operand3"])
operands.extend(operand) if isinstance(operand, list) else operands.append(operand)
# Check fourth operand
if "operand4" in result:
operands.append(self.process_operand(result["operand4"]))
operand = self.process_operand(result["operand4"])
operands.extend(operand) if isinstance(operand, list) else operands.append(operand)
# Check fifth operand
if "operand5" in result:
operands.append(self.process_operand(result["operand5"]))
operand = self.process_operand(result["operand5"])
operands.extend(operand) if isinstance(operand, list) else operands.append(operand)
return_dict = AttrDict(
{
@@ -347,8 +352,8 @@ class ParserAArch64(BaseParser):
if self.REGISTER_ID in operand and (
"list" in operand[self.REGISTER_ID] or "range" in operand[self.REGISTER_ID]
):
# TODO: discuss if ranges should be converted to lists
return self.process_register_list(operand[self.REGISTER_ID])
# resolve ranges and lists
return self.resolve_range_list(self.process_register_list(operand[self.REGISTER_ID]))
if self.REGISTER_ID in operand and operand[self.REGISTER_ID]["name"] == "sp":
return self.process_sp_register(operand[self.REGISTER_ID])
# add value attribute to floating point immediates without exponent
@@ -366,6 +371,8 @@ class ParserAArch64(BaseParser):
offset = memory_address.get("offset", None)
if isinstance(offset, list) and len(offset) == 1:
offset = offset[0]
if offset is not None and "value" in offset:
offset["value"] = int(offset["value"], 0)
base = memory_address.get("base", None)
index = memory_address.get("index", None)
scale = 1
@@ -382,7 +389,12 @@ class ParserAArch64(BaseParser):
if "pre_indexed" in memory_address:
new_dict["pre_indexed"] = True
if "post_indexed" in memory_address:
new_dict["post_indexed"] = memory_address["post_indexed"]
if "value" in memory_address["post_indexed"]:
new_dict["post_indexed"] = {"value": int(
memory_address["post_indexed"]["value"], 0
)}
else:
new_dict["post_indexed"] = memory_address["post_indexed"]
return AttrDict({self.MEMORY_ID: new_dict})
def process_sp_register(self, register):
@@ -391,6 +403,37 @@ class ParserAArch64(BaseParser):
reg["prefix"] = "x"
return AttrDict({self.REGISTER_ID: reg})
def resolve_range_list(self, operand):
"""
Resolve range or list register operand to list of registers.
Returns None if neither list nor range
"""
if 'register' in operand:
if 'list' in operand.register:
index = operand.register.get('index')
range_list = []
for reg in operand.register.list:
reg = deepcopy(reg)
if index is not None:
reg['index'] = int(index, 0)
range_list.append(AttrDict({self.REGISTER_ID: reg}))
return range_list
elif 'range' in operand.register:
base_register = operand.register.range[0]
index = operand.register.get('index')
range_list = []
start_name = base_register.name
end_name = operand.register.range[1].name
for name in range(int(start_name), int(end_name) + 1):
reg = deepcopy(base_register)
if index is not None:
reg['index'] = int(index, 0)
reg['name'] = str(name)
range_list.append(AttrDict({self.REGISTER_ID: reg}))
return range_list
# neither register list nor range, return unmodified
return operand
def process_register_list(self, register_list):
"""Post-process register lists (e.g., {r0,r3,r5}) and register ranges (e.g., {r0-r7})"""
# Remove unnecessarily created dictionary entries during parsing
@@ -419,11 +462,13 @@ class ParserAArch64(BaseParser):
if "value" in immediate:
# normal integer value
immediate["type"] = "int"
# convert hex/bin immediates to dec
immediate["value"] = self.normalize_imd(immediate)
return AttrDict({self.IMMEDIATE_ID: immediate})
if "base_immediate" in immediate:
# arithmetic immediate, add calculated value as value
immediate["shift"] = immediate["shift"][0]
immediate["value"] = int(immediate["base_immediate"]["value"], 0) << int(
immediate["value"] = self.normalize_imd(immediate["base_immediate"]) << int(
immediate["shift"]["value"]
)
immediate["type"] = "int"
@@ -471,10 +516,11 @@ class ParserAArch64(BaseParser):
def normalize_imd(self, imd):
"""Normalize immediate to decimal based representation"""
if "value" in imd:
if imd["value"].lower().startswith("0x"):
# hex, return decimal
return int(imd["value"], 16)
return int(imd["value"], 10)
if isinstance(imd["value"], str):
# hex or bin, return decimal
return int(imd["value"], 0)
else:
return imd["value"]
elif "float" in imd:
return self.ieee_to_float(imd["float"])
elif "double" in imd:

View File

@@ -108,7 +108,8 @@ class ParserX86ATT(BaseParser):
)
)
memory_segmentation = (
self.register.setResultsName("base")
pp.Optional(pp.Suppress(pp.Literal("*")))
+ self.register.setResultsName("base")
+ pp.Literal(":")
+ segment_extension.setResultsName(self.SEGMENT_EXT_ID)
)
@@ -326,9 +327,14 @@ class ParserX86ATT(BaseParser):
offset = memory_address.get("offset", None)
base = memory_address.get("base", None)
index = memory_address.get("index", None)
scale = 1 if "scale" not in memory_address else int(memory_address["scale"])
scale = 1 if "scale" not in memory_address else int(memory_address["scale"], 0)
if isinstance(offset, str) and base is None and index is None:
offset = {"value": offset}
try:
offset = {"value": int(offset, 0)}
except ValueError:
offset = {"value": offset}
elif offset is not None and "value" in offset:
offset["value"] = int(offset["value"], 0)
new_dict = AttrDict({"offset": offset, "base": base, "index": index, "scale": scale})
# Add segmentation extension if existing
if self.SEGMENT_EXT_ID in memory_address:
@@ -346,7 +352,8 @@ class ParserX86ATT(BaseParser):
if "identifier" in immediate:
# actually an identifier, change declaration
return immediate
# otherwise nothing to do
# otherwise just make sure the immediate is a decimal
immediate["value"] = int(immediate["value"], 0)
return AttrDict({self.IMMEDIATE_ID: immediate})
def get_full_reg_name(self, register):
@@ -357,10 +364,11 @@ class ParserX86ATT(BaseParser):
def normalize_imd(self, imd):
"""Normalize immediate to decimal based representation"""
if "value" in imd:
if imd["value"].lower().startswith("0x"):
# hex, return decimal
return int(imd["value"], 16)
return int(imd["value"], 10)
if isinstance(imd["value"], str):
# return decimal
return int(imd["value"], 0)
else:
return imd["value"]
# identifier
return imd

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
from itertools import chain
from copy import deepcopy
from osaca import utils
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
@@ -122,6 +123,7 @@ class ISASemantics(object):
"pre_indexed": pre_indexed,
"post_indexed": post_indexed})
)
# store operand list in dict and reassign operand key/value pair
instruction_form["semantic_operands"] = AttrDict.convert_dict(op_dict)
# assign LD/ST flags
@@ -130,6 +132,7 @@ class ISASemantics(object):
instruction_form["flags"] += [INSTR_FLAGS.HAS_LD]
if self._has_store(instruction_form):
instruction_form["flags"] += [INSTR_FLAGS.HAS_ST]
def get_reg_changes(self, instruction_form, only_postindexed=False):
"""
@@ -160,16 +163,16 @@ class ISASemantics(object):
if only_postindexed:
for o in instruction_form.operands:
if 'post_indexed' in o.get('memory', {}):
base_name = o.memory.base.get('prefix', '')+o.memory.base.name
base_name = o.memory.base.get('prefix', '') + o.memory.base.name
return {base_name: {
'name': o.memory.base.get('prefix', '')+o.memory.base.name,
'value': int(o.memory.post_indexed.value)
'name': o.memory.base.get('prefix', '') + o.memory.base.name,
'value': o.memory.post_indexed.value
}}
return {}
reg_operand_names = {} # e.g., {'rax': 'op1'}
operand_state = {} # e.g., {'op1': {'name': 'rax', 'value': 0}} 0 means unchanged
for o in instruction_form.operands:
if 'pre_indexed' in o.get('memory', {}):
# Assuming no isa_data.operation
@@ -177,24 +180,24 @@ class ISASemantics(object):
raise ValueError(
"ISA information for pre-indexed instruction {!r} has operation set."
"This is currently not supprted.".format(instruction_form.line))
base_name = o.memory.base.get('prefix', '')+o.memory.base.name
base_name = o.memory.base.get('prefix', '') + o.memory.base.name
reg_operand_names = {base_name: 'op1'}
operand_state = {'op1': {
'name': base_name,
'value': int(o.memory.offset.value)
'value': o.memory.offset.value
}}
if isa_data is not None and 'operation' in isa_data:
for i, o in enumerate(instruction_form.operands):
operand_name = "op{}".format(i+1)
operand_name = "op{}".format(i + 1)
if "register" in o:
o_reg_name = o["register"].get('prefix', '')+o["register"]["name"]
o_reg_name = o["register"].get('prefix', '') + o["register"]["name"]
reg_operand_names[o_reg_name] = operand_name
operand_state[operand_name] = {
'name': o_reg_name,
'value': 0}
elif "immediate" in o:
operand_state[operand_name] = {'value': int(o["immediate"]["value"])}
operand_state[operand_name] = {'value': o["immediate"]["value"]}
elif "memory" in o:
# TODO lea needs some thinking about
pass
@@ -209,7 +212,7 @@ class ISASemantics(object):
"""
Create operand dictionary containing src/dst operands out of the ISA data entry and
the operands of an instruction form
If breaks_pedendency_on_equal_operands is True (configured per instruction in ISA db)
and all operands are equal, place operand into destination only.

View File

@@ -1,22 +1,39 @@
#!/usr/bin/env python3
import copy
from itertools import chain, product
from collections import defaultdict
import os
import signal
import time
from itertools import chain
from multiprocessing import Manager, Process, cpu_count
import networkx as nx
from osaca.semantics import INSTR_FLAGS, ArchSemantics, MachineModel
from osaca.parser import AttrDict
from osaca.semantics import INSTR_FLAGS, MachineModel, ArchSemantics
class KernelDG(nx.DiGraph):
def __init__(self, parsed_kernel, parser, hw_model: MachineModel, semantics: ArchSemantics):
# threshold for checking dependency graph sequential or in parallel
INSTRUCTION_THRESHOLD = 50
def __init__(
self, parsed_kernel, parser, hw_model: MachineModel, semantics: ArchSemantics, timeout=10
):
self.timed_out = False
self.kernel = parsed_kernel
self.parser = parser
self.model = hw_model
self.arch_sem = semantics
self.dg = self.create_DG(self.kernel)
self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel)
self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel, timeout)
def _extend_path(self, dst_list, kernel, dg, offset):
for instr in kernel:
generator_path = nx.algorithms.simple_paths.all_simple_paths(
dg, instr.line_number, instr.line_number + offset
)
tmp_list = list(generator_path)
dst_list.extend(tmp_list)
# print('Thread [{}-{}] done'.format(kernel[0]['line_number'], kernel[-1]['line_number']))
def create_DG(self, kernel):
"""
@@ -65,17 +82,19 @@ class KernelDG(nx.DiGraph):
dg.nodes[dep["line_number"]]["instruction_form"] = dep
return dg
def check_for_loopcarried_dep(self, kernel):
def check_for_loopcarried_dep(self, kernel, timeout=10):
"""
Try to find loop-carried dependencies in given kernel.
:param kernel: Parsed asm kernel with assigned semantic information
:type kernel: list
:param timeout: Timeout in seconds for parallel execution, defaults
to `10`. Set to `-1` for no timeout
:type timeout: int
:returns: `dict` -- dependency dictionary with all cyclic LCDs
"""
# increase line number for second kernel loop
offset = max(1000, max([i.line_number for i in kernel]))
first_line_no = kernel[0].line_number
tmp_kernel = [] + kernel
for orig_iform in kernel:
temp_iform = copy.copy(orig_iform)
@@ -86,13 +105,59 @@ class KernelDG(nx.DiGraph):
# build cyclic loop-carried dependencies
loopcarried_deps = []
paths = []
for instr in kernel:
paths += list(nx.algorithms.simple_paths.all_simple_paths(
dg, instr.line_number, instr.line_number + offset))
all_paths = []
klen = len(kernel)
if klen >= self.INSTRUCTION_THRESHOLD:
# parallel execution with static scheduling
num_cores = cpu_count()
workload = int((klen - 1) / num_cores) + 1
starts = [tid * workload for tid in range(num_cores)]
ends = [min((tid + 1) * workload, klen) for tid in range(num_cores)]
instrs = [kernel[s:e] for s, e in zip(starts, ends)]
with Manager() as manager:
all_paths = manager.list()
processes = [
Process(target=self._extend_path, args=(all_paths, instr_section, dg, offset))
for instr_section in instrs
]
for p in processes:
p.start()
if (timeout == -1):
# no timeout
for p in processes:
p.join()
else:
start_time = time.time()
while time.time() - start_time <= timeout:
if any(p.is_alive() for p in processes):
time.sleep(0.2)
else:
# all procs done
for p in processes:
p.join()
break
else:
self.timed_out = True
# terminate running processes
for p in processes:
if p.is_alive():
# Python 3.6 does not support Process.kill().
# Can be changed to `p.kill()` after EoL (01/22) of Py3.6
os.kill(p.pid, signal.SIGKILL)
p.join()
all_paths = list(all_paths)
else:
# sequential execution to avoid overhead when analyzing smaller kernels
for instr in kernel:
all_paths.extend(
nx.algorithms.simple_paths.all_simple_paths(
dg, instr.line_number, instr.line_number + offset
)
)
paths_set = set()
for path in paths:
for path in all_paths:
lat_sum = 0.0
# extend path by edge bound latencies (e.g., store-to-load latency)
lat_path = []
@@ -120,8 +185,10 @@ class KernelDG(nx.DiGraph):
for lat_sum, involved_lines in loopcarried_deps:
loopcarried_deps_dict[involved_lines[0][0]] = {
"root": self._get_node_by_lineno(involved_lines[0][0]),
"dependencies": [(self._get_node_by_lineno(ln), lat) for ln, lat in involved_lines],
"latency": lat_sum
"dependencies": [
(self._get_node_by_lineno(ln), lat) for ln, lat in involved_lines
],
"latency": lat_sum,
}
return loopcarried_deps_dict
@@ -167,9 +234,7 @@ class KernelDG(nx.DiGraph):
# split to DAG
raise NotImplementedError("Kernel is cyclic.")
def find_depending(
self, instruction_form, instructions, flag_dependencies=False
):
def find_depending(self, instruction_form, instructions, flag_dependencies=False):
"""
Find instructions in `instructions` depending on a given instruction form's results.
@@ -189,15 +254,15 @@ class KernelDG(nx.DiGraph):
# TODO instructions before must be considered as well, if they update registers
# not used by instruction_form. E.g., validation/build/A64FX/gcc/O1/gs-2d-5pt.marked.s
register_changes = self._update_reg_changes(instruction_form)
#print("FROM", instruction_form.line, register_changes)
# print("FROM", instruction_form.line, register_changes)
for i, instr_form in enumerate(instructions):
self._update_reg_changes(instr_form, register_changes)
#print(" TO", instr_form.line, register_changes)
# print(" TO", instr_form.line, register_changes)
if "register" in dst:
# read of register
if self.is_read(dst.register, instr_form) and not (
dst.get("pre_indexed", False) or
dst.get("post_indexed", False)):
dst.get("pre_indexed", False) or dst.get("post_indexed", False)
):
yield instr_form, []
# write to register -> abort
if self.is_written(dst.register, instr_form):
@@ -214,10 +279,10 @@ class KernelDG(nx.DiGraph):
if "pre_indexed" in dst.memory:
if self.is_written(dst.memory.base, instr_form):
break
#if dst.memory.base:
# if dst.memory.base:
# if self.is_read(dst.memory.base, instr_form):
# yield instr_form, []
#if dst.memory.index:
# if dst.memory.index:
# if self.is_read(dst.memory.index, instr_form):
# yield instr_form, []
if "post_indexed" in dst.memory:
@@ -225,7 +290,7 @@ class KernelDG(nx.DiGraph):
if self.is_written(dst.memory.base, instr_form):
break
# TODO record register changes
# (e.g., mov, leaadd, sub, inc, dec) in instructions[:i]
# (e.g., mov, leaadd, sub, inc, dec) in instructions[:i]
# and pass to is_memload and is_memstore to consider relevance.
# load from same location (presumed)
if self.is_memload(dst.memory, instr_form, register_changes):
@@ -285,7 +350,9 @@ class KernelDG(nx.DiGraph):
if src.memory.base is not None:
is_read = self.parser.is_reg_dependend_of(register, src.memory.base) or is_read
if src.memory.index is not None:
is_read = self.parser.is_reg_dependend_of(register, src.memory.index) or is_read
is_read = (
self.parser.is_reg_dependend_of(register, src.memory.index) or is_read
)
# Check also if read in destination memory address
for dst in chain(
instruction_form.semantic_operands.destination,
@@ -295,7 +362,9 @@ class KernelDG(nx.DiGraph):
if dst.memory.base is not None:
is_read = self.parser.is_reg_dependend_of(register, dst.memory.base) or is_read
if dst.memory.index is not None:
is_read = self.parser.is_reg_dependend_of(register, dst.memory.index) or is_read
is_read = (
self.parser.is_reg_dependend_of(register, dst.memory.index) or is_read
)
return is_read
def is_memload(self, mem, instruction_form, register_changes={}):
@@ -313,41 +382,43 @@ class KernelDG(nx.DiGraph):
# determine absolute address change
addr_change = 0
if src.offset and "value" in src.offset:
addr_change += int(src.offset.value)
addr_change += src.offset.value
if mem.offset:
addr_change -= int(mem.offset.value)
addr_change -= mem.offset.value
if mem.base and src.base:
base_change = register_changes.get(
src.base.get('prefix', '')+src.base.name,
{'name': src.base.get('prefix', '')+src.base.name, 'value': 0})
src.base.get('prefix', '') + src.base.name,
{'name': src.base.get('prefix', '') + src.base.name, 'value': 0},
)
if base_change is None:
# Unknown change occurred
continue
if mem.base.get('prefix', '')+mem.base['name'] != base_change['name']:
if mem.base.get('prefix', '') + mem.base['name'] != base_change['name']:
# base registers do not match
continue
addr_change += base_change['value']
elif mem.base or src.base:
# base registers do not match
continue
# base registers do not match
continue
if mem.index and src.index:
index_change = register_changes.get(
src.index.get('prefix', '')+src.index.name,
{'name': src.index.get('prefix', '')+src.index.name, 'value': 0})
src.index.get('prefix', '') + src.index.name,
{'name': src.index.get('prefix', '') + src.index.name, 'value': 0},
)
if index_change is None:
# Unknown change occurred
continue
if mem.scale != src.scale:
# scale factors do not match
continue
if mem.index.get('prefix', '')+mem.index['name'] != index_change['name']:
if mem.index.get('prefix', '') + mem.index['name'] != index_change['name']:
# index registers do not match
continue
addr_change += index_change['value'] * src.scale
elif mem.index or src.index:
# index registers do not match
continue
#if instruction_form.line_number == 3:
# index registers do not match
continue
# if instruction_form.line_number == 3:
if addr_change == 0:
return True
return False

View File

@@ -91,7 +91,6 @@ setup(
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
@@ -107,7 +106,7 @@ setup(
# requirements files see:
# https://packaging.python.org/en/latest/requirements.html
install_requires=["networkx", "pyparsing>=2.3.1", "ruamel.yaml>=0.15.71"],
python_requires=">=3.5",
python_requires=">=3.6",
# List additional groups of dependencies here (e.g. development
# dependencies). You can install these using the following syntax,
# for example:

View File

@@ -183,14 +183,38 @@ class TestCLI(unittest.TestCase):
output = StringIO()
osaca.run(args, output_file=output)
# WARNING for length
self.assertTrue(output.getvalue().count("WARNING") == 1)
self.assertTrue(
output.getvalue().count(
"WARNING: You are analyzing a large amount of instruction forms"
)
== 1
)
# WARNING for arch
args = parser.parse_args(
["--lines", "100-199", "--ignore-unknown", self._find_test_file(kernel)]
)
output = StringIO()
osaca.run(args, output_file=output)
# WARNING for arch
self.assertTrue(output.getvalue().count("WARNING") == 1)
self.assertTrue(
output.getvalue().count("WARNING: No micro-architecture was specified") == 1
)
# WARNING for timeout
args = parser.parse_args(
["--ignore-unknown", "--lcd-timeout", "0", self._find_test_file(kernel)]
)
output = StringIO()
osaca.run(args, output_file=output)
self.assertTrue(
output.getvalue().count("WARNING: LCD analysis timed out") == 1
)
args = parser.parse_args(
["--ignore-unknown", "--lcd-timeout", "-1", self._find_test_file(kernel)]
)
output = StringIO()
osaca.run(args, output_file=output)
self.assertTrue(
output.getvalue().count("WARNING: LCD analysis timed out") == 0
)
def test_lines_arg(self):
# Run tests with --lines option

View File

@@ -0,0 +1,32 @@
// OSACA-BEGIN
.L5:
add x10, x1, x11
add x6, x1, x8
ld2d {z0.d - z1.d}, p1/z, [x10]
ld2d {z2.d - z3.d}, p1/z, [x6]
mov z5.d, z1.d
fadd z20.d, z3.d, z3.d
mov z1.d, z0.d
add x6, x1, x7
fadd z2.d, z2.d, z2.d
ld2d {z6.d - z7.d}, p1/z, [x6]
fmul z4.d, z5.d, z20.d
add x10, x1, x12
mov z0.d, z7.d
ld2d {z16.d - z17.d}, p1/z, [x10]
mov z3.d, z4.d
fmls z3.d, p0/m, z0.d, z17.d
fmul z0.d, z0.d, z16.d
fmla z3.d, p0/m, z6.d, z16.d
fmla z0.d, p0/m, z6.d, z17.d
fmls z3.d, p0/m, z1.d, z2.d
fmls z0.d, p0/m, z1.d, z20.d
mov z18.d, z3.d
fmsb z5.d, p0/m, z2.d, z0.d
mov z19.d, z5.d
st2d {z18.d - z19.d}, p1, [x6]
add x5, x5, 8
add x1, x1, 128
whilelo p1.d, x5, x9
bne .L5
// OSACA-END

View File

@@ -0,0 +1,192 @@
# OSACA-BEGIN
push %r12
push %r13
push %r14
push %r15
push %rbp
mov %ecx,%r12d
mov %esi,%r14d
mov %r12d,%ecx
mov %r14d,%esi
mov %rdx,%r13
mov %rdi,%rbp
callq 0x4210d0
mov %rdx,%r8
movzbl (%rdi),%r9d
movslq %esi,%rsi
movslq %ecx,%rcx
movzbl (%r8),%r10d
vmovd %r9d,%xmm13
movzbl 0x4(%r8),%r9d
vpinsrb $0x1,(%rsi,%rdi,1),%xmm13,%xmm14
lea (%rsi,%rsi,2),%rdx
vmovd %r10d,%xmm1
vpinsrb $0x1,(%rcx,%r8,1),%xmm1,%xmm0
vmovd %r9d,%xmm7
vpinsrb $0x1,0x4(%rcx,%r8,1),%xmm7,%xmm5
vpinsrb $0x2,(%rdi,%rsi,2),%xmm14,%xmm15
vpinsrb $0x2,(%r8,%rcx,2),%xmm0,%xmm6
vpinsrb $0x2,0x4(%r8,%rcx,2),%xmm5,%xmm9
vpinsrb $0x3,(%rdx,%rdi,1),%xmm15,%xmm4
movzbl 0x4(%rdi),%r11d
lea (%rcx,%rcx,2),%rax
vpinsrb $0x3,(%rax,%r8,1),%xmm6,%xmm10
vpinsrb $0x3,0x4(%rax,%r8,1),%xmm9,%xmm11
vmovd %r11d,%xmm2
vpinsrb $0x1,0x4(%rsi,%rdi,1),%xmm2,%xmm8
vpinsrb $0x2,0x4(%rdi,%rsi,2),%xmm8,%xmm3
movzbl 0x1(%rdi),%r10d
movzbl 0x5(%rdi),%r9d
movzbl 0x1(%r8),%r11d
vmovd %r10d,%xmm1
movzbl 0x5(%r8),%r10d
vmovd %r9d,%xmm7
vpmovzxbd %xmm4,%xmm4
vmovd %r11d,%xmm2
vpmovzxbd %xmm10,%xmm10
vpinsrb $0x3,0x4(%rdx,%rdi,1),%xmm3,%xmm12
vpsubd %xmm10,%xmm4,%xmm14
vpinsrb $0x1,0x5(%rsi,%rdi,1),%xmm7,%xmm5
vmovd %r10d,%xmm4
vpinsrb $0x1,0x5(%rcx,%r8,1),%xmm4,%xmm10
vpinsrb $0x1,0x1(%rcx,%r8,1),%xmm2,%xmm8
vpinsrb $0x1,0x1(%rsi,%rdi,1),%xmm1,%xmm0
vpinsrb $0x2,0x5(%rdi,%rsi,2),%xmm5,%xmm9
vpinsrb $0x2,0x1(%r8,%rcx,2),%xmm8,%xmm3
vpinsrb $0x2,0x1(%rdi,%rsi,2),%xmm0,%xmm6
vpmovzxbd %xmm12,%xmm12
vpmovzxbd %xmm11,%xmm11
vpsubd %xmm11,%xmm12,%xmm13
vpinsrb $0x2,0x5(%r8,%rcx,2),%xmm10,%xmm11
vpslld $0x10,%xmm13,%xmm15
vpinsrb $0x3,0x1(%rdx,%rdi,1),%xmm6,%xmm13
vpaddd %xmm15,%xmm14,%xmm12
vpinsrb $0x3,0x5(%rdx,%rdi,1),%xmm9,%xmm15
vpinsrb $0x3,0x1(%rax,%r8,1),%xmm3,%xmm14
vpinsrb $0x3,0x5(%rax,%r8,1),%xmm11,%xmm1
movzbl 0x2(%rdi),%r11d
movzbl 0x2(%r8),%r9d
vpmovzxbd %xmm15,%xmm15
vmovd %r11d,%xmm8
vmovd %r9d,%xmm5
vpinsrb $0x1,0x2(%rsi,%rdi,1),%xmm8,%xmm3
vpinsrb $0x1,0x2(%rcx,%r8,1),%xmm5,%xmm9
vpinsrb $0x2,0x2(%rdi,%rsi,2),%xmm3,%xmm7
vpinsrb $0x2,0x2(%r8,%rcx,2),%xmm9,%xmm4
vpinsrb $0x3,0x2(%rdx,%rdi,1),%xmm7,%xmm3
vpinsrb $0x3,0x2(%rax,%r8,1),%xmm4,%xmm7
vpmovzxbd %xmm1,%xmm1
movzbl 0x6(%r8),%r11d
vpsubd %xmm1,%xmm15,%xmm0
vpmovzxbd %xmm13,%xmm13
vpslld $0x10,%xmm0,%xmm2
vpmovzxbd %xmm14,%xmm14
vpsubd %xmm14,%xmm13,%xmm6
vpaddd %xmm2,%xmm6,%xmm11
vmovd %r11d,%xmm6
vpinsrb $0x1,0x6(%rcx,%r8,1),%xmm6,%xmm2
movzbl 0x6(%rdi),%r10d
vpinsrb $0x2,0x6(%r8,%rcx,2),%xmm2,%xmm8
vmovd %r10d,%xmm10
vpinsrb $0x1,0x6(%rsi,%rdi,1),%xmm10,%xmm1
vpinsrb $0x3,0x6(%rax,%r8,1),%xmm8,%xmm9
vpinsrb $0x2,0x6(%rdi,%rsi,2),%xmm1,%xmm0
movzbl 0x3(%rdi),%r9d
movzbl 0x7(%rdi),%r11d
vpmovzxbd %xmm3,%xmm3
vpmovzxbd %xmm7,%xmm7
vmovd %r9d,%xmm14
vmovd %r11d,%xmm8
vpsubd %xmm7,%xmm3,%xmm10
vpinsrb $0x1,0x3(%rsi,%rdi,1),%xmm14,%xmm15
vpinsrb $0x1,0x7(%rsi,%rdi,1),%xmm8,%xmm3
vpinsrb $0x3,0x6(%rdx,%rdi,1),%xmm0,%xmm5
vpinsrb $0x2,0x3(%rdi,%rsi,2),%xmm15,%xmm1
vpinsrb $0x2,0x7(%rdi,%rsi,2),%xmm3,%xmm7
vpaddd %xmm11,%xmm12,%xmm3
vpmovzxbd %xmm5,%xmm5
vpmovzxbd %xmm9,%xmm9
vpsubd %xmm9,%xmm5,%xmm4
vpslld $0x10,%xmm4,%xmm13
vpinsrb $0x3,0x7(%rdx,%rdi,1),%xmm7,%xmm15
vpaddd %xmm13,%xmm10,%xmm10
vpinsrb $0x3,0x3(%rdx,%rdi,1),%xmm1,%xmm13
movzbl 0x7(%r8),%edx
movzbl 0x3(%r8),%r10d
vpmovzxbd %xmm15,%xmm15
vmovd %edx,%xmm5
vpinsrb $0x1,0x7(%rcx,%r8,1),%xmm5,%xmm9
vmovd %r10d,%xmm0
vpinsrb $0x1,0x3(%rcx,%r8,1),%xmm0,%xmm6
vpinsrb $0x2,0x7(%r8,%rcx,2),%xmm9,%xmm4
vpinsrb $0x2,0x3(%r8,%rcx,2),%xmm6,%xmm2
vpinsrb $0x3,0x7(%rax,%r8,1),%xmm4,%xmm1
vpinsrb $0x3,0x3(%rax,%r8,1),%xmm2,%xmm14
vpmovzxbd %xmm1,%xmm1
vpmovzxbd %xmm13,%xmm13
vpsubd %xmm1,%xmm15,%xmm0
vpmovzxbd %xmm14,%xmm14
vpslld $0x10,%xmm0,%xmm2
vpsubd %xmm14,%xmm13,%xmm6
vpsubd %xmm11,%xmm12,%xmm1
vpaddd %xmm2,%xmm6,%xmm8
vpaddd %xmm8,%xmm10,%xmm12
vpsubd %xmm8,%xmm10,%xmm0
vpaddd %xmm12,%xmm3,%xmm8
vpaddd %xmm0,%xmm1,%xmm7
vpsubd %xmm12,%xmm3,%xmm3
vpsubd %xmm0,%xmm1,%xmm5
vunpcklps %xmm7,%xmm8,%xmm6
vunpcklps %xmm5,%xmm3,%xmm2
vunpckhps %xmm7,%xmm8,%xmm9
vunpckhps %xmm5,%xmm3,%xmm4
vunpcklpd %xmm2,%xmm6,%xmm10
vunpckhpd %xmm2,%xmm6,%xmm11
vunpcklpd %xmm4,%xmm9,%xmm12
vpaddd %xmm11,%xmm10,%xmm14
vunpckhpd %xmm4,%xmm9,%xmm13
vpsubd %xmm11,%xmm10,%xmm1
vpaddd %xmm13,%xmm12,%xmm15
vpsubd %xmm13,%xmm12,%xmm0
vpaddd %xmm15,%xmm14,%xmm9
vpaddd %xmm0,%xmm1,%xmm7
vpsubd %xmm15,%xmm14,%xmm8
vpsubd %xmm0,%xmm1,%xmm6
vmovdqu 0x279d68(%rip),%xmm15
vpsrld $0xf,%xmm9,%xmm2
vpsrld $0xf,%xmm7,%xmm10
vpand %xmm15,%xmm2,%xmm3
vmovdqu 0x279d40(%rip),%xmm4
vpand %xmm15,%xmm10,%xmm11
vpsrld $0xf,%xmm8,%xmm12
vpsrld $0xf,%xmm6,%xmm14
vpmulld %xmm3,%xmm4,%xmm5
vpand %xmm15,%xmm12,%xmm13
vpmulld %xmm11,%xmm4,%xmm3
vpand %xmm15,%xmm14,%xmm1
vpmulld %xmm13,%xmm4,%xmm2
vpaddd %xmm3,%xmm7,%xmm7
vpmulld %xmm1,%xmm4,%xmm0
vpaddd %xmm5,%xmm9,%xmm4
vpxor %xmm5,%xmm4,%xmm5
vpxor %xmm3,%xmm7,%xmm9
vpaddd %xmm2,%xmm8,%xmm8
vpaddd %xmm9,%xmm5,%xmm3
vpxor %xmm2,%xmm8,%xmm2
vpaddd %xmm0,%xmm6,%xmm6
vpaddd %xmm2,%xmm3,%xmm4
vpxor %xmm0,%xmm6,%xmm0
vpaddd %xmm0,%xmm4,%xmm2
vpxor %xmm1,%xmm1,%xmm1
vpaddd %xmm2,%xmm1,%xmm1
vpsrldq $0x8,%xmm1,%xmm3
vpaddd %xmm3,%xmm1,%xmm5
vpsrlq $0x20,%xmm5,%xmm6
vpaddd %xmm6,%xmm5,%xmm7
vmovd %xmm7,%ecx
movzwl %cx,%eax
shr $0x10,%ecx
add %ecx,%eax
shr %eax
retq
# OSACA-END

View File

@@ -102,7 +102,7 @@ class TestParserAArch64(unittest.TestCase):
self.assertEqual(parsed_3.instruction, "mov")
self.assertEqual(parsed_3.operands[0].register.name, "2")
self.assertEqual(parsed_3.operands[0].register.prefix, "x")
self.assertEqual(parsed_3.operands[1].immediate.value, "0x222")
self.assertEqual(parsed_3.operands[1].immediate.value, int("0x222", 0))
self.assertEqual(parsed_3.comment, "NOT IACA END")
self.assertEqual(parsed_4.instruction, "str")
@@ -208,7 +208,7 @@ class TestParserAArch64(unittest.TestCase):
{"prfop": {"type": ["PLD"], "target": ["L1"], "policy": ["KEEP"]}},
{
"memory": {
"offset": {"value": "2048"},
"offset": {"value": 2048},
"base": {"prefix": "x", "name": "26"},
"index": None,
"scale": 1,
@@ -228,7 +228,7 @@ class TestParserAArch64(unittest.TestCase):
{"register": {"prefix": "x", "name": "30"}},
{
"memory": {
"offset": {"value": "-16"},
"offset": {"value": -16},
"base": {"name": "sp", "prefix": "x"},
"index": None,
"scale": 1,
@@ -253,7 +253,7 @@ class TestParserAArch64(unittest.TestCase):
"base": {"prefix": "x", "name": "11"},
"index": None,
"scale": 1,
"post_indexed": {"value": "64"},
"post_indexed": {"value": 64},
}
},
],
@@ -270,7 +270,7 @@ class TestParserAArch64(unittest.TestCase):
{"register": {"prefix": "p", "name": "0", "predication": "m"}},
{"register": {"prefix": "z", "name": "29", "shape": "d"}},
{"register": {"prefix": "z", "name": "21", "shape": "d"}},
{"immediate": {"value": "90", "type": "int"}},
{"immediate": {"value": 90, "type": "int"}},
],
"directive": None,
"comment": None,
@@ -326,32 +326,34 @@ class TestParserAArch64(unittest.TestCase):
def test_multiple_regs(self):
instr_range = "PUSH {x5-x7}"
reg_range = AttrDict(
{
"register": {
"range": [{"prefix": "x", "name": "5"}, {"prefix": "x", "name": "7"}],
"index": None,
}
}
)
instr_list = "POP {x5, x7, x9}"
reg_list = AttrDict(
{
"register": {
"list": [
{"prefix": "x", "name": "5"},
{"prefix": "x", "name": "7"},
{"prefix": "x", "name": "9"},
],
"index": None,
}
}
)
instr_list = "POP {x5, x6, x7}"
instr_range_with_index = "ld4 {v0.S - v3.S}[2]"
instr_list_with_index = "ld4 {v0.S, v1.S, v2.S, v3.S}[2]"
instr_range_single = "dummy { z1.d }"
reg_list = [
AttrDict({"register": {"prefix": "x", "name": "5"}}),
AttrDict({"register": {"prefix": "x", "name": "6"}}),
AttrDict({"register": {"prefix": "x", "name": "7"}}),
]
reg_list_idx = [
AttrDict({"register": {"prefix": "v", "name": "0", "shape": "S", "index": 2}}),
AttrDict({"register": {"prefix": "v", "name": "1", "shape": "S", "index": 2}}),
AttrDict({"register": {"prefix": "v", "name": "2", "shape": "S", "index": 2}}),
AttrDict({"register": {"prefix": "v", "name": "3", "shape": "S", "index": 2}}),
]
reg_list_single = [AttrDict({"register": {"prefix": "z", "name": "1", "shape": "d"}})]
prange = self.parser.parse_line(instr_range)
plist = self.parser.parse_line(instr_list)
p_idx_range = self.parser.parse_line(instr_range_with_index)
p_idx_list = self.parser.parse_line(instr_list_with_index)
p_single = self.parser.parse_line(instr_range_single)
self.assertEqual(prange.operands[0], reg_range)
self.assertEqual(plist.operands[0], reg_list)
self.assertEqual(prange.operands, reg_list)
self.assertEqual(plist.operands, reg_list)
self.assertEqual(p_idx_range.operands, reg_list_idx)
self.assertEqual(p_idx_list.operands, reg_list_idx)
self.assertEqual(p_single.operands, reg_list_single)
def test_reg_dependency(self):
reg_1_1 = AttrDict({"prefix": "b", "name": "1"})

View File

@@ -120,12 +120,12 @@ class TestParserX86ATT(unittest.TestCase):
self.assertIsNone(parsed_2.comment)
self.assertEqual(parsed_3.instruction, "movl")
self.assertEqual(parsed_3.operands[0].immediate.value, "222")
self.assertEqual(parsed_3.operands[0].immediate.value, 222)
self.assertEqual(parsed_3.operands[1].register.name, "ebx")
self.assertEqual(parsed_3.comment, "IACA END")
self.assertEqual(parsed_4.instruction, "vmovss")
self.assertEqual(parsed_4.operands[1].memory.offset.value, "-4")
self.assertEqual(parsed_4.operands[1].memory.offset.value, -4)
self.assertEqual(parsed_4.operands[1].memory.base.name, "rsp")
self.assertEqual(parsed_4.operands[1].memory.index.name, "rax")
self.assertEqual(parsed_4.operands[1].memory.scale, 8)
@@ -146,7 +146,7 @@ class TestParserX86ATT(unittest.TestCase):
self.assertEqual(parsed_6.operands[0].memory.scale, 8)
self.assertEqual(parsed_6.operands[1].register.name, "rbx")
self.assertEqual(parsed_7.operands[0].immediate.value, "0x1")
self.assertEqual(parsed_7.operands[0].immediate.value, 0x1)
self.assertEqual(parsed_7.operands[1].register.name, "xmm0")
self.assertEqual(parsed_7.operands[2].register.name, "ymm1")
self.assertEqual(parsed_7.operands[3].register.name, "ymm1")
@@ -189,7 +189,7 @@ class TestParserX86ATT(unittest.TestCase):
"operands": [
{
"memory": {
"offset": {"value": "2"},
"offset": {"value": 2},
"base": {"name": "rax"},
"index": {"name": "rax"},
"scale": 1,
@@ -240,7 +240,7 @@ class TestParserX86ATT(unittest.TestCase):
imd_decimal_1 = {"value": "79"}
imd_hex_1 = {"value": "0x4f"}
imd_decimal_2 = {"value": "8"}
imd_hex_2 = {"value": "0x8"}
imd_hex_2 = {"value": "8"}
self.assertEqual(
self.parser.normalize_imd(imd_decimal_1), self.parser.normalize_imd(imd_hex_1)
)

View File

@@ -5,15 +5,14 @@ Unit tests for Semantic Analysis
import os
import unittest
import time
from copy import deepcopy
import networkx as nx
from osaca.osaca import get_unmatched_instruction_ratio
from osaca.parser import AttrDict, ParserAArch64, ParserX86ATT
from osaca.semantics import (
INSTR_FLAGS, ArchSemantics, KernelDG, MachineModel, reduce_to_section, ISASemantics
)
from osaca.semantics import (INSTR_FLAGS, ArchSemantics, ISASemantics,
KernelDG, MachineModel, reduce_to_section)
class TestSemanticTools(unittest.TestCase):
@@ -30,17 +29,30 @@ class TestSemanticTools(unittest.TestCase):
cls.code_x86 = f.read()
with open(cls._find_file("kernel_x86_memdep.s")) as f:
cls.code_x86_memdep = f.read()
with open(cls._find_file("kernel_x86_long_LCD.s")) as f:
cls.code_x86_long_LCD = f.read()
with open(cls._find_file("kernel_aarch64_memdep.s")) as f:
cls.code_aarch64_memdep = f.read()
with open(cls._find_file("kernel_aarch64.s")) as f:
cls.code_AArch64 = f.read()
with open(cls._find_file("kernel_aarch64_sve.s")) as f:
cls.code_AArch64_SVE = f.read()
cls.kernel_x86 = reduce_to_section(cls.parser_x86.parse_file(cls.code_x86), "x86")
cls.kernel_x86_memdep = reduce_to_section(
cls.parser_x86.parse_file(cls.code_x86_memdep), "x86")
cls.parser_x86.parse_file(cls.code_x86_memdep), "x86"
)
cls.kernel_x86_long_LCD = reduce_to_section(
cls.parser_x86.parse_file(cls.code_x86_long_LCD), "x86"
)
cls.kernel_AArch64 = reduce_to_section(
cls.parser_AArch64.parse_file(cls.code_AArch64), "aarch64")
cls.parser_AArch64.parse_file(cls.code_AArch64), "aarch64"
)
cls.kernel_aarch64_memdep = reduce_to_section(
cls.parser_AArch64.parse_file(cls.code_aarch64_memdep), "aarch64")
cls.parser_AArch64.parse_file(cls.code_aarch64_memdep), "aarch64"
)
cls.kernel_aarch64_SVE = reduce_to_section(
cls.parser_AArch64.parse_file(cls.code_AArch64_SVE), "aarch64"
)
# set up machine models
cls.machine_model_csx = MachineModel(
@@ -49,6 +61,9 @@ class TestSemanticTools(unittest.TestCase):
cls.machine_model_tx2 = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "tx2.yml")
)
cls.machine_model_a64fx = MachineModel(
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "a64fx.yml")
)
cls.semantics_x86 = ISASemantics("x86")
cls.semantics_csx = ArchSemantics(
cls.machine_model_csx, path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/x86.yml")
@@ -58,6 +73,10 @@ class TestSemanticTools(unittest.TestCase):
cls.machine_model_tx2,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/aarch64.yml"),
)
cls.semantics_a64fx = ArchSemantics(
cls.machine_model_a64fx,
path_to_yaml=os.path.join(cls.MODULE_DATA_DIR, "isa/aarch64.yml"),
)
cls.machine_model_zen = MachineModel(arch="zen1")
for i in range(len(cls.kernel_x86)):
@@ -66,12 +85,18 @@ class TestSemanticTools(unittest.TestCase):
for i in range(len(cls.kernel_x86_memdep)):
cls.semantics_csx.assign_src_dst(cls.kernel_x86_memdep[i])
cls.semantics_csx.assign_tp_lt(cls.kernel_x86_memdep[i])
for i in range(len(cls.kernel_x86_long_LCD)):
cls.semantics_csx.assign_src_dst(cls.kernel_x86_long_LCD[i])
cls.semantics_csx.assign_tp_lt(cls.kernel_x86_long_LCD[i])
for i in range(len(cls.kernel_AArch64)):
cls.semantics_tx2.assign_src_dst(cls.kernel_AArch64[i])
cls.semantics_tx2.assign_tp_lt(cls.kernel_AArch64[i])
for i in range(len(cls.kernel_aarch64_memdep)):
cls.semantics_tx2.assign_src_dst(cls.kernel_aarch64_memdep[i])
cls.semantics_tx2.assign_tp_lt(cls.kernel_aarch64_memdep[i])
for i in range(len(cls.kernel_aarch64_SVE)):
cls.semantics_a64fx.assign_src_dst(cls.kernel_aarch64_SVE[i])
cls.semantics_a64fx.assign_tp_lt(cls.kernel_aarch64_SVE[i])
###########
# Tests
@@ -284,8 +309,9 @@ class TestSemanticTools(unittest.TestCase):
dg.export_graph(filepath="/dev/null")
def test_memdependency_x86(self):
dg = KernelDG(self.kernel_x86_memdep, self.parser_x86, self.machine_model_csx,
self.semantics_csx)
dg = KernelDG(
self.kernel_x86_memdep, self.parser_x86, self.machine_model_csx, self.semantics_csx
)
self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {6, 8})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=5)), {10, 12})
@@ -295,8 +321,9 @@ class TestSemanticTools(unittest.TestCase):
dg.export_graph(filepath="/dev/null")
def test_kernelDG_AArch64(self):
dg = KernelDG(self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2,
self.semantics_tx2)
dg = KernelDG(
self.kernel_AArch64, self.parser_AArch64, self.machine_model_tx2, self.semantics_tx2
)
self.assertTrue(nx.algorithms.dag.is_directed_acyclic_graph(dg.dg))
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=3)), {7, 8})
self.assertEqual(set(dg.get_dependent_instruction_forms(line_number=4)), {9, 10})
@@ -321,6 +348,15 @@ class TestSemanticTools(unittest.TestCase):
# test dot creation
dg.export_graph(filepath="/dev/null")
def test_kernelDG_SVE(self):
    """Smoke test: constructing a KernelDG for the AArch64 SVE kernel must not raise.

    No properties of the resulting graph are asserted yet (see TODO below).
    """
    sve_graph_args = (
        self.kernel_aarch64_SVE,
        self.parser_AArch64,
        self.machine_model_a64fx,
        self.semantics_a64fx,
    )
    KernelDG(*sve_graph_args)
    # TODO check for correct analysis
def test_hidden_load(self):
machine_model_hld = MachineModel(
path_to_yaml=self._find_file("hidden_load_machine_model.yml")
@@ -353,14 +389,20 @@ class TestSemanticTools(unittest.TestCase):
dg.get_loopcarried_dependencies()
def test_loop_carried_dependency_aarch64(self):
dg = KernelDG(self.kernel_aarch64_memdep, self.parser_AArch64, self.machine_model_tx2,
self.semantics_tx2)
dg = KernelDG(
self.kernel_aarch64_memdep,
self.parser_AArch64,
self.machine_model_tx2,
self.semantics_tx2,
)
lc_deps = dg.get_loopcarried_dependencies()
self.assertEqual(len(lc_deps), 2)
# based on line 6
self.assertEqual(lc_deps[6]["latency"], 28.0)
self.assertEqual([(iform.line_number, lat) for iform, lat in lc_deps[6]['dependencies']],
[(6, 4.0), (10, 6.0), (11, 6.0), (12, 6.0), (13, 6.0), (14, 0)])
self.assertEqual(
[(iform.line_number, lat) for iform, lat in lc_deps[6]['dependencies']],
[(6, 4.0), (10, 6.0), (11, 6.0), (12, 6.0), (13, 6.0), (14, 0)],
)
def test_loop_carried_dependency_x86(self):
lcd_id = 8
@@ -375,7 +417,7 @@ class TestSemanticTools(unittest.TestCase):
self.assertEqual(len(lc_deps[lcd_id]["dependencies"]), 1)
self.assertEqual(
lc_deps[lcd_id]["dependencies"][0][0],
dg.dg.nodes(data=True)[lcd_id]["instruction_form"]
dg.dg.nodes(data=True)[lcd_id]["instruction_form"],
)
# w/ flag dependencies: ID 9 w/ len=2
# w/o flag dependencies: ID 5 w/ len=1
@@ -389,6 +431,31 @@ class TestSemanticTools(unittest.TestCase):
dg.dg.nodes(data=True)[lcd_id2]["instruction_form"],
)
def test_timeout_during_loop_carried_dependency(self):
    """Check that KernelDG's ``timeout`` argument bounds the construction time.

    The long-LCD x86 kernel is turned into a KernelDG twice, once with
    ``timeout=10`` and once with ``timeout=2``, and the wall-clock duration of
    each construction is measured with ``time.perf_counter``. The assertions
    expect each run to last longer than its timeout value and the short run to
    be at least 7 units faster than the long one.

    NOTE(review): the thresholds imply the timeout is given in seconds, so this
    test presumably needs more than 12 s of wall-clock time -- confirm against
    KernelDG's documentation.
    """

    def _timed_build(timeout):
        # Elapsed perf_counter time for one KernelDG construction with the given timeout.
        start_time = time.perf_counter()
        KernelDG(
            self.kernel_x86_long_LCD,
            self.parser_x86,
            self.machine_model_csx,
            self.semantics_x86,
            timeout=timeout,
        )
        return time.perf_counter() - start_time

    # Keep the original order: long timeout first, short timeout second.
    time_10 = _timed_build(10)
    time_2 = _timed_build(2)

    # Dedicated comparison asserts report the measured values on failure,
    # unlike assertTrue(a > b), which only reports "False is not true".
    self.assertGreater(time_10, 10)
    self.assertGreater(time_2, 2)
    self.assertLess(time_2, time_10 - 7)
def test_is_read_is_written_x86(self):
# independent from HW model
dag = KernelDG(self.kernel_x86, self.parser_x86, None, None)

View File

@@ -232,7 +232,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -284,8 +284,10 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"execution_count": 27,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
@@ -296,8 +298,672 @@
"ZEN has 156 tests, compiled to 126 unique assembly representations.\n",
"ZEN2 has 156 tests, compiled to 126 unique assembly representations.\n",
"TX2 has 104 tests, compiled to 78 unique assembly representations.\n",
"A64FX has 104 tests, compiled to 81 unique assembly representations.\n"
"A64FX has 104 tests, compiled to 81 unique assembly representations.\n",
"High-level iterations in assembly block: 16\n",
"Measured: 1.1903856655856655\n",
"IACA Predicted: 1.96875 TP: 1.875 LCD: None CP: None\n",
"Ithemal Predicted: nan TP: None LCD: None CP: None\n",
"LLVM-MCA Predicted: 2.240625 TP: 1.948125 LCD: 2.240625 CP: 3.8125\n",
"OSACA Predicted: 1.875 TP: 1.875 LCD: 0.5 CP: 2.75\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space: pre !important;\">Open Source Architecture Code Analyzer (OSACA) - 0.3.14\n",
"Analyzed file: build/SKX/icc/O3/pi.marked.s\n",
"Architecture: SKX\n",
"Timestamp: 2021-04-15 12:15:40\n",
"\n",
"\n",
" P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction\n",
" * - Instruction micro-ops not bound to a port\n",
" X - No throughput/latency information for this instruction in data file\n",
"\n",
"\n",
"Combined Analysis Report\n",
"------------------------\n",
" Port pressure in cycles \n",
" | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |\n",
"-------------------------------------------------------------------------------------------------\n",
" 62 | | | | | | | | || | | # pointer_increment=128 fa3c665ee18e1e5f704c8a6026891c36\n",
" 63 | | | | | | | | || | | ..B1.4: # Preds ..B1.4 ..B1.3\n",
" 64 | | | | | | | | || | | # Execution count [5.00e+00]\n",
" 65 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addl $32, %ecx #16.5\n",
" 66 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm5, %ymm9, %ymm14 #17.9\n",
" 67 | 0.50 | | | | | 1.50 | | || | | vcvtdq2pd %ymm9, %zmm8 #17.14\n",
" 68 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm8, %zmm1, %zmm10 #17.18\n",
" 69 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm2, %zmm11 #17.25\n",
" 70 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm0, %zmm11, %zmm11 #18.38\n",
" 71 | | | | | | | | || | | * vmovaps %zmm0, %zmm29 #18.38\n",
" 72 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm11, %zmm13 #18.38\n",
" 73 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vfnmadd213pd .L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11 #18.38\n",
" 74 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm13, %k0 #18.38\n",
" 75 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm11, %zmm11, %zmm12 #18.38\n",
" 76 | 1.00 | | | | | | | || | | knotw %k0, %k1 #18.38\n",
" 77 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm13, %zmm11, %zmm13{%k1} #18.38\n",
" 78 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm13, %zmm12, %zmm13{%k1} #18.38\n",
" 79 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm4, %zmm13, %zmm6 #18.38\n",
" 80 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm5, %ymm14, %ymm20 #17.9\n",
" 81 | 0.50 | | | | | 1.50 | | || 7.0 | | vcvtdq2pd %ymm14, %zmm15 #17.14\n",
" 82 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm15, %zmm1, %zmm16 #17.18\n",
" 83 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm16, %zmm2, %zmm17 #17.25\n",
" 84 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm0, %zmm17, %zmm17 #18.38\n",
" 85 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm19 #18.38\n",
" 86 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17 #18.38\n",
" 87 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm19, %k2 #18.38\n",
" 88 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18 #18.38\n",
" 89 | 1.00 | | | | | | | || | | knotw %k2, %k3 #18.38\n",
" 90 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm19, %zmm17, %zmm19{%k3} #18.38\n",
" 91 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm19, %zmm18, %zmm19{%k3} #18.38\n",
" 92 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm3 #18.38\n",
" 93 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm5, %ymm20, %ymm26 #17.9\n",
" 94 | 0.50 | | | | | 1.50 | | || | | vcvtdq2pd %ymm20, %zmm21 #17.14\n",
" 95 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm21, %zmm1, %zmm22 #17.18\n",
" 96 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm2, %zmm23 #17.25\n",
" 97 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm0, %zmm23, %zmm23 #18.38\n",
" 98 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm23, %zmm25 #18.38\n",
" 99 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vfnmadd213pd .L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23 #18.38\n",
" 100 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm25, %k4 #18.38\n",
" 101 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm23, %zmm23, %zmm24 #18.38\n",
" 102 | 1.00 | | | | | | | || | | knotw %k4, %k5 #18.38\n",
" 103 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm25, %zmm23, %zmm25{%k5} #18.38\n",
" 104 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm25, %zmm24, %zmm25{%k5} #18.38\n",
" 105 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm4, %zmm25, %zmm6 #18.38\n",
" 106 | 0.50 | | | | | 1.50 | | || | | vcvtdq2pd %ymm26, %zmm27 #17.14\n",
" 107 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm5, %ymm26, %ymm9 #17.9\n",
" 108 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm27, %zmm1, %zmm28 #17.18\n",
" 109 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm28, %zmm2, %zmm8 #17.25\n",
" 110 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm8, %zmm8, %zmm29 #18.38\n",
" 111 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm29, %zmm31 #18.38\n",
" 112 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vfnmadd213pd .L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29 #18.38\n",
" 113 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm31, %k6 #18.38\n",
" 114 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm29, %zmm29, %zmm30 #18.38\n",
" 115 | 1.00 | | | | | | | || | | knotw %k6, %k7 #18.38\n",
" 116 | 0.00 | | | | | 1.00 | | || | | vfmadd213pd %zmm31, %zmm29, %zmm31{%k7} #18.38\n",
" 117 | 0.00 | | | | | 1.00 | | || | | vfmadd213pd %zmm31, %zmm30, %zmm31{%k7} #18.38\n",
" 118 | 0.00 | | | | | 1.00 | | || 0.0 | 4.0 | vfmadd231pd %zmm4, %zmm31, %zmm3 #18.38\n",
" 119 | 0.00 | 0.34 | | | | 0.00 | 0.66 | || | | cmpl %edx, %ecx #16.5\n",
" 120 | 0.00 | | | | | | 1.00 | || | | jb ..B1.4 # Prob 82% #16.5\n",
"\n",
" 30.0 4.34 2.00 2.00 2.00 2.00 30.0 2.66 44 8.0 \n",
"\n",
"\n",
"Loop-Carried Dependencies Analysis Report\n",
"-----------------------------------------\n",
" 92 | 8.0 | vfmadd231pd %zmm4, %zmm19, %zmm3 #18.38| [92, 118]\n",
" 79 | 8.0 | vfmadd231pd %zmm4, %zmm13, %zmm6 #18.38| [79, 105]\n",
" 66 | 4.0 | vpaddd %ymm5, %ymm9, %ymm14 #17.9| [66, 80, 93, 107]\n",
" 65 | 1.0 | addl $32, %ecx #16.5| [65]\n",
"</pre>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space: pre !important;\">Iterations: 100\n",
"Instructions: 5600\n",
"Total Cycles: 3585\n",
"Total uOps: 7200\n",
"\n",
"Dispatch Width: 6\n",
"uOps Per Cycle: 2.01\n",
"IPC: 1.56\n",
"Block RThroughput: 18.0\n",
"\n",
"\n",
"Instruction Info:\n",
"[1]: #uOps\n",
"[2]: Latency\n",
"[3]: RThroughput\n",
"[4]: MayLoad\n",
"[5]: MayStore\n",
"[6]: HasSideEffects (U)\n",
"\n",
"[1] [2] [3] [4] [5] [6] Instructions:\n",
" 1 1 0.25 addl\t$32, %ecx\n",
" 1 1 0.33 vpaddd\t%ymm5, %ymm9, %ymm14\n",
" 2 7 1.00 vcvtdq2pd\t%ymm9, %zmm8\n",
" 1 4 0.50 vaddpd\t%zmm8, %zmm1, %zmm10\n",
" 1 4 0.50 vmulpd\t%zmm10, %zmm2, %zmm11\n",
" 1 4 0.50 vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
" 1 1 0.33 vmovaps\t%zmm0, %zmm29\n",
" 3 4 2.00 vrcp14pd\t%zmm11, %zmm13\n",
" 2 11 0.50 * vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
" 1 4 1.00 vfpclasspd\t$30, %zmm13, %k0\n",
" 1 4 0.50 vmulpd\t%zmm11, %zmm11, %zmm12\n",
" 1 1 1.00 knotw\t%k0, %k1\n",
" 1 4 0.50 vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
" 1 4 0.50 vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
" 1 4 0.50 vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
" 1 1 0.33 vpaddd\t%ymm5, %ymm14, %ymm20\n",
" 2 7 1.00 vcvtdq2pd\t%ymm14, %zmm15\n",
" 1 4 0.50 vaddpd\t%zmm15, %zmm1, %zmm16\n",
" 1 4 0.50 vmulpd\t%zmm16, %zmm2, %zmm17\n",
" 1 4 0.50 vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
" 3 4 2.00 vrcp14pd\t%zmm17, %zmm19\n",
" 2 11 0.50 * vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
" 1 4 1.00 vfpclasspd\t$30, %zmm19, %k2\n",
" 1 4 0.50 vmulpd\t%zmm17, %zmm17, %zmm18\n",
" 1 1 1.00 knotw\t%k2, %k3\n",
" 1 4 0.50 vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
" 1 4 0.50 vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
" 1 4 0.50 vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
" 1 1 0.33 vpaddd\t%ymm5, %ymm20, %ymm26\n",
" 2 7 1.00 vcvtdq2pd\t%ymm20, %zmm21\n",
" 1 4 0.50 vaddpd\t%zmm21, %zmm1, %zmm22\n",
" 1 4 0.50 vmulpd\t%zmm22, %zmm2, %zmm23\n",
" 1 4 0.50 vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
" 3 4 2.00 vrcp14pd\t%zmm23, %zmm25\n",
" 2 11 0.50 * vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
" 1 4 1.00 vfpclasspd\t$30, %zmm25, %k4\n",
" 1 4 0.50 vmulpd\t%zmm23, %zmm23, %zmm24\n",
" 1 1 1.00 knotw\t%k4, %k5\n",
" 1 4 0.50 vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
" 1 4 0.50 vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
" 1 4 0.50 vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
" 2 7 1.00 vcvtdq2pd\t%ymm26, %zmm27\n",
" 1 1 0.33 vpaddd\t%ymm5, %ymm26, %ymm9\n",
" 1 4 0.50 vaddpd\t%zmm27, %zmm1, %zmm28\n",
" 1 4 0.50 vmulpd\t%zmm28, %zmm2, %zmm8\n",
" 1 4 0.50 vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
" 3 4 2.00 vrcp14pd\t%zmm29, %zmm31\n",
" 2 11 0.50 * vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
" 1 4 1.00 vfpclasspd\t$30, %zmm31, %k6\n",
" 1 4 0.50 vmulpd\t%zmm29, %zmm29, %zmm30\n",
" 1 1 1.00 knotw\t%k6, %k7\n",
" 1 4 0.50 vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
" 1 4 0.50 vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
" 1 4 0.50 vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
" 1 1 0.25 cmpl\t%edx, %ecx\n",
" 1 1 0.50 jb\t..B1.4\n",
"\n",
"\n",
"Resources:\n",
"[0] - SKXDivider\n",
"[1] - SKXFPDivider\n",
"[2] - SKXPort0\n",
"[3] - SKXPort1\n",
"[4] - SKXPort2\n",
"[5] - SKXPort3\n",
"[6] - SKXPort4\n",
"[7] - SKXPort5\n",
"[8] - SKXPort6\n",
"[9] - SKXPort7\n",
"\n",
"\n",
"Resource pressure per iteration:\n",
"[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] \n",
" - - 31.17 5.72 2.00 2.00 - 29.10 2.01 - \n",
"\n",
"Resource pressure by instruction:\n",
"[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:\n",
" - - - 0.80 - - - 0.19 0.01 - addl\t$32, %ecx\n",
" - - 0.07 0.92 - - - 0.01 - - vpaddd\t%ymm5, %ymm9, %ymm14\n",
" - - 1.00 - - - - 1.00 - - vcvtdq2pd\t%ymm9, %zmm8\n",
" - - 0.42 - - - - 0.58 - - vaddpd\t%zmm8, %zmm1, %zmm10\n",
" - - 0.51 - - - - 0.49 - - vmulpd\t%zmm10, %zmm2, %zmm11\n",
" - - 0.45 - - - - 0.55 - - vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
" - - - 1.00 - - - - - - vmovaps\t%zmm0, %zmm29\n",
" - - 2.00 - - - - 1.00 - - vrcp14pd\t%zmm11, %zmm13\n",
" - - 0.40 - - 1.00 - 0.60 - - vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
" - - - - - - - 1.00 - - vfpclasspd\t$30, %zmm13, %k0\n",
" - - 0.49 - - - - 0.51 - - vmulpd\t%zmm11, %zmm11, %zmm12\n",
" - - 1.00 - - - - - - - knotw\t%k0, %k1\n",
" - - 0.44 - - - - 0.56 - - vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
" - - 0.54 - - - - 0.46 - - vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
" - - 0.70 - - - - 0.30 - - vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
" - - - 1.00 - - - - - - vpaddd\t%ymm5, %ymm14, %ymm20\n",
" - - 1.00 - - - - 1.00 - - vcvtdq2pd\t%ymm14, %zmm15\n",
" - - 0.48 - - - - 0.52 - - vaddpd\t%zmm15, %zmm1, %zmm16\n",
" - - 0.42 - - - - 0.58 - - vmulpd\t%zmm16, %zmm2, %zmm17\n",
" - - 0.32 - - - - 0.68 - - vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
" - - 2.00 - - - - 1.00 - - vrcp14pd\t%zmm17, %zmm19\n",
" - - 0.32 - 1.00 - - 0.68 - - vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
" - - - - - - - 1.00 - - vfpclasspd\t$30, %zmm19, %k2\n",
" - - 0.47 - - - - 0.53 - - vmulpd\t%zmm17, %zmm17, %zmm18\n",
" - - 1.00 - - - - - - - knotw\t%k2, %k3\n",
" - - 0.53 - - - - 0.47 - - vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
" - - 0.54 - - - - 0.46 - - vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
" - - 0.57 - - - - 0.43 - - vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
" - - - 1.00 - - - - - - vpaddd\t%ymm5, %ymm20, %ymm26\n",
" - - 1.00 - - - - 1.00 - - vcvtdq2pd\t%ymm20, %zmm21\n",
" - - 0.52 - - - - 0.48 - - vaddpd\t%zmm21, %zmm1, %zmm22\n",
" - - 0.47 - - - - 0.53 - - vmulpd\t%zmm22, %zmm2, %zmm23\n",
" - - 0.48 - - - - 0.52 - - vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
" - - 2.00 - - - - 1.00 - - vrcp14pd\t%zmm23, %zmm25\n",
" - - 0.40 - - 1.00 - 0.60 - - vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
" - - - - - - - 1.00 - - vfpclasspd\t$30, %zmm25, %k4\n",
" - - 0.53 - - - - 0.47 - - vmulpd\t%zmm23, %zmm23, %zmm24\n",
" - - 1.00 - - - - - - - knotw\t%k4, %k5\n",
" - - 0.42 - - - - 0.58 - - vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
" - - 0.54 - - - - 0.46 - - vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
" - - 0.60 - - - - 0.40 - - vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
" - - 1.00 - - - - 1.00 - - vcvtdq2pd\t%ymm26, %zmm27\n",
" - - - 1.00 - - - - - - vpaddd\t%ymm5, %ymm26, %ymm9\n",
" - - 0.26 - - - - 0.74 - - vaddpd\t%zmm27, %zmm1, %zmm28\n",
" - - 0.47 - - - - 0.53 - - vmulpd\t%zmm28, %zmm2, %zmm8\n",
" - - 0.34 - - - - 0.66 - - vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
" - - 2.00 - - - - 1.00 - - vrcp14pd\t%zmm29, %zmm31\n",
" - - 0.34 - 1.00 - - 0.66 - - vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
" - - - - - - - 1.00 - - vfpclasspd\t$30, %zmm31, %k6\n",
" - - 0.52 - - - - 0.48 - - vmulpd\t%zmm29, %zmm29, %zmm30\n",
" - - 1.00 - - - - - - - knotw\t%k6, %k7\n",
" - - 0.47 - - - - 0.53 - - vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
" - - 0.48 - - - - 0.52 - - vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
" - - 0.66 - - - - 0.34 - - vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
" - - - - - - - - 1.00 - cmpl\t%edx, %ecx\n",
" - - - - - - - - 1.00 - jb\t..B1.4\n",
"\n",
"\n",
"Timeline view:\n",
" 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 \n",
"Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 012345678\n",
"\n",
"[0,0] DeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . addl\t$32, %ecx\n",
"[0,1] DeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm9, %ymm14\n",
"[0,2] D=eeeeeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm9, %zmm8\n",
"[0,3] D========eeeeER. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm8, %zmm1, %zmm10\n",
"[0,4] D============eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm10, %zmm2, %zmm11\n",
"[0,5] .D===============eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
"[0,6] .DeE------------------R . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vmovaps\t%zmm0, %zmm29\n",
"[0,7] .D===================eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm11, %zmm13\n",
"[0,8] . D======================eeeeeeeeeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
"[0,9] . D======================eeeeE-------R . . . . . . . . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm13, %k0\n",
"[0,10] . D=================================eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm11, %zmm11, %zmm12\n",
"[0,11] . D==========================eE----------R . . . . . . . . . . . . . . . . . . . . . . . . . . knotw\t%k0, %k1\n",
"[0,12] . D=================================eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
"[0,13] . D====================================eeeeER . . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
"[0,14] . D========================================eeeeER. . . . . . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
"[0,15] . DeE-------------------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm14, %ymm20\n",
"[0,16] . DeeeeeeeE-------------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm14, %zmm15\n",
"[0,17] . D=======eeeeE---------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm15, %zmm1, %zmm16\n",
"[0,18] . D==========eeeeE-----------------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm16, %zmm2, %zmm17\n",
"[0,19] . D==============eeeeE-------------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
"[0,20] . D==================eeeeE---------------------R. . . . . . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm17, %zmm19\n",
"[0,21] . D=====================eeeeeeeeeeeE----------R. . . . . . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
"[0,22] . D======================eeeeE----------------R. . . . . . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm19, %k2\n",
"[0,23] . D================================eeeeE------R. . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm17, %zmm17, %zmm18\n",
"[0,24] . D==========================eE---------------R. . . . . . . . . . . . . . . . . . . . . . . . . knotw\t%k2, %k3\n",
"[0,25] . D================================eeeeE------R. . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
"[0,26] . .D===================================eeeeE--R. . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
"[0,27] . .D=======================================eeeeER . . . . . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
"[0,28] . .DeE------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm20, %ymm26\n",
"[0,29] . .DeeeeeeeE------------------------------------R . . . . . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm20, %zmm21\n",
"[0,30] . .D=======eeeeE--------------------------------R . . . . . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm21, %zmm1, %zmm22\n",
"[0,31] . . D==========eeeeE----------------------------R . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm22, %zmm2, %zmm23\n",
"[0,32] . . D==============eeeeE------------------------R . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
"[0,33] . . D==================eeeeE--------------------R . . . . . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm23, %zmm25\n",
"[0,34] . . D=====================eeeeeeeeeeeE---------R . . . . . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
"[0,35] . . D=====================eeeeE----------------R . . . . . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm25, %k4\n",
"[0,36] . . D================================eeeeE-----R . . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm23, %zmm23, %zmm24\n",
"[0,37] . . D==========================eE--------------R . . . . . . . . . . . . . . . . . . . . . . . . knotw\t%k4, %k5\n",
"[0,38] . . D================================eeeeE-----R . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
"[0,39] . . D===================================eeeeE-R . . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
"[0,40] . . D=======================================eeeeER. . . . . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
"[0,41] . . DeeeeeeeE------------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm26, %zmm27\n",
"[0,42] . . DeE------------------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm26, %ymm9\n",
"[0,43] . . D=======eeeeE--------------------------------R. . . . . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm27, %zmm1, %zmm28\n",
"[0,44] . . D=============eeeeE-------------------------R. . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm28, %zmm2, %zmm8\n",
"[0,45] . . D=================eeeeE---------------------R. . . . . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
"[0,46] . . D======================eeeeE----------------R. . . . . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm29, %zmm31\n",
"[0,47] . . .D=========================eeeeeeeeeeeE-----R. . . . . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
"[0,48] . . .D=========================eeeeE------------R. . . . . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm31, %k6\n",
"[0,49] . . .D====================================eeeeE-R. . . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm29, %zmm29, %zmm30\n",
"[0,50] . . .D==============================eE----------R. . . . . . . . . . . . . . . . . . . . . . . . knotw\t%k6, %k7\n",
"[0,51] . . .D====================================eeeeE-R. . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
"[0,52] . . . D=======================================eeeeER . . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
"[0,53] . . . D===========================================eeeeER . . . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
"[0,54] . . . DeE----------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . cmpl\t%edx, %ecx\n",
"[0,55] . . . D=eE---------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . jb\t..B1.4\n",
"[1,0] . . . DeE----------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . addl\t$32, %ecx\n",
"[1,1] . . . DeE----------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm9, %ymm14\n",
"[1,2] . . . D==eeeeeeeE-------------------------------------R . . . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm9, %zmm8\n",
"[1,3] . . . D===============eeeeE---------------------------R . . . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm8, %zmm1, %zmm10\n",
"[1,4] . . . D====================eeeeE----------------------R . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm10, %zmm2, %zmm11\n",
"[1,5] . . . D=========================eeeeE-----------------R . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
"[1,6] . . . DeE---------------------------------------------R . . . . . . . . . . . . . . . . . . . . . . vmovaps\t%zmm0, %zmm29\n",
"[1,7] . . . D============================eeeeE-------------R . . . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm11, %zmm13\n",
"[1,8] . . . D================================eeeeeeeeeeeE--R . . . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
"[1,9] . . . D================================eeeeE---------R . . . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm13, %k0\n",
"[1,10] . . . D==========================================eeeeER . . . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm11, %zmm11, %zmm12\n",
"[1,11] . . . D====================================eE---------R . . . . . . . . . . . . . . . . . . . . . . knotw\t%k0, %k1\n",
"[1,12] . . . D==========================================eeeeER . . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
"[1,13] . . . D==============================================eeeeER . . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
"[1,14] . . . D==================================================eeeeER . . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
"[1,15] . . . DeE-----------------------------------------------------R . . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm14, %ymm20\n",
"[1,16] . . . .D===eeeeeeeE-------------------------------------------R . . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm14, %zmm15\n",
"[1,17] . . . .D==============eeeeE-----------------------------------R . . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm15, %zmm1, %zmm16\n",
"[1,18] . . . .D==================eeeeE-------------------------------R . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm16, %zmm2, %zmm17\n",
"[1,19] . . . .D======================eeeeE---------------------------R . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
"[1,20] . . . . D================================eeeeE----------------R . . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm17, %zmm19\n",
"[1,21] . . . . D====================================eeeeeeeeeeeE-----R . . . . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
"[1,22] . . . . D=====================================eeeeE-----------R . . . . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm19, %k2\n",
"[1,23] . . . . D==============================================eeeeE-R . . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm17, %zmm17, %zmm18\n",
"[1,24] . . . . D========================================eE----------R . . . . . . . . . . . . . . . . . . . . knotw\t%k2, %k3\n",
"[1,25] . . . . D==============================================eeeeE-R . . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
"[1,26] . . . . D==================================================eeeeER. . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
"[1,27] . . . . D======================================================eeeeER . . . . . . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
"[1,28] . . . . DeE---------------------------------------------------------R . . . . . . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm20, %ymm26\n",
"[1,29] . . . . D=================================eeeeeeeE-----------------R . . . . . . . . . . . . . . . . . . . vcvtdq2pd\t%ymm20, %zmm21\n",
"[1,30] . . . . D========================================eeeeE-------------R . . . . . . . . . . . . . . . . . . . vaddpd\t%zmm21, %zmm1, %zmm22\n",
"[1,31] . . . . D===========================================eeeeE---------R . . . . . . . . . . . . . . . . . . . vmulpd\t%zmm22, %zmm2, %zmm23\n",
"[1,32] . . . . .D==============================================eeeeE-----R . . . . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
"[1,33] . . . . . D=================================================eeeeE-R . . . . . . . . . . . . . . . . . . . vrcp14pd\t%zmm23, %zmm25\n",
"[1,34] . . . . . D====================================================eeeeeeeeeeeER . . . . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
"[1,35] . . . . . D===================================================eeeeE-------R . . . . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm25, %k4\n",
"[1,36] . . . . . D=============================================================eeeeER . . . . . . . . . . . . . . . . vmulpd\t%zmm23, %zmm23, %zmm24\n",
"[1,37] . . . . . D======================================================eE----------R . . . . . . . . . . . . . . . . knotw\t%k4, %k5\n",
"[1,38] . . . . . .D============================================================eeeeER . . . . . . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
"[1,39] . . . . . . D===============================================================eeeeER . . . . . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
"[1,40] . . . . . . D==================================================================eeeeER . . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
"[1,41] . . . . . . D============================eeeeeeeE-----------------------------------R . . . . . . . . . . . . . . vcvtdq2pd\t%ymm26, %zmm27\n",
"[1,42] . . . . . . DeE--------------------------------------------------------------------R . . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm26, %ymm9\n",
"[1,43] . . . . . . D==================================eeeeE-------------------------------R . . . . . . . . . . . . . . vaddpd\t%zmm27, %zmm1, %zmm28\n",
"[1,44] . . . . . . D=====================================eeeeE---------------------------R . . . . . . . . . . . . . . vmulpd\t%zmm28, %zmm2, %zmm8\n",
"[1,45] . . . . . . D===========================================eeeeE---------------------R . . . . . . . . . . . . . . vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
"[1,46] . . . . . . D===============================================eeeeE-----------------R . . . . . . . . . . . . . . vrcp14pd\t%zmm29, %zmm31\n",
"[1,47] . . . . . . .D==================================================eeeeeeeeeeeE------R . . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
"[1,48] . . . . . . . D=================================================eeeeE-------------R . . . . . . . . . . . . . . vfpclasspd\t$30, %zmm31, %k6\n",
"[1,49] . . . . . . . D===========================================================eeeeE--R . . . . . . . . . . . . . . vmulpd\t%zmm29, %zmm29, %zmm30\n",
"[1,50] . . . . . . . D=====================================================eE----------R . . . . . . . . . . . . . . knotw\t%k6, %k7\n",
"[1,51] . . . . . . . D==========================================================eeeeE-R . . . . . . . . . . . . . . vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
"[1,52] . . . . . . . D==============================================================eeeeER . . . . . . . . . . . . . . vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
"[1,53] . . . . . . . .D=================================================================eeeeER . . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
"[1,54] . . . . . . . .DeE--------------------------------------------------------------------R . . . . . . . . . . . . . cmpl\t%edx, %ecx\n",
"[1,55] . . . . . . . . DeE-------------------------------------------------------------------R . . . . . . . . . . . . . jb\t..B1.4\n",
"[2,0] . . . . . . . . DeE-------------------------------------------------------------------R . . . . . . . . . . . . . addl\t$32, %ecx\n",
"[2,1] . . . . . . . . D=eE------------------------------------------------------------------R . . . . . . . . . . . . . vpaddd\t%ymm5, %ymm9, %ymm14\n",
"[2,2] . . . . . . . . D======================eeeeeeeE--------------------------------------R . . . . . . . . . . . . . vcvtdq2pd\t%ymm9, %zmm8\n",
"[2,3] . . . . . . . . D==============================eeeeE---------------------------------R . . . . . . . . . . . . . vaddpd\t%zmm8, %zmm1, %zmm10\n",
"[2,4] . . . . . . . . D===================================eeeeE----------------------------R . . . . . . . . . . . . . vmulpd\t%zmm10, %zmm2, %zmm11\n",
"[2,5] . . . . . . . . D========================================eeeeE-----------------------R . . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
"[2,6] . . . . . . . . DeE-----------------------------------------------------------------R . . . . . . . . . . . . . vmovaps\t%zmm0, %zmm29\n",
"[2,7] . . . . . . . . D===========================================eeeeE-------------------R . . . . . . . . . . . . . vrcp14pd\t%zmm11, %zmm13\n",
"[2,8] . . . . . . . . D================================================eeeeeeeeeeeE-------R . . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
"[2,9] . . . . . . . . D================================================eeeeE-------------R . . . . . . . . . . . . . vfpclasspd\t$30, %zmm13, %k0\n",
"[2,10] . . . . . . . . D==========================================================eeeeE---R . . . . . . . . . . . . . vmulpd\t%zmm11, %zmm11, %zmm12\n",
"[2,11] . . . . . . . . .D======================================================eE---------R . . . . . . . . . . . . . knotw\t%k0, %k1\n",
"[2,12] . . . . . . . . .D=========================================================eeeeE---R . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
"[2,13] . . . . . . . . . D============================================================eeeeER . . . . . . . . . . . . . vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
"[2,14] . . . . . . . . . D================================================================eeeeER . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
"[2,15] . . . . . . . . . DeE------------------------------------------------------------------R . . . . . . . . . . . . vpaddd\t%ymm5, %ymm14, %ymm20\n",
"[2,16] . . . . . . . . . D==================eeeeeeeE-----------------------------------------R . . . . . . . . . . . . vcvtdq2pd\t%ymm14, %zmm15\n",
"[2,17] . . . . . . . . . D=========================eeeeE-------------------------------------R . . . . . . . . . . . . vaddpd\t%zmm15, %zmm1, %zmm16\n",
"[2,18] . . . . . . . . . D=============================eeeeE--------------------------------R . . . . . . . . . . . . vmulpd\t%zmm16, %zmm2, %zmm17\n",
"[2,19] . . . . . . . . . .D=================================eeeeE---------------------------R . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
"[2,20] . . . . . . . . . . D=====================================eeeeE----------------------R . . . . . . . . . . . . vrcp14pd\t%zmm17, %zmm19\n",
"[2,21] . . . . . . . . . . D=========================================eeeeeeeeeeeE-----------R . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
"[2,22] . . . . . . . . . . D=========================================eeeeE-----------------R . . . . . . . . . . . . vfpclasspd\t$30, %zmm19, %k2\n",
"[2,23] . . . . . . . . . . D===================================================eeeeE-------R . . . . . . . . . . . . vmulpd\t%zmm17, %zmm17, %zmm18\n",
"[2,24] . . . . . . . . . . D===============================================eE-------------R . . . . . . . . . . . . knotw\t%k2, %k3\n",
"[2,25] . . . . . . . . . . D=================================================eeeeE-------R . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
"[2,26] . . . . . . . . . . . D===================================================eeeeE---R . . . . . . . . . . . . vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
"[2,27] . . . . . . . . . . . D=======================================================eeeeER . . . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
"[2,28] . . . . . . . . . . . DeE---------------------------------------------------------R . . . . . . . . . . . . vpaddd\t%ymm5, %ymm20, %ymm26\n",
"[2,29] . . . . . . . . . . . D============eeeeeeeE--------------------------------------R . . . . . . . . . . . . vcvtdq2pd\t%ymm20, %zmm21\n",
"[2,30] . . . . . . . . . . . D====================eeeeE---------------------------------R . . . . . . . . . . . . vaddpd\t%zmm21, %zmm1, %zmm22\n",
"[2,31] . . . . . . . . . . . D=========================eeeeE---------------------------R . . . . . . . . . . . . vmulpd\t%zmm22, %zmm2, %zmm23\n",
"[2,32] . . . . . . . . . . . .D=============================eeeeE----------------------R . . . . . . . . . . . . vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
"[2,33] . . . . . . . . . . . . D==================================eeeeE----------------R . . . . . . . . . . . . vrcp14pd\t%zmm23, %zmm25\n",
"[2,34] . . . . . . . . . . . . D=====================================eeeeeeeeeeeE-----R . . . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
"[2,35] . . . . . . . . . . . . D======================================eeeeE-----------R . . . . . . . . . . . . vfpclasspd\t$30, %zmm25, %k4\n",
"[2,36] . . . . . . . . . . . . D===============================================eeeeE-R . . . . . . . . . . . . vmulpd\t%zmm23, %zmm23, %zmm24\n",
"[2,37] . . . . . . . . . . . . D========================================eE----------R . . . . . . . . . . . . knotw\t%k4, %k5\n",
"[2,38] . . . . . . . . . . . . .D==============================================eeeeER . . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
"[2,39] . . . . . . . . . . . . . D=================================================eeeeER . . . . . . . . . . . vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
"[2,40] . . . . . . . . . . . . . D====================================================eeeeER . . . . . . . . . . vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
"[2,41] . . . . . . . . . . . . . D======eeeeeeeE------------------------------------------R . . . . . . . . . . vcvtdq2pd\t%ymm26, %zmm27\n",
"[2,42] . . . . . . . . . . . . . DeE------------------------------------------------------R . . . . . . . . . . vpaddd\t%ymm5, %ymm26, %ymm9\n",
"[2,43] . . . . . . . . . . . . . D===============eeeeE-----------------------------------R . . . . . . . . . . vaddpd\t%zmm27, %zmm1, %zmm28\n",
"[2,44] . . . . . . . . . . . . . D========================eeeeE--------------------------R . . . . . . . . . . vmulpd\t%zmm28, %zmm2, %zmm8\n",
"[2,45] . . . . . . . . . . . . . D============================eeeeE----------------------R . . . . . . . . . . vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
"[2,46] . . . . . . . . . . . . . .D======================================eeeeE-----------R . . . . . . . . . . vrcp14pd\t%zmm29, %zmm31\n",
"[2,47] . . . . . . . . . . . . . . D=========================================eeeeeeeeeeeER . . . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
"[2,48] . . . . . . . . . . . . . . D=========================================eeeeE------R . . . . . . . . . . vfpclasspd\t$30, %zmm31, %k6\n",
"[2,49] . . . . . . . . . . . . . . D===================================================eeeeER . . . . . . . . . vmulpd\t%zmm29, %zmm29, %zmm30\n",
"[2,50] . . . . . . . . . . . . . . D============================================eE---------R . . . . . . . . . knotw\t%k6, %k7\n",
"[2,51] . . . . . . . . . . . . . . D==================================================eeeeER . . . . . . . . . vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
"[2,52] . . . . . . . . . . . . . . D=====================================================eeeeER. . . . . . . . . vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
"[2,53] . . . . . . . . . . . . . . .D========================================================eeeeER . . . . . . . . vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
"[2,54] . . . . . . . . . . . . . . . DeE----------------------------------------------------------R . . . . . . . . cmpl\t%edx, %ecx\n",
"[2,55] . . . . . . . . . . . . . . . DeE---------------------------------------------------------R . . . . . . . . jb\t..B1.4\n",
"[3,0] . . . . . . . . . . . . . . . DeE---------------------------------------------------------R . . . . . . . . addl\t$32, %ecx\n",
"[3,1] . . . . . . . . . . . . . . . DeE--------------------------------------------------------R . . . . . . . . vpaddd\t%ymm5, %ymm9, %ymm14\n",
"[3,2] . . . . . . . . . . . . . . . D==eeeeeeeE------------------------------------------------R . . . . . . . . vcvtdq2pd\t%ymm9, %zmm8\n",
"[3,3] . . . . . . . . . . . . . . . D=========eeeeE--------------------------------------------R . . . . . . . . vaddpd\t%zmm8, %zmm1, %zmm10\n",
"[3,4] . . . . . . . . . . . . . . . D================eeeeE-------------------------------------R . . . . . . . . vmulpd\t%zmm10, %zmm2, %zmm11\n",
"[3,5] . . . . . . . . . . . . . . . D===================eeeeE---------------------------------R . . . . . . . . vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
"[3,6] . . . . . . . . . . . . . . . DeE-------------------------------------------------------R . . . . . . . . vmovaps\t%zmm0, %zmm29\n",
"[3,7] . . . . . . . . . . . . . . . D===================================eeeeE-----------------R . . . . . . . . vrcp14pd\t%zmm11, %zmm13\n",
"[3,8] . . . . . . . . . . . . . . . .D======================================eeeeeeeeeeeE------R . . . . . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
"[3,9] . . . . . . . . . . . . . . . .D=======================================eeeeE------------R . . . . . . . . vfpclasspd\t$30, %zmm13, %k0\n",
"[3,10] . . . . . . . . . . . . . . . .D=================================================eeeeE--R . . . . . . . . vmulpd\t%zmm11, %zmm11, %zmm12\n",
"[3,11] . . . . . . . . . . . . . . . . D===========================================eE----------R . . . . . . . . knotw\t%k0, %k1\n",
"[3,12] . . . . . . . . . . . . . . . . D===============================================eeeeE--R . . . . . . . . vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
"[3,13] . . . . . . . . . . . . . . . . D==================================================eeeeER . . . . . . . vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
"[3,14] . . . . . . . . . . . . . . . . D=====================================================eeeeER. . . . . . . vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
"[3,15] . . . . . . . . . . . . . . . . DeE--------------------------------------------------------R. . . . . . . vpaddd\t%ymm5, %ymm14, %ymm20\n",
"[3,16] . . . . . . . . . . . . . . . . .D===============================eeeeeeeE------------------R. . . . . . . vcvtdq2pd\t%ymm14, %zmm15\n",
"[3,17] . . . . . . . . . . . . . . . . .D=======================================eeeeE-------------R. . . . . . . vaddpd\t%zmm15, %zmm1, %zmm16\n",
"[3,18] . . . . . . . . . . . . . . . . .D===========================================eeeeE---------R. . . . . . . vmulpd\t%zmm16, %zmm2, %zmm17\n",
"[3,19] . . . . . . . . . . . . . . . . . D==============================================eeeeE-----R. . . . . . . vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
"[3,20] . . . . . . . . . . . . . . . . . D==================================================eeeeE-R. . . . . . . vrcp14pd\t%zmm17, %zmm19\n",
"[3,21] . . . . . . . . . . . . . . . . . D=====================================================eeeeeeeeeeeER. . . . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
"[3,22] . . . . . . . . . . . . . . . . . D=====================================================eeeeE------R. . . . . vfpclasspd\t$30, %zmm19, %k2\n",
"[3,23] . . . . . . . . . . . . . . . . . D==============================================================eeeeER . . . . vmulpd\t%zmm17, %zmm17, %zmm18\n",
"[3,24] . . . . . . . . . . . . . . . . . .D=======================================================eE---------R . . . . knotw\t%k2, %k3\n",
"[3,25] . . . . . . . . . . . . . . . . . . D============================================================eeeeER . . . . vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
"[3,26] . . . . . . . . . . . . . . . . . . D================================================================eeeeER . . . vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
"[3,27] . . . . . . . . . . . . . . . . . . D===================================================================eeeeER . . vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
"[3,28] . . . . . . . . . . . . . . . . . . DeE----------------------------------------------------------------------R . . vpaddd\t%ymm5, %ymm20, %ymm26\n",
"[3,29] . . . . . . . . . . . . . . . . . . D===========================eeeeeeeE------------------------------------R . . vcvtdq2pd\t%ymm20, %zmm21\n",
"[3,30] . . . . . . . . . . . . . . . . . . D==================================eeeeE--------------------------------R . . vaddpd\t%zmm21, %zmm1, %zmm22\n",
"[3,31] . . . . . . . . . . . . . . . . . . D======================================eeeeE----------------------------R . . vmulpd\t%zmm22, %zmm2, %zmm23\n",
"[3,32] . . . . . . . . . . . . . . . . . . D=========================================eeeeE------------------------R . . vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
"[3,33] . . . . . . . . . . . . . . . . . . D=============================================eeeeE--------------------R . . vrcp14pd\t%zmm23, %zmm25\n",
"[3,34] . . . . . . . . . . . . . . . . . . .D================================================eeeeeeeeeeeE---------R . . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
"[3,35] . . . . . . . . . . . . . . . . . . .D=================================================eeeeE---------------R . . vfpclasspd\t$30, %zmm25, %k4\n",
"[3,36] . . . . . . . . . . . . . . . . . . . D==========================================================eeeeE-----R . . vmulpd\t%zmm23, %zmm23, %zmm24\n",
"[3,37] . . . . . . . . . . . . . . . . . . . D====================================================eE-------------R . . knotw\t%k4, %k5\n",
"[3,38] . . . . . . . . . . . . . . . . . . . D========================================================eeeeE-----R . . vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
"[3,39] . . . . . . . . . . . . . . . . . . . D============================================================eeeeE-R . . vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
"[3,40] . . . . . . . . . . . . . . . . . . . D===============================================================eeeeER. . vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
"[3,41] . . . . . . . . . . . . . . . . . . . D======================eeeeeeeE--------------------------------------R. . vcvtdq2pd\t%ymm26, %zmm27\n",
"[3,42] . . . . . . . . . . . . . . . . . . . .DeE-----------------------------------------------------------------R. . vpaddd\t%ymm5, %ymm26, %ymm9\n",
"[3,43] . . . . . . . . . . . . . . . . . . . .D============================eeeeE----------------------------------R. . vaddpd\t%zmm27, %zmm1, %zmm28\n",
"[3,44] . . . . . . . . . . . . . . . . . . . . D===============================eeeeE------------------------------R. . vmulpd\t%zmm28, %zmm2, %zmm8\n",
"[3,45] . . . . . . . . . . . . . . . . . . . . D=====================================eeeeE------------------------R. . vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
"[3,46] . . . . . . . . . . . . . . . . . . . . D=========================================eeeeE--------------------R. . vrcp14pd\t%zmm29, %zmm31\n",
"[3,47] . . . . . . . . . . . . . . . . . . . . D============================================eeeeeeeeeeeE---------R. . vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
"[3,48] . . . . . . . . . . . . . . . . . . . . D===========================================eeeeE----------------R. . vfpclasspd\t$30, %zmm31, %k6\n",
"[3,49] . . . . . . . . . . . . . . . . . . . . D======================================================eeeeE-----R. . vmulpd\t%zmm29, %zmm29, %zmm30\n",
"[3,50] . . . . . . . . . . . . . . . . . . . . D==============================================eE---------------R. . knotw\t%k6, %k7\n",
"[3,51] . . . . . . . . . . . . . . . . . . . . D======================================================eeeeE----R. . vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
"[3,52] . . . . . . . . . . . . . . . . . . . . .D=========================================================eeeeER. . vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
"[3,53] . . . . . . . . . . . . . . . . . . . . . D============================================================eeeeER vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
"[3,54] . . . . . . . . . . . . . . . . . . . . . DeE--------------------------------------------------------------R cmpl\t%edx, %ecx\n",
"[3,55] . . . . . . . . . . . . . . . . . . . . . DeE-------------------------------------------------------------R jb\t..B1.4\n",
"\n",
"\n",
"Average Wait times (based on the timeline view):\n",
"[0]: Executions\n",
"[1]: Average time spent waiting in a scheduler's queue\n",
"[2]: Average time spent waiting in a scheduler's queue while ready\n",
"[3]: Average time elapsed from WB until retire stage\n",
"\n",
" [0] [1] [2] [3]\n",
"0. 4 1.0 1.0 42.5 addl\t$32, %ecx\n",
"1. 4 1.3 1.3 42.0 vpaddd\t%ymm5, %ymm9, %ymm14\n",
"2. 4 7.8 7.8 30.8 vcvtdq2pd\t%ymm9, %zmm8\n",
"3. 4 16.5 1.8 26.0 vaddpd\t%zmm8, %zmm1, %zmm10\n",
"4. 4 21.8 1.3 21.8 vmulpd\t%zmm10, %zmm2, %zmm11\n",
"5. 4 25.8 0.5 18.3 vfmadd213pd\t%zmm0, %zmm11, %zmm11\n",
"6. 4 1.0 1.0 45.8 vmovaps\t%zmm0, %zmm29\n",
"7. 4 32.3 3.0 12.3 vrcp14pd\t%zmm11, %zmm13\n",
"8. 4 36.0 0.3 3.8 vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm13, %zmm11\n",
"9. 4 36.3 0.8 10.3 vfpclasspd\t$30, %zmm13, %k0\n",
"10. 4 46.5 0.0 1.3 vmulpd\t%zmm11, %zmm11, %zmm12\n",
"11. 4 40.8 1.3 9.5 knotw\t%k0, %k1\n",
"12. 4 45.8 0.0 1.3 vfmadd213pd\t%zmm13, %zmm11, %zmm13 {%k1}\n",
"13. 4 49.0 0.0 0.0 vfmadd213pd\t%zmm13, %zmm12, %zmm13 {%k1}\n",
"14. 4 52.8 0.0 0.0 vfmadd231pd\t%zmm4, %zmm13, %zmm6\n",
"15. 4 1.0 1.0 54.5 vpaddd\t%ymm5, %ymm14, %ymm20\n",
"16. 4 14.0 14.0 34.8 vcvtdq2pd\t%ymm14, %zmm15\n",
"17. 4 22.3 1.3 29.5 vaddpd\t%zmm15, %zmm1, %zmm16\n",
"18. 4 26.0 0.3 25.3 vmulpd\t%zmm16, %zmm2, %zmm17\n",
"19. 4 29.8 0.3 21.0 vfmadd213pd\t%zmm0, %zmm17, %zmm17\n",
"20. 4 35.3 2.0 15.0 vrcp14pd\t%zmm17, %zmm19\n",
"21. 4 38.8 0.0 6.5 vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm19, %zmm17\n",
"22. 4 39.3 1.0 12.5 vfpclasspd\t$30, %zmm19, %k2\n",
"23. 4 48.8 0.0 3.5 vmulpd\t%zmm17, %zmm17, %zmm18\n",
"24. 4 43.0 0.8 11.8 knotw\t%k2, %k3\n",
"25. 4 47.8 0.0 3.5 vfmadd213pd\t%zmm19, %zmm17, %zmm19 {%k3}\n",
"26. 4 51.0 0.0 1.3 vfmadd213pd\t%zmm19, %zmm18, %zmm19 {%k3}\n",
"27. 4 54.8 0.0 0.0 vfmadd231pd\t%zmm4, %zmm19, %zmm3\n",
"28. 4 1.0 1.0 56.5 vpaddd\t%ymm5, %ymm20, %ymm26\n",
"29. 4 19.0 19.0 31.8 vcvtdq2pd\t%ymm20, %zmm21\n",
"30. 4 26.3 0.3 27.5 vaddpd\t%zmm21, %zmm1, %zmm22\n",
"31. 4 30.0 0.5 23.0 vmulpd\t%zmm22, %zmm2, %zmm23\n",
"32. 4 33.5 0.3 18.8 vfmadd213pd\t%zmm0, %zmm23, %zmm23\n",
"33. 4 37.5 0.5 14.3 vrcp14pd\t%zmm23, %zmm25\n",
"34. 4 40.5 0.0 5.8 vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm25, %zmm23\n",
"35. 4 40.8 0.5 12.3 vfpclasspd\t$30, %zmm25, %k4\n",
"36. 4 50.5 0.0 2.8 vmulpd\t%zmm23, %zmm23, %zmm24\n",
"37. 4 44.0 0.5 11.8 knotw\t%k4, %k5\n",
"38. 4 49.5 0.3 2.5 vfmadd213pd\t%zmm25, %zmm23, %zmm25 {%k5}\n",
"39. 4 52.8 0.0 0.5 vfmadd213pd\t%zmm25, %zmm24, %zmm25 {%k5}\n",
"40. 4 56.0 0.0 0.0 vfmadd231pd\t%zmm4, %zmm25, %zmm6\n",
"41. 4 15.0 15.0 37.8 vcvtdq2pd\t%ymm26, %zmm27\n",
"42. 4 1.0 1.0 57.3 vpaddd\t%ymm5, %ymm26, %ymm9\n",
"43. 4 22.0 0.8 33.0 vaddpd\t%zmm27, %zmm1, %zmm28\n",
"44. 4 27.3 2.0 27.0 vmulpd\t%zmm28, %zmm2, %zmm8\n",
"45. 4 32.3 1.0 22.0 vfmadd231pd\t%zmm8, %zmm8, %zmm29\n",
"46. 4 38.0 2.0 16.0 vrcp14pd\t%zmm29, %zmm31\n",
"47. 4 41.0 0.0 5.0 vfnmadd213pd\t.L_2il0floatpacket.6(%rip){1to8}, %zmm31, %zmm29\n",
"48. 4 40.5 0.3 11.8 vfpclasspd\t$30, %zmm31, %k6\n",
"49. 4 51.0 0.0 2.0 vmulpd\t%zmm29, %zmm29, %zmm30\n",
"50. 4 44.3 0.8 11.0 knotw\t%k6, %k7\n",
"51. 4 50.5 0.5 1.5 vfmadd213pd\t%zmm31, %zmm29, %zmm31 {%k7}\n",
"52. 4 53.8 0.0 0.0 vfmadd213pd\t%zmm31, %zmm30, %zmm31 {%k7}\n",
"53. 4 57.0 0.0 0.0 vfmadd231pd\t%zmm4, %zmm31, %zmm3\n",
"54. 4 1.0 1.0 58.5 cmpl\t%edx, %ecx\n",
"55. 4 1.3 0.0 57.5 jb\t..B1.4\n",
" 4 32.5 1.6 18.4 <total>\n",
"</pre>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space: pre !important;\">Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-30;16:57:45\n",
"Analyzed File - build/SKX/icc/O3/pi.marked.o\n",
"Binary Format - 64Bit\n",
"Architecture - SKX\n",
"Analysis Type - Throughput\n",
"\n",
"Throughput Analysis Report\n",
"--------------------------\n",
"Block Throughput: 31.50 Cycles Throughput Bottleneck: Backend\n",
"Loop Count: 103\n",
"Port Binding In Cycles Per Iteration:\n",
"--------------------------------------------------------------------------------------------------\n",
"| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |\n",
"--------------------------------------------------------------------------------------------------\n",
"| Cycles | 30.0 0.0 | 4.0 | 2.0 2.0 | 2.0 2.0 | 0.0 | 30.0 | 1.0 | 0.0 |\n",
"--------------------------------------------------------------------------------------------------\n",
"\n",
"DV - Divider pipe (on port 0)\n",
"D - Data fetch pipe (on ports 2 and 3)\n",
"F - Macro Fusion with the previous instruction occurred\n",
"* - instruction micro-ops not bound to a port\n",
"^ - Micro Fusion occurred\n",
"# - ESP Tracking sync uop was issued\n",
"@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected\n",
"X - instruction not supported, was not accounted in Analysis\n",
"\n",
"| Num Of | Ports pressure in cycles | |\n",
"| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |\n",
"-----------------------------------------------------------------------------------------\n",
"| 1 | | | | | | | 1.0 | | add ecx, 0x20\n",
"| 1 | | 1.0 | | | | | | | vpaddd ymm14, ymm9, ymm5\n",
"| 2 | 1.0 | | | | | 1.0 | | | vcvtdq2pd zmm8, ymm9\n",
"| 1 | | | | | | 1.0 | | | vaddpd zmm10, zmm1, zmm8\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm11, zmm2, zmm10\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm11, zmm11, zmm0\n",
"| 1* | | | | | | | | | vmovaps zmm29, zmm0\n",
"| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm13, zmm11\n",
"| 2^ | | | 1.0 1.0 | | | 1.0 | | | vfnmadd213pd zmm11, zmm13, qword ptr [rip]{1to8}\n",
"| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm13, 0x1e\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm11, zmm11\n",
"| 1 | 1.0 | | | | | | | | knotw k1, k0\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm13{k1}, zmm11, zmm13\n",
"| 1 | 1.0 | | | | | | | | vfmadd213pd zmm13{k1}, zmm12, zmm13\n",
"| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6, zmm13, zmm4\n",
"| 1 | | 1.0 | | | | | | | vpaddd ymm20, ymm14, ymm5\n",
"| 2 | 1.0 | | | | | 1.0 | | | vcvtdq2pd zmm15, ymm14\n",
"| 1 | 1.0 | | | | | | | | vaddpd zmm16, zmm1, zmm15\n",
"| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm2, zmm16\n",
"| 1 | 1.0 | | | | | | | | vfmadd213pd zmm17, zmm17, zmm0\n",
"| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm17\n",
"| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm17, zmm19, qword ptr [rip]{1to8}\n",
"| 1 | | | | | | 1.0 | | | vfpclasspd k2, zmm19, 0x1e\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17\n",
"| 1 | 1.0 | | | | | | | | knotw k3, k2\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm19{k3}, zmm17, zmm19\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm19{k3}, zmm18, zmm19\n",
"| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm4\n",
"| 1 | | 1.0 | | | | | | | vpaddd ymm26, ymm20, ymm5\n",
"| 2 | 1.0 | | | | | 1.0 | | | vcvtdq2pd zmm21, ymm20\n",
"| 1 | | | | | | 1.0 | | | vaddpd zmm22, zmm1, zmm21\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm2, zmm22\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm23, zmm23, zmm0\n",
"| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm25, zmm23\n",
"| 2^ | | | 1.0 1.0 | | | 1.0 | | | vfnmadd213pd zmm23, zmm25, qword ptr [rip]{1to8}\n",
"| 1 | | | | | | 1.0 | | | vfpclasspd k4, zmm25, 0x1e\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm24, zmm23, zmm23\n",
"| 1 | 1.0 | | | | | | | | knotw k5, k4\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm25{k5}, zmm23, zmm25\n",
"| 1 | 1.0 | | | | | | | | vfmadd213pd zmm25{k5}, zmm24, zmm25\n",
"| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6, zmm25, zmm4\n",
"| 2 | 1.0 | | | | | 1.0 | | | vcvtdq2pd zmm27, ymm26\n",
"| 1 | | 1.0 | | | | | | | vpaddd ymm9, ymm26, ymm5\n",
"| 1 | 1.0 | | | | | | | | vaddpd zmm28, zmm1, zmm27\n",
"| 1 | | | | | | 1.0 | | | vmulpd zmm8, zmm2, zmm28\n",
"| 1 | 1.0 | | | | | | | | vfmadd231pd zmm29, zmm8, zmm8\n",
"| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm31, zmm29\n",
"| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm29, zmm31, qword ptr [rip]{1to8}\n",
"| 1 | | | | | | 1.0 | | | vfpclasspd k6, zmm31, 0x1e\n",
"| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm29, zmm29\n",
"| 1 | 1.0 | | | | | | | | knotw k7, k6\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm31{k7}, zmm29, zmm31\n",
"| 1 | | | | | | 1.0 | | | vfmadd213pd zmm31{k7}, zmm30, zmm31\n",
"| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm31, zmm4\n",
"| 1* | | | | | | | | | cmp ecx, edx\n",
"| 0*F | | | | | | | | | jb 0xfffffffffffffeb3\n",
"Total Num Of Uops: 71\n",
"Analysis Notes:\n",
"Backend allocation was stalled due to unavailable allocation resources.\n",
"</pre>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@@ -307,7 +973,8 @@
" for l in r['analyzed kernel']\n",
" if l['instruction']]))\n",
"for a in archs:\n",
" print(a, 'has', len(df[df.arch == a]), 'tests, compiled to', len(set(list(df[df.arch == a]['kernel_index']))), 'unique assembly representations.')"
" print(a, 'has', len(df[df.arch == a]), 'tests, compiled to', len(set(list(df[df.arch == a]['kernel_index']))), 'unique assembly representations.')\n",
"get_info((\"SKX\", \"icc\", \"O3\", \"pi\"))"
]
},
{
@@ -343,7 +1010,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 25,
"metadata": {
"hideCode": false,
"hidePrompt": false,