Merge pull request #27 from RRZE-HPC/imported_intel_models

Imported intel models
This commit is contained in:
Jan
2019-04-30 08:28:06 +02:00
committed by GitHub
16 changed files with 40700 additions and 428 deletions

View File

@@ -1,2 +1,2 @@
name = "osaca"
__version__ = '0.2.0'
__version__ = '0.2.1'

3669
osaca/data/CFL_data.csv Normal file

File diff suppressed because it is too large Load Diff

3669
osaca/data/KBL_data.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

223
osaca/data/model_importer.py Executable file
View File

@@ -0,0 +1,223 @@
#!/usr/bin/env python3
from collections import defaultdict, OrderedDict
import xml.etree.ElementTree as ET
import re
import sys
import argparse
from distutils.version import StrictVersion
from osaca.param import Parameter, Register
from osaca.eu_sched import Scheduler
def normalize_reg_name(reg_name):
    """Return a normalized spelling of a register name taken from uops.info.

    Surrounding whitespace is removed, a mask register written in curly
    brackets (``{K1}``) becomes ``K1`` and ``ST(3)`` becomes ``ST3``.
    Index digits outside 0-7 are left untouched.

    :param reg_name: register name as found in the XML
    :return: normalized register name (str)
    """
    rewrites = (
        # masks are denoted with curly brackets in uops.info
        (r'{K([0-7])}', r'K\1'),
        # drop the parentheses around the ST index
        (r'ST\(([0-7])\)', r'ST\1'),
    )
    normalized = reg_name.strip()
    for pattern, replacement in rewrites:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized
def port_occupancy_from_tag_attributes(attrib, arch):
    """Translate the ``port*`` attributes of an XML tag into per-port occupancy.

    Every attribute whose name starts with ``port`` followed by digits names a
    group of single-digit ports; its integer value is split evenly among the
    ports of that group.  If a ``div_cycles`` attribute is present, its value
    is stored as the occupancy of the ``'0DV'`` pipeline.

    :param attrib: attribute mapping of a measurement or IACA tag
    :param arch: architecture abbreviation (e.g. ``'SKL'``)
    :return: dict mapping port name (str) to occupancy
    """
    port7_ignored_on = ['HSW', 'BDW', 'SKL', 'SKX']
    occupancy = defaultdict(int)
    for name, cycles in attrib.items():
        match = re.match('^port([0-9]+)', name)
        if match is None:
            # not a port attribute
            continue
        port_group = match.group(1)
        # Ignore Port7 on HSW, BDW, SKL and SKX if present in combination with ports 2 and 3.
        # Port7 is only used for simple address generation, while 2 and 3 handle all addressing,
        # but uops.info does not differentiate.
        if port_group == '237' and arch in port7_ignored_on:
            port_group = port_group.replace('7', '')
        # Each character of the group is one port; all get the same share.
        share = int(cycles) / len(port_group)
        for port in port_group:
            occupancy[port] += share
    # Also consider DIV pipeline
    if 'div_cycles' in attrib:
        occupancy['0DV'] = int(attrib['div_cycles'])
    return dict(occupancy)
def extract_paramters(instruction_tag):
    """Build the list of operand descriptors for one ``instruction`` tag.

    The ``operand`` children are visited in ascending ``idx`` order and
    operands flagged ``suppressed`` are skipped.  Immediates become ``'imd'``,
    ``mem`` and ``agen`` operands become ``'mem'`` and relative branch targets
    become ``'LBL'``.  Register operands are resolved through
    ``Register.sizes`` (entries are indexed as ``[0]`` and ``[1]`` here;
    presumably (size, register class) -- confirm against osaca.param).

    :param instruction_tag: XML element of one instruction
    :return: list of descriptor strings
    :raises ValueError: for an unknown register or an unknown operand type
    """
    operand_tags = instruction_tag.findall("operand")
    operand_tags.sort(key=lambda tag: int(tag.attrib['idx']))

    parameters = []  # string representations, in operand order
    for operand in operand_tags:
        # Ignore parameters with suppressed=1
        if int(operand.attrib.get('suppressed', '0')):
            continue

        kind = operand.attrib['type']
        if kind == 'imm':
            parameters.append('imd')
        elif kind in ('mem', 'agen'):
            parameters.append('mem')
        elif kind == 'relbr':
            parameters.append('LBL')
        elif kind == 'reg':
            candidates = [normalize_reg_name(r) for r in operand.text.split(',')]
            groups = [Register.sizes.get(name, None) for name in candidates]
            if groups[1:] != groups[:-1]:
                # The alternatives do not all resolve to the same entry;
                # no descriptor is emitted for this operand.
                continue
            group = groups[0]
            if group is None:
                raise ValueError("Unknown register type for {} with {}.".format(
                    operand.attrib, operand.text))
            if group[1] == 'GPR':
                parameters.append('r{}'.format(group[0]))
            elif '{' in operand.text:
                # We have a mask: it decorates the previously emitted operand
                parameters[-1] += '{opmask}'
            else:
                parameters.append(group[1].lower())
        else:
            raise ValueError("Unknown paramter type {}".format(operand.attrib))
    return parameters
def extract_model(tree, arch):
    """Collect throughput, latency and port occupancy of all instruction forms for *arch*.

    For every ``instruction`` tag that carries data for *arch*, the port
    occupancies of all ``measurement`` and ``IACA`` tags are gathered; if they
    disagree, the last one (IACA tags are ordered by version, newest last) is
    used.  Throughput is the largest per-port occupancy.  Instructions whose
    operands cannot be interpreted, that have contradicting latencies or that
    have no occupancy data are skipped.

    :param tree: parsed uops.info XML (ElementTree or Element)
    :param arch: architecture abbreviation as used in the XML (e.g. ``'SKL'``)
    :return: list of ``(form, throughput, latency, port_occupancy)`` tuples,
        where *form* is ``'<mnemonic>-<param>_<param>...'`` in lower-case
        mnemonic spelling and *latency* may be ``None``
    """
    model_data = []
    # Relative descendant search ('.//'), as in architectures(): a path with a
    # leading '/' triggers a FutureWarning on ElementTree objects and is
    # rejected outright on Element objects.
    for instruction_tag in tree.findall('.//instruction'):
        ignore = False

        mnemonic = instruction_tag.attrib['asm']

        # Extract parameter components
        try:
            parameters = extract_paramters(instruction_tag)
        except ValueError as e:
            print(e, file=sys.stderr)
            # Without usable operands no instruction form can be named.
            # Continuing would reuse the previous instruction's parameters
            # (or fail with a NameError on the very first instruction).
            continue

        # Extract port occupation, throughput and latency
        port_occupancy, throughput, latency = [], 0.0, None
        arch_tag = instruction_tag.find('architecture[@name="'+arch+'"]')
        if arch_tag is None:
            continue

        # We collect all measurement and IACA information and compare them later
        for measurement_tag in arch_tag.iter('measurement'):
            port_occupancy.append(
                port_occupancy_from_tag_attributes(measurement_tag.attrib, arch))
            # FIXME handle min/max Latencies ('maxCycles' and 'minCycles')
            # Guard on the attribute that is actually read ('cycles').
            latencies = [int(l_tag.attrib['cycles'])
                         for l_tag in measurement_tag.iter('latency')
                         if 'cycles' in l_tag.attrib]
            if latencies[1:] != latencies[:-1]:
                print("Contradicting latencies found:", mnemonic, file=sys.stderr)
                ignore = True
            elif latencies:
                latency = latencies[0]

        # Ordered by IACA version (newest last)
        for iaca_tag in sorted(arch_tag.iter('IACA'),
                               key=lambda i: StrictVersion(i.attrib['version'])):
            port_occupancy.append(port_occupancy_from_tag_attributes(iaca_tag.attrib, arch))
        if ignore:
            continue

        if not port_occupancy:
            # No data available for this architecture
            continue

        # Check if all are equal
        if port_occupancy[1:] != port_occupancy[:-1]:
            print("Contradicting port occupancies, using latest IACA:", mnemonic,
                  file=sys.stderr)
        port_occupancy = port_occupancy[-1]
        throughput = max(list(port_occupancy.values())+[0.0])

        for m, p in build_variants(mnemonic, parameters):
            model_data.append((m.lower() + '-' + '_'.join(p),
                               throughput, latency, port_occupancy))
    return model_data
def all_or_false(iterator):
    """Return True only if *iterator* is non-empty and all of its items are true.

    Unlike the builtin ``all``, an empty input yields ``False``.  Falsy
    arguments (empty sequences, ``None``) return ``False`` directly.

    :param iterator: sequence or any other iterable of values
    :return: bool
    """
    if not iterator:
        return False
    # One-shot iterators and generators are always truthy, so emptiness can
    # only be detected after materializing the items.
    items = list(iterator)
    return bool(items) and all(items)
def build_variants(mnemonic, parameters):
    """Yield all reasonable variants of this instruction form.

    Variants are ``(mnemonic, parameters)`` pairs with an upper-case
    mnemonic: the form as given, the form with every ``{opmask}`` marker
    removed (only if one is present), and forms with a size suffix appended
    to the mnemonic.
    """
    name = mnemonic.upper()

    # The one that was given
    yield name, parameters

    # Without opmask
    for param in parameters:
        if '{opmask}' in param:
            yield name, [p.replace('{opmask}', '') for p in parameters]
            break

    # With suffix (assuming suffix was not already present): only when there
    # is at least one non-memory, non-immediate operand and all of them are
    # registers of the width that belongs to the suffix.
    register_params = [p for p in parameters if p not in ('mem', 'imd')]
    for suffix, reg in (('Q', 'r64'), ('L', 'r32'), ('W', 'r16'), ('B', 'r8')):
        if name.endswith(suffix):
            continue
        if register_params and all(p == reg for p in register_params):
            yield name + suffix, parameters
def architectures(tree):
    """Return the set of ``name`` attributes of all ``architecture`` tags in *tree*."""
    names = set()
    for arch_tag in tree.findall('.//architecture'):
        names.add(arch_tag.attrib['name'])
    return names
def int_or_zero(s):
    """Return ``int(s)``, or 0 if the conversion raises ``ValueError``."""
    try:
        value = int(s)
    except ValueError:
        value = 0
    return value
def dump_csv(model_data, arch):
    """Render *model_data* as CSV text with the columns ``instr,TP,LT,ports``.

    The ``ports`` column is a quoted, parenthesized, comma-separated list with
    one occupancy value per port, ordered by port name.  Ports that an
    instruction does not use are reported as ``0.0``.

    :param model_data: iterable of ``(mnemonic, throughput, latency,
        port_occupancy)`` tuples as produced by ``extract_model``; the
        occupancy dicts are not modified
    :param arch: architecture abbreviation; must be a key of
        ``Scheduler.arch_dict``
    :return: the CSV document as one string
    :raises KeyError: if *arch* is unknown to ``Scheduler.arch_dict``
    """
    model_data = list(model_data)
    used_ports = {p for _, _, _, port_occupancy in model_data for p in port_occupancy}

    # Not all ports necessarily show up in the data (happens with port7 due to
    # the blacklist in port_occupancy_from_tag_attributes).  Complete the set
    # with the numbered ports and pipeline ports configured for this
    # architecture, instead of inventing "highest + 1" names, which fails when
    # no numbered port was used and picks the wrong name when the missing
    # port is not the last one.
    # NOTE(review): assumes arch_dict holds the number of ports named
    # '0'..'n-1', as the previous padding logic did -- confirm.
    expected_ports = {str(i) for i in range(Scheduler.arch_dict[arch])}
    expected_ports.update(Scheduler.arch_pipeline_ports.get(arch, []))
    ports = sorted(used_ports | expected_ports)

    lines = ['instr,TP,LT,ports\n']
    for mnemonic, throughput, latency, port_occupancy in model_data:
        # Work on a copy so the caller's dicts stay untouched.
        filled = dict.fromkeys(ports, 0.0)
        filled.update(port_occupancy)
        occupancies = ','.join(str(cycles) for _, cycles in sorted(filled.items()))
        lines.append('{},{},{},"({})"\n'.format(mnemonic, throughput, latency, occupancies))
    # Single join instead of repeated string concatenation.
    return ''.join(lines)
def main():
    """Command line entry point.

    Parses the uops.info XML file given as first argument.  With an
    architecture argument the model of that architecture is printed to
    stdout as CSV; without it, one ``<arch>_data.csv`` file per architecture
    found in the XML is written to the current working directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('xml', help='path of instructions.xml from http://uops.info')
    cli.add_argument('arch', nargs='?',
                     help='architecture to extract, use IACA abbreviations (e.g., SNB). '
                          'if not given, all will be extracted and saved to file in CWD.')
    options = cli.parse_args()

    tree = ET.parse(options.xml)

    if not options.arch:
        # NOTE(review): dump_csv looks the architecture up in
        # Scheduler.arch_dict; an architecture present in the XML but missing
        # there would abort this loop with a KeyError -- confirm intended.
        for arch in architectures(tree):
            model_data = extract_model(tree, arch)
            with open('{}_data.csv'.format(arch), 'w') as csv_file:
                csv_file.write(dump_csv(model_data, arch))
        return

    model_data = extract_model(tree, options.arch)
    print(dump_csv(model_data, options.arch))


if __name__ == '__main__':
    main()

2434
osaca/data/nhm_data.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

9664
osaca/data/skx_data.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

2446
osaca/data/wsm_data.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -12,8 +12,23 @@ from osaca.param import Register, MemAddr
class Scheduler(object):
arch_dict = {'SNB': 6, 'IVB': 6, 'HSW': 8, 'BDW': 8, 'SKL': 8, 'ZEN': 10}
dv_ports_dict = {'SKL': [0], 'ZEN': [3]}
arch_dict = {
# Intel
'NHM': 5, 'WSM': 5, # Nehalem, Westmere
'SNB': 6, 'IVB': 6, # Sandy Bridge, Ivy Bridge
'HSW': 8, 'BDW': 8, # Haswell, Broadwell
'SKL': 8, 'SKX': 8, # Skylake(-X)
'KBL': 8, 'CFL': 8, # Kaby Lake, Coffee Lake
# AMD
'ZEN': 10, # Zen/Ryzen/EPYC
}
arch_pipeline_ports = {
'NHM': ['0DV'], 'WSM': ['0DV'],
'SNB': ['0DV'], 'IVB': ['0DV'],
'HSW': ['0DV'], 'BDW': ['0DV'],
'SKL': ['0DV'], 'SKX': ['0DV'],
'KBL': ['0DV'], 'CFL': ['0DV'],
'ZEN': ['3DV'],}
# content of most inner list in instrList: instr, operand(s), instr form
df = None # type: DataFrame
# for parallel ld/st in archs with 1 st/cy and >1 ld/cy, able to do 1 st and 1 ld in 1cy
@@ -33,7 +48,7 @@ class Scheduler(object):
self.en_par_ldst = True
self.ld_ports = [9, 10]
# check for DV port
self.dv_ports = self.dv_ports_dict.get(arch, [])
self.pipeline_ports = self.arch_pipeline_ports.get(arch, [])
self.instrList = instruction_list
# curr_dir = os.path.realpath(__file__)[:-11]
osaca_dir = os.path.expanduser('~/.osaca/')
@@ -60,8 +75,8 @@ class Scheduler(object):
sched = self.get_head()
# Initialize ports
# Add DV port, if it is existing
occ_ports = [[0] * (self.ports + len(self.dv_ports)) for x in range(len(self.instrList))]
port_bndgs = [0] * (self.ports + len(self.dv_ports))
occ_ports = [[0] * (self.ports + len(self.pipeline_ports)) for x in range(len(self.instrList))]
port_bndgs = [0] * (self.ports + len(self.pipeline_ports))
# Store instruction counter for parallel ld/st
par_ldst = 0
# Count the number of store instr if we schedule for an architecture with par ld/st
@@ -74,8 +89,8 @@ class Scheduler(object):
# Check if there's a port occupation stored in the CSV, otherwise leave the
# occ_port list item empty
for i, instrForm in enumerate(self.instrList):
search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
try:
search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
tup = entry.ports.values[0]
if len(tup) == 1 and tup[0] == -1:
@@ -92,13 +107,14 @@ class Scheduler(object):
p_flg = ''
if self.en_par_ldst:
# Check for ld
# FIXME remove special load handling from here and place in machine model
if (isinstance(instrForm[-2], MemAddr) or
(len(instrForm) > 4 and isinstance(instrForm[2], MemAddr))):
if par_ldst > 0:
par_ldst -= 1
p_flg = 'P '
for port in self.ld_ports:
occ_ports[i][port] = '(' + str(occ_ports[i][port]) + ')'
occ_ports[i][port] = 0.0 # '(' + str(occ_ports[i][port]) + ')'
# Write schedule line
if len(p_flg) > 0:
sched += self.format_port_occupation_line(occ_ports[i], p_flg + instrForm[-1])
@@ -361,14 +377,7 @@ class Scheduler(object):
:return: list of strings
"""
port_names = []
dv_ports_appended = 0
for i in range(self.ports):
port_names.append(str(i))
if i in self.dv_ports:
dv_ports_appended += 1
port_names.append(str(i)+'DV')
return port_names
return sorted([str(i) for i in range(self.ports)] + self.pipeline_ports)
def get_port_binding(self, port_bndg):
"""

View File

@@ -364,7 +364,7 @@ class OSACA(object):
longestInstr = 30
machine_readable = False
VALID_ARCHS = ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'ZEN']
VALID_ARCHS = Scheduler.arch_dict
def __init__(self, arch, assembly, extract_with_markers=True):
"""
@@ -574,6 +574,15 @@ class OSACA(object):
(port_name, port_binding[i])
for i, port_name in enumerate(self.schedule.get_port_naming())])
def get_unmatched_instruction_ratio(self):
    """
    Return the share of unmatched instructions among all instruction forms.

    The number of ``'| X '`` occurrences in the schedule text returned by
    ``self.schedule.new_schedule()`` is divided by ``len(self.instr_forms)``.

    :return: float
    """
    schedule_text, _ = self.schedule.new_schedule()
    unmatched = schedule_text.count('| X ')
    total = len(self.instr_forms)
    return unmatched / total
def get_total_throughput(self):
"""
Return total cycles estimated per block execution. Including (potential) penalties.

View File

@@ -13,35 +13,57 @@ from osaca import osaca
class TestOsaca(unittest.TestCase):
maxDiff = None
def setUp(self):
    """Store the directory of this test module in ``self.curr_dir``."""
    # os.path.dirname instead of splitting on '/' by hand: the manual split
    # yields an empty string when the resolved path uses another separator.
    self.curr_dir = os.path.dirname(os.path.realpath(__file__))
@unittest.skip("Binary analysis is error prone and currently not working with FSF's objdump")
def testIACABinary(self):
curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
assembly = osaca.get_assembly_from_binary(curr_dir + '/testfiles/taxCalc-ivb-iaca')
assembly = osaca.get_assembly_from_binary(self.curr_dir + '/testfiles/taxCalc-ivb-iaca')
osa = osaca.OSACA('IVB', assembly)
result = osa.generate_text_output()
result = result[result.find('Port Binding in Cycles Per Iteration:'):]
with open(curr_dir + '/test_osaca_iaca.out', encoding='utf-8') as f:
with open(self.curr_dir + '/test_osaca_iaca.out', encoding='utf-8') as f:
assertion = f.read()
self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
# Test ASM file with IACA marker in two lines
def testIACAasm1(self):
curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
with open(curr_dir + '/testfiles/taxCalc-ivb-iaca.S') as f:
with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca.S') as f:
osa = osaca.OSACA('IVB', f.read())
result = osa.generate_text_output()
result = result[result.find('Port Binding in Cycles Per Iteration:'):]
with open(curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
assertion = f.read()
self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
# Test ASM file with IACA marker in four lines
def testIACAasm2(self):
curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
with open(curr_dir + '/testfiles/taxCalc-ivb-iaca2.S') as f:
with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca2.S') as f:
osa = osaca.OSACA('IVB', f.read())
result = osa.generate_text_output()
result = result[result.find('Port Binding in Cycles Per Iteration:'):]
with open(curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
assertion = f.read()
self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
#@unittest.skip("Skip until required instructions are supported.")
def test_asm_API(self):
    """Analyze the IACA-marked SKX assembly fixture through the OSACA API.

    Checks the per-port occupation cycles and the total throughput reported
    for architecture 'SKX'.
    """
    fixture = self.curr_dir + '/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s'
    with open(fixture) as asm_file:
        analysis = osaca.OSACA('SKX', asm_file.read())

    print(analysis.create_output())

    # Derived from IACA (and manually considering OSACAs equal distribution to ports)
    expected_occupation = {
        '0': 4.0,
        '0DV': 0.0,
        '1': 3.5,
        '2': 3.5,
        '3': 3.5,
        '4': 1.0,
        '5': 4.5,
        '6': 3.5,
        '7': 0.0,
    }
    self.assertEqual(dict(analysis.get_port_occupation_cycles()), expected_occupation)

    # TODO consider frontend bottleneck -> 6.25 cy
    self.assertEqual(analysis.get_total_throughput(), 4.5)

View File

@@ -0,0 +1,653 @@
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 14
.globl _main ## -- Begin function main
.p2align 4, 0x90
_main: ## @main
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $408, %rsp ## imm = 0x198
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
movq %rsi, %rbx
movq 16(%rsi), %rdi
callq _atoi
movl %eax, %r14d
movq 24(%rbx), %rdi
callq _atoi
## kill: def $eax killed $eax def $rax
movq %r14, -96(%rbp) ## 8-byte Spill
movl %r14d, %ecx
imull %r14d, %ecx
movl %ecx, -88(%rbp) ## 4-byte Spill
movq %rax, -72(%rbp) ## 8-byte Spill
imull %eax, %ecx
movslq %ecx, %r13
shlq $3, %r13
leaq -56(%rbp), %rdi
movl $32, %esi
movq %r13, %rdx
callq _posix_memalign
testl %eax, %eax
je LBB0_2
## %bb.1:
movq $0, -56(%rbp)
xorl %ebx, %ebx
jmp LBB0_3
LBB0_2:
movq -56(%rbp), %rbx
LBB0_3:
leaq -56(%rbp), %rdi
movl $32, %esi
movq %r13, %rdx
callq _posix_memalign
testl %eax, %eax
je LBB0_5
## %bb.4:
movq $0, -56(%rbp)
xorl %eax, %eax
jmp LBB0_6
LBB0_5:
movq -56(%rbp), %rax
LBB0_6:
movq %rax, -80(%rbp) ## 8-byte Spill
movq -96(%rbp), %r9 ## 8-byte Reload
movabsq $4602641980904887326, %rax ## imm = 0x3FDFDE7EEC22D41E
movq %rax, -56(%rbp)
cmpl $3, -72(%rbp) ## 4-byte Folded Reload
jl LBB0_15
## %bb.7:
movabsq $4294967296, %r12 ## imm = 0x100000000
leal -1(%r9), %ecx
movslq %r9d, %rax
movslq -88(%rbp), %rdx ## 4-byte Folded Reload
movq %rdx, -160(%rbp) ## 8-byte Spill
movq -72(%rbp), %rsi ## 8-byte Reload
leal -1(%rsi), %edx
leaq 8(%rbx,%rax,8), %rsi
movq %rsi, -152(%rbp) ## 8-byte Spill
movq -80(%rbp), %rsi ## 8-byte Reload
leaq 8(%rsi,%rax,8), %rsi
movq %rsi, -144(%rbp) ## 8-byte Spill
leaq (,%rax,8), %rsi
movq %rsi, -104(%rbp) ## 8-byte Spill
leaq 2(%rax), %rsi
movq %rsi, -136(%rbp) ## 8-byte Spill
shlq $32, %rax
movq %rax, -184(%rbp) ## 8-byte Spill
addq $-1, %rcx
movl %r9d, %eax
movq %rax, -176(%rbp) ## 8-byte Spill
movl $1, %eax
movabsq $4601149042440805838, %rdi ## imm = 0x3FDA90AD19501DCE
movq %rdx, -208(%rbp) ## 8-byte Spill
.p2align 4, 0x90
LBB0_8: ## =>This Loop Header: Depth=1
## Child Loop BB0_10 Depth 2
## Child Loop BB0_11 Depth 3
cmpl $2, %r9d
jle LBB0_14
## %bb.9: ## in Loop: Header=BB0_8 Depth=1
movl %eax, %r14d
imull -88(%rbp), %r14d ## 4-byte Folded Reload
leaq 1(%rax), %r8
movq -160(%rbp), %rdx ## 8-byte Reload
movq %rdx, %rsi
movq %r8, -168(%rbp) ## 8-byte Spill
imulq %r8, %rsi
movq -152(%rbp), %r10 ## 8-byte Reload
leaq (%r10,%rsi,8), %r8
leaq -1(%rax), %rsi
imulq %rdx, %rsi
leaq (%r10,%rsi,8), %r10
movq %rax, %rsi
imulq %rdx, %rsi
movq -144(%rbp), %rdx ## 8-byte Reload
leaq (%rdx,%rsi,8), %r11
addl -136(%rbp), %esi ## 4-byte Folded Reload
shlq $32, %rsi
movl %r9d, %r15d
imull %eax, %r15d
leal 2(%r15), %r13d
imull %r9d, %r13d
addl $1, %r13d
addq $1, %r14
addl $1, %r15d
imull %r9d, %r15d
movl $1, %eax
.p2align 4, 0x90
LBB0_10: ## Parent Loop BB0_8 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_11 Depth 3
movq %rax, -112(%rbp) ## 8-byte Spill
leaq 1(%rax), %rax
movq %rax, -192(%rbp) ## 8-byte Spill
movq %rsi, -120(%rbp) ## 8-byte Spill
xorl %edx, %edx
.p2align 4, 0x90
LBB0_11: ## Parent Loop BB0_8 Depth=1
## Parent Loop BB0_10 Depth=2
## => This Inner Loop Header: Depth=3
movq %rdi, (%r11,%rdx,8)
leal (%r15,%rdx), %r9d
movslq %r9d, %rax
movq %rdi, (%rbx,%rax,8)
movq %rsi, %rax
sarq $29, %rax
movq %rdi, (%rbx,%rax)
leal (%r14,%rdx), %eax
cltq
movq %rdi, (%rbx,%rax,8)
leal (%r13,%rdx), %eax
cltq
movq %rdi, (%rbx,%rax,8)
movq %rdi, (%r10,%rdx,8)
movq %rdi, (%r8,%rdx,8)
addq $1, %rdx
addq %r12, %rsi
cmpq %rdx, %rcx
jne LBB0_11
## %bb.12: ## in Loop: Header=BB0_10 Depth=2
movq -104(%rbp), %rax ## 8-byte Reload
addq %rax, %r8
addq %rax, %r10
addq %rax, %r11
movq -120(%rbp), %rsi ## 8-byte Reload
addq -184(%rbp), %rsi ## 8-byte Folded Reload
movq -176(%rbp), %rax ## 8-byte Reload
addq %rax, %r13
addq %rax, %r14
addq %rax, %r15
cmpq %rdx, -112(%rbp) ## 8-byte Folded Reload
movq -192(%rbp), %rax ## 8-byte Reload
jne LBB0_10
## %bb.13: ## in Loop: Header=BB0_8 Depth=1
movq -168(%rbp), %rsi ## 8-byte Reload
movq %rsi, %rax
movq -96(%rbp), %r9 ## 8-byte Reload
movq -208(%rbp), %rdx ## 8-byte Reload
cmpq %rdx, %rsi
jne LBB0_8
jmp LBB0_15
.p2align 4, 0x90
LBB0_14: ## in Loop: Header=BB0_8 Depth=1
addq $1, %rax
movq %rax, %rsi
cmpq %rdx, %rsi
jne LBB0_8
LBB0_15:
movq _var_false@GOTPCREL(%rip), %rax
cmpl $0, (%rax)
je LBB0_17
## %bb.16:
movq %rbx, %rdi
callq _dummy
movq -80(%rbp), %rdi ## 8-byte Reload
callq _dummy
leaq -56(%rbp), %rdi
callq _dummy
movq -96(%rbp), %r9 ## 8-byte Reload
LBB0_17:
cmpl $3, -72(%rbp) ## 4-byte Folded Reload
jl LBB0_59
## %bb.18:
movabsq $4294967296, %r14 ## imm = 0x100000000
leal -1(%r9), %ecx
movslq %r9d, %rsi
movslq -88(%rbp), %rax ## 4-byte Folded Reload
movq %rax, -312(%rbp) ## 8-byte Spill
movq -72(%rbp), %rax ## 8-byte Reload
addl $-1, %eax
movq %rax, -72(%rbp) ## 8-byte Spill
leaq -1(%rcx), %rax
leaq -2(%rcx), %rdi
movq %rdi, -424(%rbp) ## 8-byte Spill
leaq 1(%rsi), %rdi
movq %rdi, -224(%rbp) ## 8-byte Spill
leaq (%rsi,%rcx), %rdi
movq %rdi, -304(%rbp) ## 8-byte Spill
movl %r9d, %edi
movq %rdi, -256(%rbp) ## 8-byte Spill
movq %rcx, -264(%rbp) ## 8-byte Spill
leaq (%rbx,%rcx,8), %rcx
addq $-8, %rcx
movq %rcx, -352(%rbp) ## 8-byte Spill
leal 6(%r9), %ecx
andl $7, %ecx
movq %rax, -448(%rbp) ## 8-byte Spill
movq %rcx, -344(%rbp) ## 8-byte Spill
subq %rcx, %rax
movq %rsi, %rcx
shlq $32, %rcx
movq %rcx, -440(%rbp) ## 8-byte Spill
leaq 1(%rax), %rcx
movq %rcx, -328(%rbp) ## 8-byte Spill
movq %rax, -336(%rbp) ## 8-byte Spill
leal 1(%rax), %eax
movl %eax, -212(%rbp) ## 4-byte Spill
leaq 2(%rsi), %rax
movq %rax, -296(%rbp) ## 8-byte Spill
movq -80(%rbp), %rax ## 8-byte Reload
leaq 8(%rax,%rsi,8), %rax
movq %rax, -288(%rbp) ## 8-byte Spill
leaq (,%rsi,8), %rax
movq %rax, -432(%rbp) ## 8-byte Spill
movq %rsi, -200(%rbp) ## 8-byte Spill
leaq (%rbx,%rsi,8), %rax
addq $8, %rax
movq %rax, -280(%rbp) ## 8-byte Spill
movl $1, %eax
.p2align 4, 0x90
LBB0_19: ## =>This Loop Header: Depth=1
## Child Loop BB0_52 Depth 2
## Child Loop BB0_37 Depth 3
## Child Loop BB0_55 Depth 3
cmpl $2, %r9d
jle LBB0_58
## %bb.20: ## in Loop: Header=BB0_19 Depth=1
movq %rax, %rcx
movq %rax, %r12
movq -312(%rbp), %r15 ## 8-byte Reload
imulq %r15, %r12
leaq 1(%rax), %rax
movl %r9d, %edi
imull %ecx, %edi
leal 1(%rdi), %r8d
imull %r9d, %r8d
addl $2, %edi
imull %r9d, %edi
movq %rax, -320(%rbp) ## 8-byte Spill
movq %rax, %r13
imulq %r15, %r13
movq -224(%rbp), %rdx ## 8-byte Reload
leaq (%rdx,%r13), %rax
movq %rax, -408(%rbp) ## 8-byte Spill
movq -304(%rbp), %rsi ## 8-byte Reload
leaq (%rsi,%r13), %rax
movq %rax, -400(%rbp) ## 8-byte Spill
addq $-1, %rcx
imulq %r15, %rcx
leaq (%rdx,%rcx), %rax
movq %rax, -392(%rbp) ## 8-byte Spill
leaq (%rsi,%rcx), %rax
movq %rax, -384(%rbp) ## 8-byte Spill
movq -296(%rbp), %rax ## 8-byte Reload
leal (%rax,%r12), %eax
shlq $32, %rax
movq %rax, -104(%rbp) ## 8-byte Spill
movq -280(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r13,8), %r10
leaq (%rax,%rcx,8), %r11
movl %r12d, %edx
addq $1, %rdx
movq -200(%rbp), %rax ## 8-byte Reload
addq %rax, %r13
movq %r13, -144(%rbp) ## 8-byte Spill
addq %rax, %rcx
movq %rcx, -152(%rbp) ## 8-byte Spill
leal 2(%r8), %eax
movq %rax, -240(%rbp) ## 8-byte Spill
leal 1(%r12), %eax
movq %rax, -416(%rbp) ## 8-byte Spill
movq %rdi, %rax
movq %rdi, -112(%rbp) ## 8-byte Spill
leal 1(%rdi), %r15d
movq -224(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r12), %rcx
leaq (%rsi,%r12), %rax
movq %rax, -368(%rbp) ## 8-byte Spill
movq -288(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r12,8), %rsi
leaq -8(%rax,%r12,8), %rax
movq %rax, -136(%rbp) ## 8-byte Spill
movq %r12, -120(%rbp) ## 8-byte Spill
leaq 1(%r12), %rax
movq %rax, -360(%rbp) ## 8-byte Spill
leal -1(%r8), %eax
movl %eax, -124(%rbp) ## 4-byte Spill
movq %rcx, -376(%rbp) ## 8-byte Spill
movq %rcx, -272(%rbp) ## 8-byte Spill
movq %r8, -248(%rbp) ## 8-byte Spill
movq %r8, %rdi
movq %r15, -232(%rbp) ## 8-byte Spill
movq %r15, %r8
xorl %r12d, %r12d
movl $1, %eax
jmp LBB0_52
.p2align 4, 0x90
LBB0_21: ## in Loop: Header=BB0_52 Depth=2
movl %r9d, %edx
imull %r12d, %edx
movq -248(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %ecx
movq -424(%rbp), %rax ## 8-byte Reload
leal (%rcx,%rax), %esi
cmpl %ecx, %esi
jl LBB0_53
## %bb.22: ## in Loop: Header=BB0_52 Depth=2
movq %rax, %rcx
shrq $32, %rcx
jne LBB0_53
## %bb.23: ## in Loop: Header=BB0_52 Depth=2
movq -240(%rbp), %rsi ## 8-byte Reload
leal (%rsi,%rdx), %esi
leal (%rsi,%rax), %edi
cmpl %esi, %edi
jl LBB0_53
## %bb.24: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.25: ## in Loop: Header=BB0_52 Depth=2
movq -416(%rbp), %rsi ## 8-byte Reload
leal (%rsi,%rdx), %esi
leal (%rsi,%rax), %edi
cmpl %esi, %edi
jl LBB0_53
## %bb.26: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.27: ## in Loop: Header=BB0_52 Depth=2
addl -232(%rbp), %edx ## 4-byte Folded Reload
leal (%rdx,%rax), %esi
cmpl %edx, %esi
jl LBB0_53
## %bb.28: ## in Loop: Header=BB0_52 Depth=2
testq %rcx, %rcx
jne LBB0_53
## %bb.29: ## in Loop: Header=BB0_52 Depth=2
movq -192(%rbp), %rdx ## 8-byte Reload
movq %rdx, %rsi
imulq -200(%rbp), %rsi ## 8-byte Folded Reload
movq -376(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %rdi
movq -368(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r13
movq -408(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r11
movq -400(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %rcx
movq -392(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi), %r10
addq -384(%rbp), %rsi ## 8-byte Folded Reload
## kill: def $edx killed $edx killed $rdx def $rdx
imull -256(%rbp), %edx ## 4-byte Folded Reload
movq -232(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %r12d
movq -360(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %r9d
movq -240(%rbp), %rax ## 8-byte Reload
leal (%rax,%rdx), %eax
movl %eax, -60(%rbp) ## 4-byte Spill
addl -248(%rbp), %edx ## 4-byte Folded Reload
movq -80(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rdi,8), %rdi
leaq (%rbx,%rcx,8), %rcx
cmpq %rcx, %rdi
leaq (%rax,%r13,8), %rcx
leaq (%rbx,%r11,8), %r11
setb -45(%rbp) ## 1-byte Folded Spill
cmpq %rcx, %r11
leaq (%rbx,%r10,8), %r10
leaq (%rbx,%rsi,8), %r11
movslq %r12d, %rsi
setb -44(%rbp) ## 1-byte Folded Spill
cmpq %r11, %rdi
setb %r12b
cmpq %rcx, %r10
leaq (%rbx,%rsi,8), %r10
movq -352(%rbp), %rax ## 8-byte Reload
leaq (%rax,%rsi,8), %rsi
movslq %r9d, %r9
setb -43(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r11b
cmpq %rcx, %r10
leaq (%rbx,%r9,8), %r10
leaq (%rax,%r9,8), %rsi
movslq -60(%rbp), %r9 ## 4-byte Folded Reload
setb -60(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r13b
cmpq %rcx, %r10
leaq (%rbx,%r9,8), %r10
leaq (%rax,%r9,8), %rsi
movslq %edx, %rdx
setb -42(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
setb %r9b
cmpq %rcx, %r10
leaq (%rax,%rdx,8), %rsi
setb -41(%rbp) ## 1-byte Folded Spill
cmpq %rsi, %rdi
leaq (%rbx,%rdx,8), %rdx
setb %r10b
cmpq %rcx, %rdx
setb %dl
leaq -55(%rbp), %rax
cmpq %rdi, %rax
seta %dil
leaq -56(%rbp), %rax
cmpq %rcx, %rax
setb %al
movb -44(%rbp), %cl ## 1-byte Reload
testb %cl, -45(%rbp) ## 1-byte Folded Reload
jne LBB0_53
## %bb.30: ## in Loop: Header=BB0_52 Depth=2
andb -43(%rbp), %r12b ## 1-byte Folded Reload
jne LBB0_53
## %bb.31: ## in Loop: Header=BB0_52 Depth=2
andb -60(%rbp), %r11b ## 1-byte Folded Reload
jne LBB0_53
## %bb.32: ## in Loop: Header=BB0_52 Depth=2
andb -42(%rbp), %r13b ## 1-byte Folded Reload
jne LBB0_53
## %bb.33: ## in Loop: Header=BB0_52 Depth=2
andb -41(%rbp), %r9b ## 1-byte Folded Reload
jne LBB0_53
## %bb.34: ## in Loop: Header=BB0_52 Depth=2
movl $1, %r9d
andb %dl, %r10b
jne LBB0_54
## %bb.35: ## in Loop: Header=BB0_52 Depth=2
andb %al, %dil
jne LBB0_54
## %bb.36: ## in Loop: Header=BB0_52 Depth=2
vbroadcastsd -56(%rbp), %zmm0
movq -104(%rbp), %rdx ## 8-byte Reload
xorl %esi, %esi
movq -336(%rbp), %r9 ## 8-byte Reload
movabsq $34359738368, %rdi ## imm = 0x800000000
movq %rdi, %r10
movq -184(%rbp), %r11 ## 8-byte Reload
movq -176(%rbp), %r15 ## 8-byte Reload
movq -168(%rbp), %r12 ## 8-byte Reload
movq -88(%rbp), %rdi ## 8-byte Reload
movq -160(%rbp), %rax ## 8-byte Reload
.p2align 4, 0x90
movl $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
LBB0_37: ## Parent Loop BB0_19 Depth=1
## Parent Loop BB0_52 Depth=2
## => This Inner Loop Header: Depth=3
leal (%rax,%rsi), %ecx
movslq %ecx, %rcx
vmovupd (%rbx,%rcx,8), %zmm1
movq %rdx, %rcx
sarq $29, %rcx
vaddpd (%rbx,%rcx), %zmm1, %zmm1
leal (%r12,%rsi), %ecx
movslq %ecx, %rcx
vaddpd (%rbx,%rcx,8), %zmm1, %zmm1
leal (%r8,%rsi), %ecx
movslq %ecx, %rcx
vaddpd (%rbx,%rcx,8), %zmm1, %zmm1
vaddpd (%r15,%rsi,8), %zmm1, %zmm1
vaddpd (%r11,%rsi,8), %zmm1, %zmm1
vmulpd %zmm0, %zmm1, %zmm1
vmovupd %zmm1, (%rdi,%rsi,8)
addq $8, %rsi
addq %r10, %rdx
cmpq %rsi, %r9
jne LBB0_37
movl $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 100 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 103 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
.byte 144 # INSERTED BY KERNCRAFT IACA MARKER UTILITY
## %bb.38: ## in Loop: Header=BB0_52 Depth=2
movq -328(%rbp), %r9 ## 8-byte Reload
movl -212(%rbp), %eax ## 4-byte Reload
movl %eax, %r15d
cmpl $0, -344(%rbp) ## 4-byte Folded Reload
jne LBB0_54
jmp LBB0_56
.p2align 4, 0x90
LBB0_52: ## Parent Loop BB0_19 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_37 Depth 3
## Child Loop BB0_55 Depth 3
movq %rdx, -168(%rbp) ## 8-byte Spill
addq $1, %rax
movl $1, %r15d
cmpq $8, -448(%rbp) ## 8-byte Folded Reload
movq %r10, -184(%rbp) ## 8-byte Spill
movq %r11, -176(%rbp) ## 8-byte Spill
movq %rsi, -88(%rbp) ## 8-byte Spill
movq %rdi, -160(%rbp) ## 8-byte Spill
movq %r12, -192(%rbp) ## 8-byte Spill
movq %rax, -208(%rbp) ## 8-byte Spill
jae LBB0_21
LBB0_53: ## in Loop: Header=BB0_52 Depth=2
movl $1, %r9d
LBB0_54: ## in Loop: Header=BB0_52 Depth=2
movq -136(%rbp), %rax ## 8-byte Reload
leaq (%rax,%r9,8), %rdx
movq -144(%rbp), %rax ## 8-byte Reload
leaq (%r9,%rax), %rcx
leaq (%rbx,%rcx,8), %r11
movq -152(%rbp), %rax ## 8-byte Reload
leaq (%r9,%rax), %rcx
leaq (%rbx,%rcx,8), %r10
movq -272(%rbp), %rax ## 8-byte Reload
leal (%r9,%rax), %r12d
shlq $32, %r12
movq -264(%rbp), %r13 ## 8-byte Reload
subq %r9, %r13
movq -112(%rbp), %rax ## 8-byte Reload
leal (%r15,%rax), %esi
movq -120(%rbp), %rax ## 8-byte Reload
leal (%r15,%rax), %edi
addl -124(%rbp), %r15d ## 4-byte Folded Reload
xorl %ecx, %ecx
.p2align 4, 0x90
LBB0_55: ## Parent Loop BB0_19 Depth=1
## Parent Loop BB0_52 Depth=2
## => This Inner Loop Header: Depth=3
leal (%r15,%rcx), %eax
cltq
vmovsd (%rbx,%rax,8), %xmm0 ## xmm0 = mem[0],zero
movq %r12, %rax
sarq $29, %rax
vaddsd (%rbx,%rax), %xmm0, %xmm0
leal (%rdi,%rcx), %eax
cltq
vaddsd (%rbx,%rax,8), %xmm0, %xmm0
leal (%rsi,%rcx), %eax
cltq
vaddsd (%rbx,%rax,8), %xmm0, %xmm0
vaddsd (%r10,%rcx,8), %xmm0, %xmm0
vaddsd (%r11,%rcx,8), %xmm0, %xmm0
vmulsd -56(%rbp), %xmm0, %xmm0
vmovsd %xmm0, (%rdx,%rcx,8)
addq $1, %rcx
addq %r14, %r12
cmpq %rcx, %r13
jne LBB0_55
LBB0_56: ## in Loop: Header=BB0_52 Depth=2
movq -192(%rbp), %r12 ## 8-byte Reload
addq $1, %r12
movq -104(%rbp), %rax ## 8-byte Reload
addq -440(%rbp), %rax ## 8-byte Folded Reload
movq %rax, -104(%rbp) ## 8-byte Spill
movq -432(%rbp), %rcx ## 8-byte Reload
movq -88(%rbp), %rsi ## 8-byte Reload
addq %rcx, %rsi
movq -184(%rbp), %r10 ## 8-byte Reload
addq %rcx, %r10
movq -176(%rbp), %r11 ## 8-byte Reload
addq %rcx, %r11
movq -256(%rbp), %rax ## 8-byte Reload
addq %rax, %r8
movq -168(%rbp), %rdx ## 8-byte Reload
addq %rax, %rdx
movq -160(%rbp), %rdi ## 8-byte Reload
addq %rax, %rdi
addq %rcx, -136(%rbp) ## 8-byte Folded Spill
movq -200(%rbp), %rax ## 8-byte Reload
addq %rax, -144(%rbp) ## 8-byte Folded Spill
addq %rax, -152(%rbp) ## 8-byte Folded Spill
addq %rax, -272(%rbp) ## 8-byte Folded Spill
movq -96(%rbp), %r9 ## 8-byte Reload
movq -112(%rbp), %rax ## 8-byte Reload
addl %r9d, %eax
movq %rax, -112(%rbp) ## 8-byte Spill
movq -120(%rbp), %rax ## 8-byte Reload
addl %r9d, %eax
movq %rax, -120(%rbp) ## 8-byte Spill
addl %r9d, -124(%rbp) ## 4-byte Folded Spill
movq -208(%rbp), %rax ## 8-byte Reload
cmpq -264(%rbp), %rax ## 8-byte Folded Reload
jne LBB0_52
## %bb.57: ## in Loop: Header=BB0_19 Depth=1
movq -320(%rbp), %rcx ## 8-byte Reload
movq %rcx, %rax
cmpq -72(%rbp), %rcx ## 8-byte Folded Reload
jne LBB0_19
jmp LBB0_59
.p2align 4, 0x90
LBB0_58: ## in Loop: Header=BB0_19 Depth=1
movq %rax, %rcx
addq $1, %rcx
movq %rcx, %rax
cmpq -72(%rbp), %rcx ## 8-byte Folded Reload
jne LBB0_19
LBB0_59:
movq _var_false@GOTPCREL(%rip), %rax
cmpl $0, (%rax)
je LBB0_61
## %bb.60:
movq %rbx, %rdi
vzeroupper
callq _dummy
movq -80(%rbp), %rdi ## 8-byte Reload
callq _dummy
leaq -56(%rbp), %rdi
callq _dummy
LBB0_61:
xorl %eax, %eax
addq $408, %rsp ## imm = 0x198
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.cfi_endproc
## -- End function
.subsections_via_symbols