Merge pull request #27 from RRZE-HPC/imported_intel_models

Imported intel models
2026-01-07 03:30:06 +01:00 · 2019-04-30 08:28:06 +02:00
parent aea6f8f043 01b23e1b47
commit bd4a5622b2
16 changed files with 40700 additions and 428 deletions
--- a/osaca/init.py
+++ b/osaca/init.py
@@ -1,2 +1,2 @@
 name = "osaca"
-__version__ = '0.2.0'
+__version__ = '0.2.1'
--- a/osaca/data/CFL_data.csv
+++ b/osaca/data/CFL_data.csv
--- a/osaca/data/KBL_data.csv
+++ b/osaca/data/KBL_data.csv
--- a/osaca/data/bdw_data.csv
+++ b/osaca/data/bdw_data.csv
--- a/osaca/data/hsw_data.csv
+++ b/osaca/data/hsw_data.csv
--- a/osaca/data/ivb_data.csv
+++ b/osaca/data/ivb_data.csv
--- a/osaca/data/model_importer.py
+++ b/osaca/data/model_importer.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+from collections import defaultdict, OrderedDict
+import xml.etree.ElementTree as ET
+import re
+import sys
+import argparse
+from distutils.version import StrictVersion
+
+from osaca.param import Parameter, Register
+from osaca.eu_sched import Scheduler
+
+
+def normalize_reg_name(reg_name):
+    # strip spaces
+    reg_name = reg_name.strip()
+    # masks are denoted with curly brackets in uops.info
+    reg_name = re.sub(r'{K([0-7])}', r'K\1', reg_name)
+    reg_name = re.sub(r'ST\(([0-7])\)', r'ST\1', reg_name)
+    return reg_name
+
+
+def port_occupancy_from_tag_attributes(attrib, arch):
+    occupancy = defaultdict(int)
+    for k, v in attrib.items():
+        m = re.match('^port([0-9]+)', k)
+        if not m:
+            continue
+        ports = m.group(1)
+        # Ignore Port7 on HSW, BDW, SKL and SKX if present in combination with ports 2 and 3.
+        # Port7 is only used for simple address generation, while 2 and 3 handle all addressing,
+        # but uops.info does not differentiate.
+        if arch in ['HSW', 'BDW', 'SKL', 'SKX'] and ports == '237':
+            ports = ports.replace('7', '')
+        potential_ports = list(ports)
+        per_port_occupancy = int(v) / len(potential_ports)
+        for pp in potential_ports:
+            occupancy[pp] += per_port_occupancy
+
+    # Also consider DIV pipeline
+    if 'div_cycles' in attrib:
+        occupancy['0DV'] = int(attrib['div_cycles'])
+
+    return dict(occupancy)
+
+
+def extract_paramters(instruction_tag):
+    # Extract parameter components
+    parameters = []  # used to store string representations
+    parameter_tags = sorted(instruction_tag.findall("operand"),
+                            key=lambda p: int(p.attrib['idx']))
+    for parameter_tag in parameter_tags:
+        # Ignore parameters with suppressed=1
+        if int(parameter_tag.attrib.get('suppressed', '0')):
+            continue
+
+        p_type = parameter_tag.attrib['type']
+        if p_type == 'imm':
+            parameters.append('imd')  # Parameter('IMD')
+        elif p_type == 'mem':
+            parameters.append('mem')  # Parameter('MEM')
+        elif p_type == 'reg':
+            possible_regs = [normalize_reg_name(r)
+                             for r in parameter_tag.text.split(',')]
+            reg_groups = [Register.sizes.get(r, None) for r in possible_regs]
+            if reg_groups[1:] == reg_groups[:-1]:
+                if reg_groups[0] is None:
+                    raise ValueError("Unknown register type for {} with {}.".format(
+                        parameter_tag.attrib, parameter_tag.text))
+                elif reg_groups[0][1] == 'GPR':
+                    parameters.append('r{}'.format(reg_groups[0][0]))
+                    # Register(possible_regs[0]))
+                elif '{' in parameter_tag.text:
+                    # We have a mask
+                    parameters[-1] += '{opmask}'
+                else:
+                    parameters.append(reg_groups[0][1].lower())
+        elif p_type == 'relbr':
+            parameters.append('LBL')
+        elif p_type == 'agen':
+            parameters.append('mem')
+        else:
+            raise ValueError("Unknown paramter type {}".format(parameter_tag.attrib))
+    return parameters
+
+
+def extract_model(tree, arch):
+    model_data = []
+    for instruction_tag in tree.findall('//instruction'):
+        ignore = False
+
+        mnemonic = instruction_tag.attrib['asm']
+
+        # Extract parameter components
+        try:
+            parameters = extract_paramters(instruction_tag)
+        except ValueError as e:
+            print(e, file=sys.stderr)
+
+        # Extract port occupation, throughput and latency
+        port_occupancy, throughput, latency = [], 0.0, None
+        arch_tag = instruction_tag.find('architecture[@name="'+arch+'"]')
+        if arch_tag is None:
+            continue
+        # We collect all measurement and IACA information and compare them later
+        for measurement_tag in arch_tag.iter('measurement'):
+            port_occupancy.append(port_occupancy_from_tag_attributes(measurement_tag.attrib, arch))
+            # FIXME handle min/max Latencies ('maxCycles' and 'minCycles')
+            latencies = [int(l_tag.attrib['cycles'])
+                         for l_tag in measurement_tag.iter('latency') if 'latency' in l_tag.attrib]
+
+            if latencies[1:] != latencies[:-1]:
+                print("Contradicting latencies found:", mnemonic, file=sys.stderr)
+                ignore = True
+            elif latencies:
+                latency = latencies[0]
+        # Ordered by IACA version (newest last)
+        for iaca_tag in sorted(arch_tag.iter('IACA'),
+                               key=lambda i: StrictVersion(i.attrib['version'])):
+            port_occupancy.append(port_occupancy_from_tag_attributes(iaca_tag.attrib, arch))
+        if ignore: continue
+
+        # Check if all are equal
+        if port_occupancy:
+            if port_occupancy[1:] != port_occupancy[:-1]:
+                print("Contradicting port occupancies, using latest IACA:", mnemonic,
+                      file=sys.stderr)
+            port_occupancy = port_occupancy[-1]
+            throughput = max(list(port_occupancy.values())+[0.0])
+        else:
+            # print("No data available for this architecture:", mnemonic, file=sys.stderr)
+            continue
+
+        for m, p in build_variants(mnemonic, parameters):
+            model_data.append((m.lower() + '-' + '_'.join(p),
+                              throughput, latency, port_occupancy))
+
+    return model_data
+
+
+def all_or_false(iterator):
+    if not iterator:
+        return False
+    else:
+        return all(iterator)
+
+
+def build_variants(mnemonic, parameters):
+    """Yield all resonable variants of this instruction form."""
+    # The one that was given
+    mnemonic = mnemonic.upper()
+    yield mnemonic, parameters
+
+    # Without opmask
+    if any(['{opmask}' in p for p in parameters]):
+        yield mnemonic, list([p.replace('{opmask}', '') for p in parameters])
+
+    # With suffix (assuming suffix was not already present)
+    suffixes = {'Q': 'r64',
+                'L': 'r32',
+                'W': 'r16',
+                'B': 'r8'}
+    for s, reg in suffixes.items():
+        if not mnemonic.endswith(s) and all_or_false(
+                [p == reg for p in parameters if p not in ['mem', 'imd']]):
+            yield mnemonic+s, parameters
+
+
+def architectures(tree):
+    return set([a.attrib['name'] for a in tree.findall('.//architecture')])
+
+
+def int_or_zero(s):
+    try:
+        return int(s)
+    except ValueError:
+        return 0
+
+
+def dump_csv(model_data, arch):
+    csv = 'instr,TP,LT,ports\n'
+    ports = set()
+    for mnemonic, throughput, latency, port_occupancy in model_data:
+        for p in port_occupancy:
+            ports.add(p)
+    ports = sorted(ports)
+    # If not all ports have been used (happens with port7 due to blacklist
+    # port_occupancy_from_tag_attributes), extend list accordingly:
+    while len(ports) < Scheduler.arch_dict[arch] + len(Scheduler.arch_pipeline_ports.get(arch, [])):
+        max_index = ports.index(str(max(map(int_or_zero, ports))))
+        ports.insert(max_index + 1, str(max(map(int_or_zero, ports)) + 1))
+
+    for mnemonic, throughput, latency, port_occupancy in model_data:
+        for p in ports:
+            if p not in port_occupancy:
+                port_occupancy[p] = 0.0
+        po_items = sorted(port_occupancy.items())
+        csv_line = '{},{},{},"({})"\n'.format(mnemonic, throughput, latency,
+                                              ','.join([str(c) for p, c in po_items]))
+        csv += csv_line
+    return csv
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('xml', help='path of instructions.xml from http://uops.info')
+    parser.add_argument('arch', nargs='?',
+                        help='architecture to extract, use IACA abbreviations (e.g., SNB). '
+                             'if not given, all will be extracted and saved to file in CWD.')
+    args = parser.parse_args()
+
+    tree = ET.parse(args.xml)
+    if args.arch:
+        model_data = extract_model(tree, args.arch)
+        print(dump_csv(model_data, args.arch))
+    else:
+        for arch in architectures(tree):
+            model_data = extract_model(tree, arch)
+            with open('{}_data.csv'.format(arch), 'w') as f:
+                f.write(dump_csv(model_data, arch))
+
+
+if __name__ == '__main__':
+    main()
--- a/osaca/data/nhm_data.csv
+++ b/osaca/data/nhm_data.csv
--- a/osaca/data/skl_data.csv
+++ b/osaca/data/skl_data.csv
--- a/osaca/data/skx_data.csv
+++ b/osaca/data/skx_data.csv
--- a/osaca/data/snb_data.csv
+++ b/osaca/data/snb_data.csv
--- a/osaca/data/wsm_data.csv
+++ b/osaca/data/wsm_data.csv
--- a/osaca/eu_sched.py
+++ b/osaca/eu_sched.py
@@ -12,8 +12,23 @@ from osaca.param import Register, MemAddr


 class Scheduler(object):
-    arch_dict = {'SNB': 6, 'IVB': 6, 'HSW': 8, 'BDW': 8, 'SKL': 8, 'ZEN': 10}
-    dv_ports_dict = {'SKL': [0], 'ZEN': [3]}
+    arch_dict = {
+        # Intel
+        'NHM': 5, 'WSM': 5,  # Nehalem, Westmere
+        'SNB': 6, 'IVB': 6,  # Sandy Bridge, Ivy Bridge
+        'HSW': 8, 'BDW': 8,  # Haswell, Broadwell
+        'SKL': 8, 'SKX': 8,  # Skylake(-X)
+        'KBL': 8, 'CFL': 8,  # Kaby Lake, Coffee Lake
+        # AMD
+        'ZEN': 10,  # Zen/Ryzen/EPYC
+    }
+    arch_pipeline_ports = {
+        'NHM': ['0DV'], 'WSM': ['0DV'],
+        'SNB': ['0DV'], 'IVB': ['0DV'],
+        'HSW': ['0DV'], 'BDW': ['0DV'],
+        'SKL': ['0DV'], 'SKX': ['0DV'],
+        'KBL': ['0DV'], 'CFL': ['0DV'],
+        'ZEN': ['3DV'],}
    # content of most inner list in instrList: instr, operand(s), instr form
    df = None  # type: DataFrame
    # for parallel ld/st in archs with 1 st/cy and >1 ld/cy, able to do 1 st and 1 ld in 1cy
@@ -33,7 +48,7 @@ class Scheduler(object):
            self.en_par_ldst = True
            self.ld_ports = [9, 10]
        # check for DV port
-        self.dv_ports = self.dv_ports_dict.get(arch, [])
+        self.pipeline_ports = self.arch_pipeline_ports.get(arch, [])
        self.instrList = instruction_list
        # curr_dir = os.path.realpath(__file__)[:-11]
        osaca_dir = os.path.expanduser('~/.osaca/')
@@ -60,8 +75,8 @@ class Scheduler(object):
        sched = self.get_head()
        # Initialize ports
        # Add DV port, if it is existing
-        occ_ports = [[0] * (self.ports + len(self.dv_ports)) for x in range(len(self.instrList))]
-        port_bndgs = [0] * (self.ports + len(self.dv_ports))
+        occ_ports = [[0] * (self.ports + len(self.pipeline_ports)) for x in range(len(self.instrList))]
+        port_bndgs = [0] * (self.ports + len(self.pipeline_ports))
        # Store instruction counter for parallel ld/st
        par_ldst = 0
        # Count the number of store instr if we schedule for an architecture with par ld/st
@@ -74,8 +89,8 @@ class Scheduler(object):
        # Check if there's a port occupation stored in the CSV, otherwise leave the
        # occ_port list item empty
        for i, instrForm in enumerate(self.instrList):
+            search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
            try:
-                search_string = instrForm[0] + '-' + self.get_operand_suffix(instrForm)
                entry = self.df.loc[lambda df, sStr=search_string: df.instr == sStr]
                tup = entry.ports.values[0]
                if len(tup) == 1 and tup[0] == -1:
@@ -92,13 +107,14 @@ class Scheduler(object):
            p_flg = ''
            if self.en_par_ldst:
                # Check for ld
+                # FIXME remove special load handling from here and place in machine model
                if (isinstance(instrForm[-2], MemAddr) or
                        (len(instrForm) > 4 and isinstance(instrForm[2], MemAddr))):
                    if par_ldst > 0:
                        par_ldst -= 1
                        p_flg = 'P '
                        for port in self.ld_ports:
-                            occ_ports[i][port] = '(' + str(occ_ports[i][port]) + ')'
+                            occ_ports[i][port] = 0.0  # '(' + str(occ_ports[i][port]) + ')'
            # Write schedule line
            if len(p_flg) > 0:
                sched += self.format_port_occupation_line(occ_ports[i], p_flg + instrForm[-1])
@@ -361,14 +377,7 @@ class Scheduler(object):

        :return: list of strings
        """
-        port_names = []
-        dv_ports_appended = 0
-        for i in range(self.ports):
-            port_names.append(str(i))
-            if i in self.dv_ports:
-                dv_ports_appended += 1
-                port_names.append(str(i)+'DV')
-        return port_names
+        return sorted([str(i) for i in range(self.ports)] + self.pipeline_ports)

    def get_port_binding(self, port_bndg):
        """
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -364,7 +364,7 @@ class OSACA(object):
    longestInstr = 30
    machine_readable = False

-    VALID_ARCHS = ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'ZEN']
+    VALID_ARCHS = Scheduler.arch_dict

    def __init__(self, arch, assembly, extract_with_markers=True):
        """
@@ -574,6 +574,15 @@ class OSACA(object):
            (port_name, port_binding[i])
            for i, port_name in enumerate(self.schedule.get_port_naming())])

+    def get_unmatched_instruction_ratio(self):
+        """
+        Calculate ratio of unmatched vs total instructions
+
+        :return: float
+        """
+        sched_output, port_binding = self.schedule.new_schedule()
+        return sched_output.count('| X ') / len(self.instr_forms)
+
    def get_total_throughput(self):
        """
        Return total cycles estimated per block execution. Including (potential) penalties.
--- a/tests/test_osaca.py
+++ b/tests/test_osaca.py
@@ -13,35 +13,57 @@ from osaca import osaca
 class TestOsaca(unittest.TestCase):
    maxDiff = None

+    def setUp(self):
+        self.curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
+
    @unittest.skip("Binary analysis is error prone and currently not working with FSF's objdump")
    def testIACABinary(self):
-        curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
-        assembly = osaca.get_assembly_from_binary(curr_dir + '/testfiles/taxCalc-ivb-iaca')
+        assembly = osaca.get_assembly_from_binary(self.curr_dir + '/testfiles/taxCalc-ivb-iaca')
        osa = osaca.OSACA('IVB', assembly)
        result = osa.generate_text_output()
        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(curr_dir + '/test_osaca_iaca.out', encoding='utf-8') as f:
+        with open(self.curr_dir + '/test_osaca_iaca.out', encoding='utf-8') as f:
            assertion = f.read()
        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))

    # Test ASM file with IACA marker in two lines
    def testIACAasm1(self):
-        curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
-        with open(curr_dir + '/testfiles/taxCalc-ivb-iaca.S') as f:
+        with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca.S') as f:
            osa = osaca.OSACA('IVB', f.read())
        result = osa.generate_text_output()
        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
+        with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
            assertion = f.read()
        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))

    # Test ASM file with IACA marker in four lines
    def testIACAasm2(self):
-        curr_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
-        with open(curr_dir + '/testfiles/taxCalc-ivb-iaca2.S') as f:
+        with open(self.curr_dir + '/testfiles/taxCalc-ivb-iaca2.S') as f:
            osa = osaca.OSACA('IVB', f.read())
        result = osa.generate_text_output()
        result = result[result.find('Port Binding in Cycles Per Iteration:'):]
-        with open(curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
+        with open(self.curr_dir + '/test_osaca_iaca_asm.out', encoding='utf-8') as f:
            assertion = f.read()
        self.assertEqual(assertion.replace(' ', ''), result.replace(' ', ''))
+
+    #@unittest.skip("Skip until required instructions are supported.")
+    def test_asm_API(self):
+        with open(self.curr_dir + '/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s') as f:
+            osa = osaca.OSACA('SKX', f.read())
+
+        text_output = osa.create_output()
+        print(text_output)
+        # Derived from IACA (and manually considering OSACAs equal distribution to ports)
+        self.assertEqual(dict(osa.get_port_occupation_cycles()),
+                         {'0': 4.0,
+                          '0DV': 0.0,
+                          '1': 3.5,
+                          '2': 3.5,
+                          '3': 3.5,
+                          '4': 1.0,
+                          '5': 4.5,
+                          '6': 3.5,
+                          '7': 0.0})
+        # TODO consider frontend bottleneck -> 6.25 cy
+        self.assertEqual(osa.get_total_throughput(),
+                         4.5)
--- a/tests/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s
+++ b/tests/testfiles/3d-7pt.icc.skx.avx512.iaca_marked.s
@@ -0,0 +1,653 @@
+	.section	__TEXT,__text,regular,pure_instructions
+	.macosx_version_min 10, 14
+	.globl	_main                   ## -- Begin function main
+	.p2align	4, 0x90
+_main:                                  ## @main
+	.cfi_startproc
+## %bb.0:
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	pushq	%r15
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	subq	$408, %rsp              ## imm = 0x198
+	.cfi_offset %rbx, -56
+	.cfi_offset %r12, -48
+	.cfi_offset %r13, -40
+	.cfi_offset %r14, -32
+	.cfi_offset %r15, -24
+	movq	%rsi, %rbx
+	movq	16(%rsi), %rdi
+	callq	_atoi
+	movl	%eax, %r14d
+	movq	24(%rbx), %rdi
+	callq	_atoi
+                                        ## kill: def $eax killed $eax def $rax
+	movq	%r14, -96(%rbp)         ## 8-byte Spill
+	movl	%r14d, %ecx
+	imull	%r14d, %ecx
+	movl	%ecx, -88(%rbp)         ## 4-byte Spill
+	movq	%rax, -72(%rbp)         ## 8-byte Spill
+	imull	%eax, %ecx
+	movslq	%ecx, %r13
+	shlq	$3, %r13
+	leaq	-56(%rbp), %rdi
+	movl	$32, %esi
+	movq	%r13, %rdx
+	callq	_posix_memalign
+	testl	%eax, %eax
+	je	LBB0_2
+## %bb.1:
+	movq	$0, -56(%rbp)
+	xorl	%ebx, %ebx
+	jmp	LBB0_3
+LBB0_2:
+	movq	-56(%rbp), %rbx
+LBB0_3:
+	leaq	-56(%rbp), %rdi
+	movl	$32, %esi
+	movq	%r13, %rdx
+	callq	_posix_memalign
+	testl	%eax, %eax
+	je	LBB0_5
+## %bb.4:
+	movq	$0, -56(%rbp)
+	xorl	%eax, %eax
+	jmp	LBB0_6
+LBB0_5:
+	movq	-56(%rbp), %rax
+LBB0_6:
+	movq	%rax, -80(%rbp)         ## 8-byte Spill
+	movq	-96(%rbp), %r9          ## 8-byte Reload
+	movabsq	$4602641980904887326, %rax ## imm = 0x3FDFDE7EEC22D41E
+	movq	%rax, -56(%rbp)
+	cmpl	$3, -72(%rbp)           ## 4-byte Folded Reload
+	jl	LBB0_15
+## %bb.7:
+	movabsq	$4294967296, %r12       ## imm = 0x100000000
+	leal	-1(%r9), %ecx
+	movslq	%r9d, %rax
+	movslq	-88(%rbp), %rdx         ## 4-byte Folded Reload
+	movq	%rdx, -160(%rbp)        ## 8-byte Spill
+	movq	-72(%rbp), %rsi         ## 8-byte Reload
+	leal	-1(%rsi), %edx
+	leaq	8(%rbx,%rax,8), %rsi
+	movq	%rsi, -152(%rbp)        ## 8-byte Spill
+	movq	-80(%rbp), %rsi         ## 8-byte Reload
+	leaq	8(%rsi,%rax,8), %rsi
+	movq	%rsi, -144(%rbp)        ## 8-byte Spill
+	leaq	(,%rax,8), %rsi
+	movq	%rsi, -104(%rbp)        ## 8-byte Spill
+	leaq	2(%rax), %rsi
+	movq	%rsi, -136(%rbp)        ## 8-byte Spill
+	shlq	$32, %rax
+	movq	%rax, -184(%rbp)        ## 8-byte Spill
+	addq	$-1, %rcx
+	movl	%r9d, %eax
+	movq	%rax, -176(%rbp)        ## 8-byte Spill
+	movl	$1, %eax
+	movabsq	$4601149042440805838, %rdi ## imm = 0x3FDA90AD19501DCE
+	movq	%rdx, -208(%rbp)        ## 8-byte Spill
+	.p2align	4, 0x90
+LBB0_8:                                 ## =>This Loop Header: Depth=1
+                                        ##     Child Loop BB0_10 Depth 2
+                                        ##       Child Loop BB0_11 Depth 3
+	cmpl	$2, %r9d
+	jle	LBB0_14
+## %bb.9:                               ##   in Loop: Header=BB0_8 Depth=1
+	movl	%eax, %r14d
+	imull	-88(%rbp), %r14d        ## 4-byte Folded Reload
+	leaq	1(%rax), %r8
+	movq	-160(%rbp), %rdx        ## 8-byte Reload
+	movq	%rdx, %rsi
+	movq	%r8, -168(%rbp)         ## 8-byte Spill
+	imulq	%r8, %rsi
+	movq	-152(%rbp), %r10        ## 8-byte Reload
+	leaq	(%r10,%rsi,8), %r8
+	leaq	-1(%rax), %rsi
+	imulq	%rdx, %rsi
+	leaq	(%r10,%rsi,8), %r10
+	movq	%rax, %rsi
+	imulq	%rdx, %rsi
+	movq	-144(%rbp), %rdx        ## 8-byte Reload
+	leaq	(%rdx,%rsi,8), %r11
+	addl	-136(%rbp), %esi        ## 4-byte Folded Reload
+	shlq	$32, %rsi
+	movl	%r9d, %r15d
+	imull	%eax, %r15d
+	leal	2(%r15), %r13d
+	imull	%r9d, %r13d
+	addl	$1, %r13d
+	addq	$1, %r14
+	addl	$1, %r15d
+	imull	%r9d, %r15d
+	movl	$1, %eax
+	.p2align	4, 0x90
+LBB0_10:                                ##   Parent Loop BB0_8 Depth=1
+                                        ## =>  This Loop Header: Depth=2
+                                        ##       Child Loop BB0_11 Depth 3
+	movq	%rax, -112(%rbp)        ## 8-byte Spill
+	leaq	1(%rax), %rax
+	movq	%rax, -192(%rbp)        ## 8-byte Spill
+	movq	%rsi, -120(%rbp)        ## 8-byte Spill
+	xorl	%edx, %edx
+	.p2align	4, 0x90
+LBB0_11:                                ##   Parent Loop BB0_8 Depth=1
+                                        ##     Parent Loop BB0_10 Depth=2
+                                        ## =>    This Inner Loop Header: Depth=3
+	movq	%rdi, (%r11,%rdx,8)
+	leal	(%r15,%rdx), %r9d
+	movslq	%r9d, %rax
+	movq	%rdi, (%rbx,%rax,8)
+	movq	%rsi, %rax
+	sarq	$29, %rax
+	movq	%rdi, (%rbx,%rax)
+	leal	(%r14,%rdx), %eax
+	cltq
+	movq	%rdi, (%rbx,%rax,8)
+	leal	(%r13,%rdx), %eax
+	cltq
+	movq	%rdi, (%rbx,%rax,8)
+	movq	%rdi, (%r10,%rdx,8)
+	movq	%rdi, (%r8,%rdx,8)
+	addq	$1, %rdx
+	addq	%r12, %rsi
+	cmpq	%rdx, %rcx
+	jne	LBB0_11
+## %bb.12:                              ##   in Loop: Header=BB0_10 Depth=2
+	movq	-104(%rbp), %rax        ## 8-byte Reload
+	addq	%rax, %r8
+	addq	%rax, %r10
+	addq	%rax, %r11
+	movq	-120(%rbp), %rsi        ## 8-byte Reload
+	addq	-184(%rbp), %rsi        ## 8-byte Folded Reload
+	movq	-176(%rbp), %rax        ## 8-byte Reload
+	addq	%rax, %r13
+	addq	%rax, %r14
+	addq	%rax, %r15
+	cmpq	%rdx, -112(%rbp)        ## 8-byte Folded Reload
+	movq	-192(%rbp), %rax        ## 8-byte Reload
+	jne	LBB0_10
+## %bb.13:                              ##   in Loop: Header=BB0_8 Depth=1
+	movq	-168(%rbp), %rsi        ## 8-byte Reload
+	movq	%rsi, %rax
+	movq	-96(%rbp), %r9          ## 8-byte Reload
+	movq	-208(%rbp), %rdx        ## 8-byte Reload
+	cmpq	%rdx, %rsi
+	jne	LBB0_8
+	jmp	LBB0_15
+	.p2align	4, 0x90
+LBB0_14:                                ##   in Loop: Header=BB0_8 Depth=1
+	addq	$1, %rax
+	movq	%rax, %rsi
+	cmpq	%rdx, %rsi
+	jne	LBB0_8
+LBB0_15:
+	movq	_var_false@GOTPCREL(%rip), %rax
+	cmpl	$0, (%rax)
+	je	LBB0_17
+## %bb.16:
+	movq	%rbx, %rdi
+	callq	_dummy
+	movq	-80(%rbp), %rdi         ## 8-byte Reload
+	callq	_dummy
+	leaq	-56(%rbp), %rdi
+	callq	_dummy
+	movq	-96(%rbp), %r9          ## 8-byte Reload
+LBB0_17:
+	cmpl	$3, -72(%rbp)           ## 4-byte Folded Reload
+	jl	LBB0_59
+## %bb.18:
+	movabsq	$4294967296, %r14       ## imm = 0x100000000
+	leal	-1(%r9), %ecx
+	movslq	%r9d, %rsi
+	movslq	-88(%rbp), %rax         ## 4-byte Folded Reload
+	movq	%rax, -312(%rbp)        ## 8-byte Spill
+	movq	-72(%rbp), %rax         ## 8-byte Reload
+	addl	$-1, %eax
+	movq	%rax, -72(%rbp)         ## 8-byte Spill
+	leaq	-1(%rcx), %rax
+	leaq	-2(%rcx), %rdi
+	movq	%rdi, -424(%rbp)        ## 8-byte Spill
+	leaq	1(%rsi), %rdi
+	movq	%rdi, -224(%rbp)        ## 8-byte Spill
+	leaq	(%rsi,%rcx), %rdi
+	movq	%rdi, -304(%rbp)        ## 8-byte Spill
+	movl	%r9d, %edi
+	movq	%rdi, -256(%rbp)        ## 8-byte Spill
+	movq	%rcx, -264(%rbp)        ## 8-byte Spill
+	leaq	(%rbx,%rcx,8), %rcx
+	addq	$-8, %rcx
+	movq	%rcx, -352(%rbp)        ## 8-byte Spill
+	leal	6(%r9), %ecx
+	andl	$7, %ecx
+	movq	%rax, -448(%rbp)        ## 8-byte Spill
+	movq	%rcx, -344(%rbp)        ## 8-byte Spill
+	subq	%rcx, %rax
+	movq	%rsi, %rcx
+	shlq	$32, %rcx
+	movq	%rcx, -440(%rbp)        ## 8-byte Spill
+	leaq	1(%rax), %rcx
+	movq	%rcx, -328(%rbp)        ## 8-byte Spill
+	movq	%rax, -336(%rbp)        ## 8-byte Spill
+	leal	1(%rax), %eax
+	movl	%eax, -212(%rbp)        ## 4-byte Spill
+	leaq	2(%rsi), %rax
+	movq	%rax, -296(%rbp)        ## 8-byte Spill
+	movq	-80(%rbp), %rax         ## 8-byte Reload
+	leaq	8(%rax,%rsi,8), %rax
+	movq	%rax, -288(%rbp)        ## 8-byte Spill
+	leaq	(,%rsi,8), %rax
+	movq	%rax, -432(%rbp)        ## 8-byte Spill
+	movq	%rsi, -200(%rbp)        ## 8-byte Spill
+	leaq	(%rbx,%rsi,8), %rax
+	addq	$8, %rax
+	movq	%rax, -280(%rbp)        ## 8-byte Spill
+	movl	$1, %eax
+	.p2align	4, 0x90
+LBB0_19:                                ## =>This Loop Header: Depth=1
+                                        ##     Child Loop BB0_52 Depth 2
+                                        ##       Child Loop BB0_37 Depth 3
+                                        ##       Child Loop BB0_55 Depth 3
+	cmpl	$2, %r9d
+	jle	LBB0_58
+## %bb.20:                              ##   in Loop: Header=BB0_19 Depth=1
+	movq	%rax, %rcx
+	movq	%rax, %r12
+	movq	-312(%rbp), %r15        ## 8-byte Reload
+	imulq	%r15, %r12
+	leaq	1(%rax), %rax
+	movl	%r9d, %edi
+	imull	%ecx, %edi
+	leal	1(%rdi), %r8d
+	imull	%r9d, %r8d
+	addl	$2, %edi
+	imull	%r9d, %edi
+	movq	%rax, -320(%rbp)        ## 8-byte Spill
+	movq	%rax, %r13
+	imulq	%r15, %r13
+	movq	-224(%rbp), %rdx        ## 8-byte Reload
+	leaq	(%rdx,%r13), %rax
+	movq	%rax, -408(%rbp)        ## 8-byte Spill
+	movq	-304(%rbp), %rsi        ## 8-byte Reload
+	leaq	(%rsi,%r13), %rax
+	movq	%rax, -400(%rbp)        ## 8-byte Spill
+	addq	$-1, %rcx
+	imulq	%r15, %rcx
+	leaq	(%rdx,%rcx), %rax
+	movq	%rax, -392(%rbp)        ## 8-byte Spill
+	leaq	(%rsi,%rcx), %rax
+	movq	%rax, -384(%rbp)        ## 8-byte Spill
+	movq	-296(%rbp), %rax        ## 8-byte Reload
+	leal	(%rax,%r12), %eax
+	shlq	$32, %rax
+	movq	%rax, -104(%rbp)        ## 8-byte Spill
+	movq	-280(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%r13,8), %r10
+	leaq	(%rax,%rcx,8), %r11
+	movl	%r12d, %edx
+	addq	$1, %rdx
+	movq	-200(%rbp), %rax        ## 8-byte Reload
+	addq	%rax, %r13
+	movq	%r13, -144(%rbp)        ## 8-byte Spill
+	addq	%rax, %rcx
+	movq	%rcx, -152(%rbp)        ## 8-byte Spill
+	leal	2(%r8), %eax
+	movq	%rax, -240(%rbp)        ## 8-byte Spill
+	leal	1(%r12), %eax
+	movq	%rax, -416(%rbp)        ## 8-byte Spill
+	movq	%rdi, %rax
+	movq	%rdi, -112(%rbp)        ## 8-byte Spill
+	leal	1(%rdi), %r15d
+	movq	-224(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%r12), %rcx
+	leaq	(%rsi,%r12), %rax
+	movq	%rax, -368(%rbp)        ## 8-byte Spill
+	movq	-288(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%r12,8), %rsi
+	leaq	-8(%rax,%r12,8), %rax
+	movq	%rax, -136(%rbp)        ## 8-byte Spill
+	movq	%r12, -120(%rbp)        ## 8-byte Spill
+	leaq	1(%r12), %rax
+	movq	%rax, -360(%rbp)        ## 8-byte Spill
+	leal	-1(%r8), %eax
+	movl	%eax, -124(%rbp)        ## 4-byte Spill
+	movq	%rcx, -376(%rbp)        ## 8-byte Spill
+	movq	%rcx, -272(%rbp)        ## 8-byte Spill
+	movq	%r8, -248(%rbp)         ## 8-byte Spill
+	movq	%r8, %rdi
+	movq	%r15, -232(%rbp)        ## 8-byte Spill
+	movq	%r15, %r8
+	xorl	%r12d, %r12d
+	movl	$1, %eax
+	jmp	LBB0_52
+	.p2align	4, 0x90
+LBB0_21:                                ##   in Loop: Header=BB0_52 Depth=2
+	movl	%r9d, %edx
+	imull	%r12d, %edx
+	movq	-248(%rbp), %rax        ## 8-byte Reload
+	leal	(%rax,%rdx), %ecx
+	movq	-424(%rbp), %rax        ## 8-byte Reload
+	leal	(%rcx,%rax), %esi
+	cmpl	%ecx, %esi
+	jl	LBB0_53
+## %bb.22:                              ##   in Loop: Header=BB0_52 Depth=2
+	movq	%rax, %rcx
+	shrq	$32, %rcx
+	jne	LBB0_53
+## %bb.23:                              ##   in Loop: Header=BB0_52 Depth=2
+	movq	-240(%rbp), %rsi        ## 8-byte Reload
+	leal	(%rsi,%rdx), %esi
+	leal	(%rsi,%rax), %edi
+	cmpl	%esi, %edi
+	jl	LBB0_53
+## %bb.24:                              ##   in Loop: Header=BB0_52 Depth=2
+	testq	%rcx, %rcx
+	jne	LBB0_53
+## %bb.25:                              ##   in Loop: Header=BB0_52 Depth=2
+	movq	-416(%rbp), %rsi        ## 8-byte Reload
+	leal	(%rsi,%rdx), %esi
+	leal	(%rsi,%rax), %edi
+	cmpl	%esi, %edi
+	jl	LBB0_53
+## %bb.26:                              ##   in Loop: Header=BB0_52 Depth=2
+	testq	%rcx, %rcx
+	jne	LBB0_53
+## %bb.27:                              ##   in Loop: Header=BB0_52 Depth=2
+	addl	-232(%rbp), %edx        ## 4-byte Folded Reload
+	leal	(%rdx,%rax), %esi
+	cmpl	%edx, %esi
+	jl	LBB0_53
+## %bb.28:                              ##   in Loop: Header=BB0_52 Depth=2
+	testq	%rcx, %rcx
+	jne	LBB0_53
+## %bb.29:                              ##   in Loop: Header=BB0_52 Depth=2
+	movq	-192(%rbp), %rdx        ## 8-byte Reload
+	movq	%rdx, %rsi
+	imulq	-200(%rbp), %rsi        ## 8-byte Folded Reload
+	movq	-376(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi), %rdi
+	movq	-368(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi), %r13
+	movq	-408(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi), %r11
+	movq	-400(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi), %rcx
+	movq	-392(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi), %r10
+	addq	-384(%rbp), %rsi        ## 8-byte Folded Reload
+                                        ## kill: def $edx killed $edx killed $rdx def $rdx
+	imull	-256(%rbp), %edx        ## 4-byte Folded Reload
+	movq	-232(%rbp), %rax        ## 8-byte Reload
+	leal	(%rax,%rdx), %r12d
+	movq	-360(%rbp), %rax        ## 8-byte Reload
+	leal	(%rax,%rdx), %r9d
+	movq	-240(%rbp), %rax        ## 8-byte Reload
+	leal	(%rax,%rdx), %eax
+	movl	%eax, -60(%rbp)         ## 4-byte Spill
+	addl	-248(%rbp), %edx        ## 4-byte Folded Reload
+	movq	-80(%rbp), %rax         ## 8-byte Reload
+	leaq	(%rax,%rdi,8), %rdi
+	leaq	(%rbx,%rcx,8), %rcx
+	cmpq	%rcx, %rdi
+	leaq	(%rax,%r13,8), %rcx
+	leaq	(%rbx,%r11,8), %r11
+	setb	-45(%rbp)               ## 1-byte Folded Spill
+	cmpq	%rcx, %r11
+	leaq	(%rbx,%r10,8), %r10
+	leaq	(%rbx,%rsi,8), %r11
+	movslq	%r12d, %rsi
+	setb	-44(%rbp)               ## 1-byte Folded Spill
+	cmpq	%r11, %rdi
+	setb	%r12b
+	cmpq	%rcx, %r10
+	leaq	(%rbx,%rsi,8), %r10
+	movq	-352(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%rsi,8), %rsi
+	movslq	%r9d, %r9
+	setb	-43(%rbp)               ## 1-byte Folded Spill
+	cmpq	%rsi, %rdi
+	setb	%r11b
+	cmpq	%rcx, %r10
+	leaq	(%rbx,%r9,8), %r10
+	leaq	(%rax,%r9,8), %rsi
+	movslq	-60(%rbp), %r9          ## 4-byte Folded Reload
+	setb	-60(%rbp)               ## 1-byte Folded Spill
+	cmpq	%rsi, %rdi
+	setb	%r13b
+	cmpq	%rcx, %r10
+	leaq	(%rbx,%r9,8), %r10
+	leaq	(%rax,%r9,8), %rsi
+	movslq	%edx, %rdx
+	setb	-42(%rbp)               ## 1-byte Folded Spill
+	cmpq	%rsi, %rdi
+	setb	%r9b
+	cmpq	%rcx, %r10
+	leaq	(%rax,%rdx,8), %rsi
+	setb	-41(%rbp)               ## 1-byte Folded Spill
+	cmpq	%rsi, %rdi
+	leaq	(%rbx,%rdx,8), %rdx
+	setb	%r10b
+	cmpq	%rcx, %rdx
+	setb	%dl
+	leaq	-55(%rbp), %rax
+	cmpq	%rdi, %rax
+	seta	%dil
+	leaq	-56(%rbp), %rax
+	cmpq	%rcx, %rax
+	setb	%al
+	movb	-44(%rbp), %cl          ## 1-byte Reload
+	testb	%cl, -45(%rbp)          ## 1-byte Folded Reload
+	jne	LBB0_53
+## %bb.30:                              ##   in Loop: Header=BB0_52 Depth=2
+	andb	-43(%rbp), %r12b        ## 1-byte Folded Reload
+	jne	LBB0_53
+## %bb.31:                              ##   in Loop: Header=BB0_52 Depth=2
+	andb	-60(%rbp), %r11b        ## 1-byte Folded Reload
+	jne	LBB0_53
+## %bb.32:                              ##   in Loop: Header=BB0_52 Depth=2
+	andb	-42(%rbp), %r13b        ## 1-byte Folded Reload
+	jne	LBB0_53
+## %bb.33:                              ##   in Loop: Header=BB0_52 Depth=2
+	andb	-41(%rbp), %r9b         ## 1-byte Folded Reload
+	jne	LBB0_53
+## %bb.34:                              ##   in Loop: Header=BB0_52 Depth=2
+	movl	$1, %r9d
+	andb	%dl, %r10b
+	jne	LBB0_54
+## %bb.35:                              ##   in Loop: Header=BB0_52 Depth=2
+	andb	%al, %dil
+	jne	LBB0_54
+## %bb.36:                              ##   in Loop: Header=BB0_52 Depth=2
+	vbroadcastsd	-56(%rbp), %zmm0
+	movq	-104(%rbp), %rdx        ## 8-byte Reload
+	xorl	%esi, %esi
+	movq	-336(%rbp), %r9         ## 8-byte Reload
+	movabsq	$34359738368, %rdi      ## imm = 0x800000000
+	movq	%rdi, %r10
+	movq	-184(%rbp), %r11        ## 8-byte Reload
+	movq	-176(%rbp), %r15        ## 8-byte Reload
+	movq	-168(%rbp), %r12        ## 8-byte Reload
+	movq	-88(%rbp), %rdi         ## 8-byte Reload
+	movq	-160(%rbp), %rax        ## 8-byte Reload
+	.p2align	4, 0x90
+        movl      $111, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+LBB0_37:                                ##   Parent Loop BB0_19 Depth=1
+                                        ##     Parent Loop BB0_52 Depth=2
+                                        ## =>    This Inner Loop Header: Depth=3
+	leal	(%rax,%rsi), %ecx
+	movslq	%ecx, %rcx
+	vmovupd	(%rbx,%rcx,8), %zmm1
+	movq	%rdx, %rcx
+	sarq	$29, %rcx
+	vaddpd	(%rbx,%rcx), %zmm1, %zmm1
+	leal	(%r12,%rsi), %ecx
+	movslq	%ecx, %rcx
+	vaddpd	(%rbx,%rcx,8), %zmm1, %zmm1
+	leal	(%r8,%rsi), %ecx
+	movslq	%ecx, %rcx
+	vaddpd	(%rbx,%rcx,8), %zmm1, %zmm1
+	vaddpd	(%r15,%rsi,8), %zmm1, %zmm1
+	vaddpd	(%r11,%rsi,8), %zmm1, %zmm1
+	vmulpd	%zmm0, %zmm1, %zmm1
+	vmovupd	%zmm1, (%rdi,%rsi,8)
+	addq	$8, %rsi
+	addq	%r10, %rdx
+	cmpq	%rsi, %r9
+	jne	LBB0_37
+        movl      $222, %ebx # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     100        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     103        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+        .byte     144        # INSERTED BY KERNCRAFT IACA MARKER UTILITY
+## %bb.38:                              ##   in Loop: Header=BB0_52 Depth=2
+	movq	-328(%rbp), %r9         ## 8-byte Reload
+	movl	-212(%rbp), %eax        ## 4-byte Reload
+	movl	%eax, %r15d
+	cmpl	$0, -344(%rbp)          ## 4-byte Folded Reload
+	jne	LBB0_54
+	jmp	LBB0_56
+	.p2align	4, 0x90
+LBB0_52:                                ##   Parent Loop BB0_19 Depth=1
+                                        ## =>  This Loop Header: Depth=2
+                                        ##       Child Loop BB0_37 Depth 3
+                                        ##       Child Loop BB0_55 Depth 3
+	movq	%rdx, -168(%rbp)        ## 8-byte Spill
+	addq	$1, %rax
+	movl	$1, %r15d
+	cmpq	$8, -448(%rbp)          ## 8-byte Folded Reload
+	movq	%r10, -184(%rbp)        ## 8-byte Spill
+	movq	%r11, -176(%rbp)        ## 8-byte Spill
+	movq	%rsi, -88(%rbp)         ## 8-byte Spill
+	movq	%rdi, -160(%rbp)        ## 8-byte Spill
+	movq	%r12, -192(%rbp)        ## 8-byte Spill
+	movq	%rax, -208(%rbp)        ## 8-byte Spill
+	jae	LBB0_21
+LBB0_53:                                ##   in Loop: Header=BB0_52 Depth=2
+	movl	$1, %r9d
+LBB0_54:                                ##   in Loop: Header=BB0_52 Depth=2
+	movq	-136(%rbp), %rax        ## 8-byte Reload
+	leaq	(%rax,%r9,8), %rdx
+	movq	-144(%rbp), %rax        ## 8-byte Reload
+	leaq	(%r9,%rax), %rcx
+	leaq	(%rbx,%rcx,8), %r11
+	movq	-152(%rbp), %rax        ## 8-byte Reload
+	leaq	(%r9,%rax), %rcx
+	leaq	(%rbx,%rcx,8), %r10
+	movq	-272(%rbp), %rax        ## 8-byte Reload
+	leal	(%r9,%rax), %r12d
+	shlq	$32, %r12
+	movq	-264(%rbp), %r13        ## 8-byte Reload
+	subq	%r9, %r13
+	movq	-112(%rbp), %rax        ## 8-byte Reload
+	leal	(%r15,%rax), %esi
+	movq	-120(%rbp), %rax        ## 8-byte Reload
+	leal	(%r15,%rax), %edi
+	addl	-124(%rbp), %r15d       ## 4-byte Folded Reload
+	xorl	%ecx, %ecx
+	.p2align	4, 0x90
+LBB0_55:                                ##   Parent Loop BB0_19 Depth=1
+                                        ##     Parent Loop BB0_52 Depth=2
+                                        ## =>    This Inner Loop Header: Depth=3
+	leal	(%r15,%rcx), %eax
+	cltq
+	vmovsd	(%rbx,%rax,8), %xmm0    ## xmm0 = mem[0],zero
+	movq	%r12, %rax
+	sarq	$29, %rax
+	vaddsd	(%rbx,%rax), %xmm0, %xmm0
+	leal	(%rdi,%rcx), %eax
+	cltq
+	vaddsd	(%rbx,%rax,8), %xmm0, %xmm0
+	leal	(%rsi,%rcx), %eax
+	cltq
+	vaddsd	(%rbx,%rax,8), %xmm0, %xmm0
+	vaddsd	(%r10,%rcx,8), %xmm0, %xmm0
+	vaddsd	(%r11,%rcx,8), %xmm0, %xmm0
+	vmulsd	-56(%rbp), %xmm0, %xmm0
+	vmovsd	%xmm0, (%rdx,%rcx,8)
+	addq	$1, %rcx
+	addq	%r14, %r12
+	cmpq	%rcx, %r13
+	jne	LBB0_55
+LBB0_56:                                ##   in Loop: Header=BB0_52 Depth=2
+	movq	-192(%rbp), %r12        ## 8-byte Reload
+	addq	$1, %r12
+	movq	-104(%rbp), %rax        ## 8-byte Reload
+	addq	-440(%rbp), %rax        ## 8-byte Folded Reload
+	movq	%rax, -104(%rbp)        ## 8-byte Spill
+	movq	-432(%rbp), %rcx        ## 8-byte Reload
+	movq	-88(%rbp), %rsi         ## 8-byte Reload
+	addq	%rcx, %rsi
+	movq	-184(%rbp), %r10        ## 8-byte Reload
+	addq	%rcx, %r10
+	movq	-176(%rbp), %r11        ## 8-byte Reload
+	addq	%rcx, %r11
+	movq	-256(%rbp), %rax        ## 8-byte Reload
+	addq	%rax, %r8
+	movq	-168(%rbp), %rdx        ## 8-byte Reload
+	addq	%rax, %rdx
+	movq	-160(%rbp), %rdi        ## 8-byte Reload
+	addq	%rax, %rdi
+	addq	%rcx, -136(%rbp)        ## 8-byte Folded Spill
+	movq	-200(%rbp), %rax        ## 8-byte Reload
+	addq	%rax, -144(%rbp)        ## 8-byte Folded Spill
+	addq	%rax, -152(%rbp)        ## 8-byte Folded Spill
+	addq	%rax, -272(%rbp)        ## 8-byte Folded Spill
+	movq	-96(%rbp), %r9          ## 8-byte Reload
+	movq	-112(%rbp), %rax        ## 8-byte Reload
+	addl	%r9d, %eax
+	movq	%rax, -112(%rbp)        ## 8-byte Spill
+	movq	-120(%rbp), %rax        ## 8-byte Reload
+	addl	%r9d, %eax
+	movq	%rax, -120(%rbp)        ## 8-byte Spill
+	addl	%r9d, -124(%rbp)        ## 4-byte Folded Spill
+	movq	-208(%rbp), %rax        ## 8-byte Reload
+	cmpq	-264(%rbp), %rax        ## 8-byte Folded Reload
+	jne	LBB0_52
+## %bb.57:                              ##   in Loop: Header=BB0_19 Depth=1
+	movq	-320(%rbp), %rcx        ## 8-byte Reload
+	movq	%rcx, %rax
+	cmpq	-72(%rbp), %rcx         ## 8-byte Folded Reload
+	jne	LBB0_19
+	jmp	LBB0_59
+	.p2align	4, 0x90
+LBB0_58:                                ##   in Loop: Header=BB0_19 Depth=1
+	movq	%rax, %rcx
+	addq	$1, %rcx
+	movq	%rcx, %rax
+	cmpq	-72(%rbp), %rcx         ## 8-byte Folded Reload
+	jne	LBB0_19
+LBB0_59:
+	movq	_var_false@GOTPCREL(%rip), %rax
+	cmpl	$0, (%rax)
+	je	LBB0_61
+## %bb.60:
+	movq	%rbx, %rdi
+	vzeroupper
+	callq	_dummy
+	movq	-80(%rbp), %rdi         ## 8-byte Reload
+	callq	_dummy
+	leaq	-56(%rbp), %rdi
+	callq	_dummy
+LBB0_61:
+	xorl	%eax, %eax
+	addq	$408, %rsp              ## imm = 0x198
+	popq	%rbx
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	vzeroupper
+	retq
+	.cfi_endproc
+                                        ## -- End function
+
+.subsections_via_symbols