From b033b3b7aa1bb9f8519e1badec661414b846d045 Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Sat, 17 Apr 2021 11:06:39 +0200
Subject: [PATCH 01/12] allow different base with prefix for offset values

---
 osaca/semantics/kernel_dg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index fd44208..adf0419 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -313,9 +313,9 @@ class KernelDG(nx.DiGraph):
             # determine absolute address change
             addr_change = 0
             if src.offset and "value" in src.offset:
-                addr_change += int(src.offset.value)
+                addr_change += int(src.offset.value, 0)
             if mem.offset:
-                addr_change -= int(mem.offset.value)
+                addr_change -= int(mem.offset.value, 0)
             if mem.base and src.base:
                 base_change = register_changes.get(
                     src.base.get('prefix', '')+src.base.name,

From 607d459569a279ff3fe1fbe6efdf72ca5b04894c Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Sat, 17 Apr 2021 12:46:44 +0200
Subject: [PATCH 02/12] keep dependency paths as generators instead of lists

---
 osaca/semantics/kernel_dg.py | 41 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index adf0419..dd6d5c3 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -88,31 +88,32 @@ class KernelDG(nx.DiGraph):
         loopcarried_deps = []
         paths = []
         for instr in kernel:
-            paths += list(nx.algorithms.simple_paths.all_simple_paths(
+            paths.append(nx.algorithms.simple_paths.all_simple_paths(
                 dg, instr.line_number, instr.line_number + offset))
 
         paths_set = set()
-        for path in paths:
-            lat_sum = 0.0
-            # extend path by edge bound latencies (e.g., store-to-load latency)
-            lat_path = []
-            for s, d in nx.utils.pairwise(path):
-                edge_lat = dg.edges[s, d]['latency']
-                # map source node back to original line numbers
-                if s >= offset:
-                    s -= offset
-                lat_path.append((s, edge_lat))
-                lat_sum += edge_lat
-            if d >= offset:
-                d -= offset
-            lat_path.sort()
+        for path_gen in paths:
+            for path in path_gen:
+                lat_sum = 0.0
+                # extend path by edge bound latencies (e.g., store-to-load latency)
+                lat_path = []
+                for s, d in nx.utils.pairwise(path):
+                    edge_lat = dg.edges[s, d]['latency']
+                    # map source node back to original line numbers
+                    if s >= offset:
+                        s -= offset
+                    lat_path.append((s, edge_lat))
+                    lat_sum += edge_lat
+                if d >= offset:
+                    d -= offset
+                lat_path.sort()
 
-            # Ignore duplicate paths which differ only in the root node
-            if tuple(lat_path) in paths_set:
-                continue
-            paths_set.add(tuple(lat_path))
+                # Ignore duplicate paths which differ only in the root node
+                if tuple(lat_path) in paths_set:
+                    continue
+                paths_set.add(tuple(lat_path))
 
-            loopcarried_deps.append((lat_sum, lat_path))
+                loopcarried_deps.append((lat_sum, lat_path))
         loopcarried_deps.sort(reverse=True)
 
         # map lcd back to nodes

From 152360bad2c9cdcb17cf8b522541db3db63126c0 Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 00:04:03 +0200
Subject: [PATCH 03/12] enhanced LCD analysis by making it parallel and added
 timeout flag

---
 osaca/frontend.py            |  59 +++++++++---
 osaca/osaca.py               |  12 ++-
 osaca/semantics/kernel_dg.py | 175 ++++++++++++++++++++++++-----------
 3 files changed, 174 insertions(+), 72 deletions(-)

diff --git a/osaca/frontend.py b/osaca/frontend.py
index f9dc030..65ca702 100755
--- a/osaca/frontend.py
+++ b/osaca/frontend.py
@@ -163,6 +163,7 @@ class Frontend(object):
         ignore_unknown=False,
         arch_warning=False,
         length_warning=False,
+        lcd_warning=False,
         verbose=False,
     ):
         """
@@ -176,17 +177,19 @@ class Frontend(object):
         :param ignore_unknown: flag for ignore warning if performance data is missing, defaults to
             `False`
         :type ignore_unknown: boolean, optional
-        :param print_arch_warning: flag for additional user warning to specify micro-arch
-        :type print_arch_warning: boolean, optional
-        :param print_length_warning: flag for additional user warning to specify kernel length with
+        :param arch_warning: flag for additional user warning to specify micro-arch
+        :type arch_warning: boolean, optional
+        :param length_warning: flag for additional user warning to specify kernel length with
                                      --lines
-        :type print_length_warning: boolean, optional
+        :type length_warning: boolean, optional
+        :param lcd_warning: flag for additional user warning due to LCD analysis timed out
+        :type lcd_warning: boolean, optional
         :param verbose: flag for verbosity level, defaults to False
         :type verbose: boolean, optional
         """
         return (
             self._header_report()
-            + self._user_warnings(arch_warning, length_warning)
+            + self._user_warnings_header(arch_warning, length_warning)
             + self._symbol_map()
             + self.combined_view(
                 kernel,
@@ -194,6 +197,7 @@ class Frontend(object):
                 kernel_dg.get_loopcarried_dependencies(),
                 ignore_unknown,
             )
+            + self._user_warnings_footer(lcd_warning)
             + self.loopcarried_dependencies(kernel_dg.get_loopcarried_dependencies())
         )
 
@@ -236,8 +240,9 @@ class Frontend(object):
         if dep_dict:
             longest_lcd = max(dep_dict, key=lambda ln: dep_dict[ln]['latency'])
             lcd_sum = dep_dict[longest_lcd]['latency']
-            lcd_lines = {instr["line_number"]: lat
-                           for instr, lat in dep_dict[longest_lcd]["dependencies"]}
+            lcd_lines = {
+                instr["line_number"]: lat for instr, lat in dep_dict[longest_lcd]["dependencies"]
+            }
 
         s += headline_str.format(headline) + "\n"
         s += (
@@ -311,18 +316,24 @@ class Frontend(object):
         ).format(amount, "-" * len(str(amount)))
         return s
 
-    def _user_warnings(self, arch_warning, length_warning):
+    def _user_warnings_header(self, arch_warning, length_warning):
         """Returns warning texts for giving the user more insight in what he is doing."""
+        dashed_line = (
+            "-------------------------------------------------------------------------"
+            "------------------------\n"
+        )
         arch_text = (
-            "WARNING: No micro-architecture was specified and a default uarch was used.\n"
-            "         Specify the uarch with --arch. See --help for more information.\n"
+            "-------------------------- WARNING: No micro-architecture was specified "
+            "-------------------------\n"
+            "         A default uarch for this particular ISA was used. Specify "
+            "the uarch with --arch.\n         See --help for more information.\n" + dashed_line
         )
         length_text = (
-            "WARNING: You are analyzing a large amount of instruction forms. Analysis "
-            "across loops/block boundaries often do not make much sense.\n"
-            "         Specify the kernel length with --length. See --help for more "
-            "information.\n"
-            "         If this is intentional, you can safely ignore this message.\n"
+            "----------------- WARNING: You are analyzing a large amount of instruction forms "
+            "----------------\n         Analysis across loops/block boundaries often do not make"
+            " much sense.\n         Specify the kernel length with --length. See --help for more "
+            "information.\n         If this is intentional, you can safely ignore this message.\n"
+            + dashed_line
         )
 
         warnings = ""
@@ -331,6 +342,24 @@ class Frontend(object):
         warnings += "\n"
         return warnings
 
+    def _user_warnings_footer(self, lcd_warning):
+        """Returns warning texts for giving the user more insight in what he is doing."""
+        dashed_line = (
+            "-------------------------------------------------------------------------"
+            "------------------------\n"
+        )
+        lcd_text = (
+            "-------------------------------- WARNING: LCD analysis timed out "
+            "-------------------------------\n         While searching for all dependency chains"
+            " the analysis timed out.\n         Decrease the number of instructions or set the "
+            "timeout threshold with --lcd-timeout.\n         See --help for more "
+            "information.\n" + dashed_line
+        )
+        warnings = "\n"
+        warnings += lcd_text if lcd_warning else ""
+        warnings += "\n"
+        return warnings
+
     def _get_separator_list(self, separator, separator_2=" "):
         """Creates column view for seperators in the TP/combined view."""
         separator_list = []
diff --git a/osaca/osaca.py b/osaca/osaca.py
index 40b25d9..b97bcb7 100755
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -146,6 +146,15 @@ def create_parser(parser=None):
         action="store_true",
         help="Ignore if instructions cannot be found in the data file and print analysis anyway.",
     )
+    parser.add_argument(
+        "--lcd-timeout",
+        dest="lcd_timeout",
+        metavar="SECONDS",
+        type=int,
+        default=10,
+        help="Set timeout in seconds for LCD analysis. After timeout, OSACA will continue"
+        " its analysis with the dependency paths found up to this point. Defaults to 10.",
+    )
     parser.add_argument(
         "--verbose", "-v", action="count", default=0, help="Increases verbosity level."
     )
@@ -303,7 +312,7 @@ def inspect(args, output_file=sys.stdout):
         semantics.assign_optimal_throughput(kernel)
 
     # Create DiGrahps
-    kernel_graph = KernelDG(kernel, parser, machine_model, semantics)
+    kernel_graph = KernelDG(kernel, parser, machine_model, semantics, args.lcd_timeout)
     if args.dotpath is not None:
         kernel_graph.export_graph(args.dotpath if args.dotpath != "." else None)
     # Print analysis
@@ -315,6 +324,7 @@ def inspect(args, output_file=sys.stdout):
             ignore_unknown=ignore_unknown,
             arch_warning=print_arch_warning,
             length_warning=print_length_warning,
+            lcd_warning=kernel_graph.timed_out,
             verbose=verbose,
         ),
         file=output_file,
diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index dd6d5c3..94a5dfc 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -1,22 +1,39 @@
 #!/usr/bin/env python3
 
 import copy
-from itertools import chain, product
+import time
 from collections import defaultdict
+from itertools import accumulate, chain, product
+from multiprocessing import Manager, Process, cpu_count
 
 import networkx as nx
-
 from osaca.parser import AttrDict
-from osaca.semantics import INSTR_FLAGS, MachineModel, ArchSemantics
+from osaca.semantics import INSTR_FLAGS, ArchSemantics, MachineModel
+
 
 class KernelDG(nx.DiGraph):
-    def __init__(self, parsed_kernel, parser, hw_model: MachineModel, semantics: ArchSemantics):
+    # threshold for checking dependency graph sequential or in parallel
+    INSTRUCTION_THRESHOLD = 50
+
+    def __init__(
+        self, parsed_kernel, parser, hw_model: MachineModel, semantics: ArchSemantics, timeout=10
+    ):
+        self.timed_out = False
         self.kernel = parsed_kernel
         self.parser = parser
         self.model = hw_model
         self.arch_sem = semantics
         self.dg = self.create_DG(self.kernel)
-        self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel)
+        self.loopcarried_deps = self.check_for_loopcarried_dep(self.kernel, timeout)
+
+    def _extend_path(self, dst_list, kernel, dg, offset):
+        for instr in kernel:
+            generator_path = nx.algorithms.simple_paths.all_simple_paths(
+                dg, instr.line_number, instr.line_number + offset
+            )
+            tmp_list = list(generator_path)
+            dst_list.extend(tmp_list)
+        # print('Thread [{}-{}] done'.format(kernel[0]['line_number'], kernel[-1]['line_number']))
 
     def create_DG(self, kernel):
         """
@@ -65,17 +82,19 @@ class KernelDG(nx.DiGraph):
                 dg.nodes[dep["line_number"]]["instruction_form"] = dep
         return dg
 
-    def check_for_loopcarried_dep(self, kernel):
+    def check_for_loopcarried_dep(self, kernel, timeout=10):
         """
         Try to find loop-carried dependencies in given kernel.
 
         :param kernel: Parsed asm kernel with assigned semantic information
         :type kernel: list
+        :param timeout: Timeout in seconds for parallel execution, defaults
+                                    to `10`. Set to `0` for no timeout
+        :type timeout: int
         :returns: `dict` -- dependency dictionary with all cyclic LCDs
         """
         # increase line number for second kernel loop
         offset = max(1000, max([i.line_number for i in kernel]))
-        first_line_no = kernel[0].line_number
         tmp_kernel = [] + kernel
         for orig_iform in kernel:
             temp_iform = copy.copy(orig_iform)
@@ -86,34 +105,72 @@ class KernelDG(nx.DiGraph):
 
         # build cyclic loop-carried dependencies
         loopcarried_deps = []
-        paths = []
-        for instr in kernel:
-            paths.append(nx.algorithms.simple_paths.all_simple_paths(
-                dg, instr.line_number, instr.line_number + offset))
+        all_paths = []
+
+        klen = len(kernel)
+        if klen >= self.INSTRUCTION_THRESHOLD:
+            # parallel execution with static scheduling
+            num_cores = cpu_count()
+            workload = int((klen - 1) / num_cores) + 1
+            starts = [tid * workload for tid in range(num_cores)]
+            ends = [min((tid + 1) * workload, klen) for tid in range(num_cores)]
+            instrs = [kernel[s:e] for s, e in zip(starts, ends)]
+            with Manager() as manager:
+                all_paths = manager.list()
+                processes = [
+                    Process(target=self._extend_path, args=(all_paths, instr_section, dg, offset))
+                    for instr_section in instrs
+                ]
+                for p in processes:
+                    p.start()
+                start_time = time.time()
+                while time.time() - start_time <= timeout:
+                    if any(p.is_alive() for p in processes):
+                        time.sleep(0.2)
+                    else:
+                        # all procs done
+                        for p in processes:
+                            p.join()
+                        break
+                else:
+                    self.timed_out = True
+                    # terminate running processes
+                    for p in processes:
+                        if p.is_alive():
+                            p.kill()
+                        p.join()
+                all_paths = list(all_paths)
+        else:
+            # sequential execution to avoid overhead when analyzing smaller kernels
+            for instr in kernel:
+                all_paths.extend(
+                    nx.algorithms.simple_paths.all_simple_paths(
+                        dg, instr.line_number, instr.line_number + offset
+                    )
+                )
 
         paths_set = set()
-        for path_gen in paths:
-            for path in path_gen:
-                lat_sum = 0.0
-                # extend path by edge bound latencies (e.g., store-to-load latency)
-                lat_path = []
-                for s, d in nx.utils.pairwise(path):
-                    edge_lat = dg.edges[s, d]['latency']
-                    # map source node back to original line numbers
-                    if s >= offset:
-                        s -= offset
-                    lat_path.append((s, edge_lat))
-                    lat_sum += edge_lat
-                if d >= offset:
-                    d -= offset
-                lat_path.sort()
+        for path in all_paths:
+            lat_sum = 0.0
+            # extend path by edge bound latencies (e.g., store-to-load latency)
+            lat_path = []
+            for s, d in nx.utils.pairwise(path):
+                edge_lat = dg.edges[s, d]['latency']
+                # map source node back to original line numbers
+                if s >= offset:
+                    s -= offset
+                lat_path.append((s, edge_lat))
+                lat_sum += edge_lat
+            if d >= offset:
+                d -= offset
+            lat_path.sort()
 
-                # Ignore duplicate paths which differ only in the root node
-                if tuple(lat_path) in paths_set:
-                    continue
-                paths_set.add(tuple(lat_path))
+            # Ignore duplicate paths which differ only in the root node
+            if tuple(lat_path) in paths_set:
+                continue
+            paths_set.add(tuple(lat_path))
 
-                loopcarried_deps.append((lat_sum, lat_path))
+            loopcarried_deps.append((lat_sum, lat_path))
         loopcarried_deps.sort(reverse=True)
 
         # map lcd back to nodes
@@ -121,8 +178,10 @@ class KernelDG(nx.DiGraph):
         for lat_sum, involved_lines in loopcarried_deps:
             loopcarried_deps_dict[involved_lines[0][0]] = {
                 "root": self._get_node_by_lineno(involved_lines[0][0]),
-                "dependencies": [(self._get_node_by_lineno(ln), lat) for ln, lat in involved_lines],
-                "latency": lat_sum
+                "dependencies": [
+                    (self._get_node_by_lineno(ln), lat) for ln, lat in involved_lines
+                ],
+                "latency": lat_sum,
             }
         return loopcarried_deps_dict
 
@@ -168,9 +227,7 @@ class KernelDG(nx.DiGraph):
             # split to DAG
             raise NotImplementedError("Kernel is cyclic.")
 
-    def find_depending(
-        self, instruction_form, instructions, flag_dependencies=False
-    ):
+    def find_depending(self, instruction_form, instructions, flag_dependencies=False):
         """
         Find instructions in `instructions` depending on a given instruction form's results.
 
@@ -190,15 +247,15 @@ class KernelDG(nx.DiGraph):
             # TODO instructions before must be considered as well, if they update registers
             # not used by insruction_form. E.g., validation/build/A64FX/gcc/O1/gs-2d-5pt.marked.s
             register_changes = self._update_reg_changes(instruction_form)
-            #print("FROM", instruction_form.line, register_changes)
+            # print("FROM", instruction_form.line, register_changes)
             for i, instr_form in enumerate(instructions):
                 self._update_reg_changes(instr_form, register_changes)
-                #print("  TO", instr_form.line, register_changes)
+                # print("  TO", instr_form.line, register_changes)
                 if "register" in dst:
                     # read of register
                     if self.is_read(dst.register, instr_form) and not (
-                            dst.get("pre_indexed", False) or
-                            dst.get("post_indexed", False)):
+                        dst.get("pre_indexed", False) or dst.get("post_indexed", False)
+                    ):
                         yield instr_form, []
                     # write to register -> abort
                     if self.is_written(dst.register, instr_form):
@@ -215,10 +272,10 @@ class KernelDG(nx.DiGraph):
                     if "pre_indexed" in dst.memory:
                         if self.is_written(dst.memory.base, instr_form):
                             break
-                    #if dst.memory.base:
+                    # if dst.memory.base:
                     #    if self.is_read(dst.memory.base, instr_form):
                     #        yield instr_form, []
-                    #if dst.memory.index:
+                    # if dst.memory.index:
                     #    if self.is_read(dst.memory.index, instr_form):
                     #        yield instr_form, []
                     if "post_indexed" in dst.memory:
@@ -226,7 +283,7 @@ class KernelDG(nx.DiGraph):
                         if self.is_written(dst.memory.base, instr_form):
                             break
                     # TODO record register changes
-                    #      (e.g., mov, leaadd, sub, inc, dec) in instructions[:i] 
+                    #      (e.g., mov, leaadd, sub, inc, dec) in instructions[:i]
                     #      and pass to is_memload and is_memstore to consider relevance.
                     # load from same location (presumed)
                     if self.is_memload(dst.memory, instr_form, register_changes):
@@ -286,7 +343,9 @@ class KernelDG(nx.DiGraph):
                 if src.memory.base is not None:
                     is_read = self.parser.is_reg_dependend_of(register, src.memory.base) or is_read
                 if src.memory.index is not None:
-                    is_read = self.parser.is_reg_dependend_of(register, src.memory.index) or is_read
+                    is_read = (
+                        self.parser.is_reg_dependend_of(register, src.memory.index) or is_read
+                    )
         # Check also if read in destination memory address
         for dst in chain(
             instruction_form.semantic_operands.destination,
@@ -296,7 +355,9 @@ class KernelDG(nx.DiGraph):
                 if dst.memory.base is not None:
                     is_read = self.parser.is_reg_dependend_of(register, dst.memory.base) or is_read
                 if dst.memory.index is not None:
-                    is_read = self.parser.is_reg_dependend_of(register, dst.memory.index) or is_read
+                    is_read = (
+                        self.parser.is_reg_dependend_of(register, dst.memory.index) or is_read
+                    )
         return is_read
 
     def is_memload(self, mem, instruction_form, register_changes={}):
@@ -319,36 +380,38 @@ class KernelDG(nx.DiGraph):
                 addr_change -= int(mem.offset.value, 0)
             if mem.base and src.base:
                 base_change = register_changes.get(
-                    src.base.get('prefix', '')+src.base.name,
-                    {'name': src.base.get('prefix', '')+src.base.name, 'value': 0})
+                    src.base.get('prefix', '') + src.base.name,
+                    {'name': src.base.get('prefix', '') + src.base.name, 'value': 0},
+                )
                 if base_change is None:
                     # Unknown change occurred
                     continue
-                if mem.base.get('prefix', '')+mem.base['name'] != base_change['name']:
+                if mem.base.get('prefix', '') + mem.base['name'] != base_change['name']:
                     # base registers do not match
                     continue
                 addr_change += base_change['value']
             elif mem.base or src.base:
-                    # base registers do not match
-                    continue
+                # base registers do not match
+                continue
             if mem.index and src.index:
                 index_change = register_changes.get(
-                    src.index.get('prefix', '')+src.index.name,
-                    {'name': src.index.get('prefix', '')+src.index.name, 'value': 0})
+                    src.index.get('prefix', '') + src.index.name,
+                    {'name': src.index.get('prefix', '') + src.index.name, 'value': 0},
+                )
                 if index_change is None:
                     # Unknown change occurred
                     continue
                 if mem.scale != src.scale:
                     # scale factors do not match
                     continue
-                if mem.index.get('prefix', '')+mem.index['name'] != index_change['name']:
+                if mem.index.get('prefix', '') + mem.index['name'] != index_change['name']:
                     # index registers do not match
                     continue
                 addr_change += index_change['value'] * src.scale
             elif mem.index or src.index:
-                    # index registers do not match
-                    continue
-            #if instruction_form.line_number == 3:
+                # index registers do not match
+                continue
+            # if instruction_form.line_number == 3:
             if addr_change == 0:
                 return True
         return False

From e6a54ee1316d72b7016264dd9f3b5f6e584b8754 Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 00:05:53 +0200
Subject: [PATCH 04/12] added CLX as synonym for CSX uarch

---
 osaca/osaca.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/osaca/osaca.py b/osaca/osaca.py
index b97bcb7..ec10e3a 100755
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -20,6 +20,7 @@ SUPPORTED_ARCHS = [
     "BDW",
     "SKX",
     "CSX",
+    "CLX",
     "ICL",
     "ZEN1",
     "ZEN2",
@@ -190,6 +191,9 @@ def check_arguments(args, parser):
         parser.error(
             "Microarchitecture not supported. Please see --help for all valid architecture codes."
         )
+    # manually set CLX to CSX to support both abbreviations
+    if args.arch.upper() == "CLX":
+        args.arch = "CSX"
     if "import_data" in args and args.import_data not in supported_import_files:
         parser.error(
             "Microbenchmark not supported for data import. Please see --help for all valid "

From 6db08c7e8e0a42ccd6e48475b22b68420e4f5cb7 Mon Sep 17 00:00:00 2001
From: Jan <20126033+JanLJL@users.noreply.github.com>
Date: Mon, 19 Apr 2021 00:27:24 +0200
Subject: [PATCH 05/12] added lcd-timeout flag, citations and updated credits

---
 README.rst | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/README.rst b/README.rst
index 0698291..b2dc099 100644
--- a/README.rst
+++ b/README.rst
@@ -82,10 +82,10 @@ The usage of OSACA can be listed as:
 
 .. code:: bash
 
-    osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES] [--db-check] 
-    	  [--import MICROBENCH] [--insert-marker] 
-	  [--export-graph GRAPHNAME] [--ignore-unknown] [--verbose]
-	  [--out OUT]
+    osaca [-h] [-V] [--arch ARCH] [--fixed] [--lines LINES]
+    	  [--ignore-unknown] [--lcd-timeout SECONDS]
+    	  [--db-check] [--import MICROBENCH] [--insert-marker]
+	  [--export-graph GRAPHNAME] [--out OUT] [--verbose]
 	  FILEPATH
 
 -h, --help
@@ -118,6 +118,9 @@ The usage of OSACA can be listed as:
 --ignore-unknown
   Force OSACA to apply a throughput and latency of 0.0 cy for all unknown instruction forms.
   If not specified, a warning will be printed instead if one ore more isntruction form is unknown to OSACA.
+--lcd-timeout SECONDS
+  Set timeout in seconds for LCD analysis. After timeout, OSACA will continue its analysis with the dependency paths found up to this point.
+  Defaults to `10`.
 -v, --verbose
   Increases verbosity level
 -o OUT, --out OUT
@@ -370,9 +373,16 @@ In the bottom, all loop-carried dependencies are shown, each with a list of line
 
 You can find more (already marked) examples and sample outputs for various architectures in the `examples <examples/>`__ directory.
 
+Citations
+=========
+If you use OSACA for scientific work you can cite us as  (for the Bibtex, see the `Wiki <https://github.com/RRZE-HPC/OSACA/wiki#acknowledgement>`_):
+
+* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print <https://arxiv.org/abs/1809.00912>`_)
+* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print <https://arxiv.org/abs/1910.00214>`_)
+
 Credits
 =======
-Implementation: Jan Laukemann
+Implementation: Jan Laukemann, Julian Hammer
 
 License
 =======

From a82a0e24a342ac732e0a6973d3897f20bdb5cdba Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 00:34:21 +0200
Subject: [PATCH 06/12] fixed CLX as uarch flag

---
 osaca/osaca.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/osaca/osaca.py b/osaca/osaca.py
index ec10e3a..7f5a606 100755
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -20,7 +20,6 @@ SUPPORTED_ARCHS = [
     "BDW",
     "SKX",
     "CSX",
-    "CLX",
     "ICL",
     "ZEN1",
     "ZEN2",
@@ -182,6 +181,9 @@ def check_arguments(args, parser):
     """
     supported_import_files = ["ibench", "asmbench"]
 
+    # manually set CLX to CSX to support both abbreviations
+    if args.arch.upper() == "CLX":
+        args.arch = "CSX"
     if args.arch is None and (args.check_db or "import_data" in args):
         parser.error(
             "DB check and data import cannot work with a default microarchitecture. "
@@ -191,9 +193,6 @@ def check_arguments(args, parser):
         parser.error(
             "Microarchitecture not supported. Please see --help for all valid architecture codes."
         )
-    # manually set CLX to CSX to support both abbreviations
-    if args.arch.upper() == "CLX":
-        args.arch = "CSX"
     if "import_data" in args and args.import_data not in supported_import_files:
         parser.error(
             "Microbenchmark not supported for data import. Please see --help for all valid "

From cfc061e5e35ebd5283972691e6fb51d2eedb93a2 Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 10:14:26 +0200
Subject: [PATCH 07/12] version bump

---
 osaca/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/osaca/__init__.py b/osaca/__init__.py
index 5d99ad5..18cb511 100644
--- a/osaca/__init__.py
+++ b/osaca/__init__.py
@@ -1,6 +1,6 @@
 """Open Source Architecture Code Analyzer"""
 name = "osaca"
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 
 # To trigger travis deployment to pypi, do the following:
 # 1. Increment __version___

From 3f31235f8a51b5f02f4d11a9c2a423009046fb8e Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 10:57:51 +0200
Subject: [PATCH 08/12] added no-timeout option

---
 osaca/osaca.py               |  3 ++-
 osaca/semantics/kernel_dg.py | 37 +++++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/osaca/osaca.py b/osaca/osaca.py
index 7f5a606..d010bab 100755
--- a/osaca/osaca.py
+++ b/osaca/osaca.py
@@ -153,7 +153,8 @@ def create_parser(parser=None):
         type=int,
         default=10,
         help="Set timeout in seconds for LCD analysis. After timeout, OSACA will continue"
-        " its analysis with the dependency paths found up to this point. Defaults to 10.",
+        " its analysis with the dependency paths found up to this point. Defaults to 10."
+        " Set to -1 for no timeout.",
     )
     parser.add_argument(
         "--verbose", "-v", action="count", default=0, help="Increases verbosity level."
diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index 94a5dfc..2bff2be 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -2,12 +2,10 @@
 
 import copy
 import time
-from collections import defaultdict
-from itertools import accumulate, chain, product
+from itertools import chain
 from multiprocessing import Manager, Process, cpu_count
 
 import networkx as nx
-from osaca.parser import AttrDict
 from osaca.semantics import INSTR_FLAGS, ArchSemantics, MachineModel
 
 
@@ -123,22 +121,27 @@ class KernelDG(nx.DiGraph):
                 ]
                 for p in processes:
                     p.start()
-                start_time = time.time()
-                while time.time() - start_time <= timeout:
-                    if any(p.is_alive() for p in processes):
-                        time.sleep(0.2)
-                    else:
-                        # all procs done
-                        for p in processes:
-                            p.join()
-                        break
-                else:
-                    self.timed_out = True
-                    # terminate running processes
+                if (timeout == -1):
+                    # no timeout
                     for p in processes:
-                        if p.is_alive():
-                            p.kill()
                         p.join()
+                else:
+                    start_time = time.time()
+                    while time.time() - start_time <= timeout:
+                        if any(p.is_alive() for p in processes):
+                            time.sleep(0.2)
+                        else:
+                            # all procs done
+                            for p in processes:
+                                p.join()
+                            break
+                    else:
+                        self.timed_out = True
+                        # terminate running processes
+                        for p in processes:
+                            if p.is_alive():
+                                p.kill()
+                            p.join()
                 all_paths = list(all_paths)
         else:
             # sequential execution to avoid overhead when analyzing smaller kernels

From 6d85fbe9e4f4b9057f03d5e1f6965496313c6d9c Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 10:58:11 +0200
Subject: [PATCH 09/12] fixed duplicate hyperlink tags

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index b2dc099..3d54f10 100644
--- a/README.rst
+++ b/README.rst
@@ -377,8 +377,8 @@ Citations
 =========
 If you use OSACA for scientific work you can cite us as  (for the Bibtex, see the `Wiki <https://github.com/RRZE-HPC/OSACA/wiki#acknowledgement>`_):
 
-* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print <https://arxiv.org/abs/1809.00912>`_)
-* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print <https://arxiv.org/abs/1910.00214>`_)
+* `Automated Instruction Stream Throughput Prediction for Intel and AMD Microarchitectures <https://doi.org/10.1109/PMBS.2018.8641578>`_ (`Pre-print PMBS18 <https://arxiv.org/abs/1809.00912>`_)
+* `Automatic Throughput and Critical Path Analysis of x86 and ARM Assembly Kernels <https://doi.org/10.1109/PMBS49563.2019.00006>`_ (`Pre-print PMBS19 <https://arxiv.org/abs/1910.00214>`_)
 
 Credits
 =======

From dafec70e6e5a5b642c94bc770fc4631594cc312f Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Mon, 19 Apr 2021 11:33:29 +0200
Subject: [PATCH 10/12] added wheel to pypi publishing

---
 .github/workflows/test-n-publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-n-publish.yml b/.github/workflows/test-n-publish.yml
index 6f745b2..f6ac108 100644
--- a/.github/workflows/test-n-publish.yml
+++ b/.github/workflows/test-n-publish.yml
@@ -31,11 +31,11 @@ jobs:
     - uses: codecov/codecov-action@v1
     - name: Build package
       run: |
-        python setup.py build sdist
+        python setup.py build sdist bdist_wheel
     - name: Publish to PyPI
       if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
       uses: pypa/gh-action-pypi-publish@master
       with:
         skip_existing: true
         user: __token__
-        password: ${{ secrets.pypi_password }}
\ No newline at end of file
+        password: ${{ secrets.pypi_password }}

From 3d1c6aae8db08ef8fc4d4ad7e6d89b89e1ce2bcb Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Tue, 20 Apr 2021 13:59:32 +0200
Subject: [PATCH 11/12] set min requirement to py3.6

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index fd5053a..e26528b 100755
--- a/setup.py
+++ b/setup.py
@@ -91,7 +91,6 @@ setup(
         # Specify the Python versions you support here. In particular, ensure
         # that you indicate wheter you support Python2, Python 3 or both.
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
@@ -107,7 +106,7 @@ setup(
     # requirements files see:
     # https://packaging.python.org/en/latest/requirements.html
     install_requires=["networkx", "pyparsing>=2.3.1", "ruamel.yaml>=0.15.71"],
-    python_requires=">=3.5",
+    python_requires=">=3.6",
     # List additional groups of dependencies here (e.g. development
     # dependencies). You can install these using the following syntax,
     # for example:

From 1de644cd62b87fb64185b2935f57326984fc05ac Mon Sep 17 00:00:00 2001
From: JanLJL <jan.laukemann@fau.de>
Date: Tue, 20 Apr 2021 13:59:56 +0200
Subject: [PATCH 12/12] fixed incompatibility with py3.6

---
 osaca/frontend.py            | 6 +++---
 osaca/semantics/kernel_dg.py | 6 +++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/osaca/frontend.py b/osaca/frontend.py
index 65ca702..fa6b014 100755
--- a/osaca/frontend.py
+++ b/osaca/frontend.py
@@ -351,9 +351,9 @@ class Frontend(object):
         lcd_text = (
             "-------------------------------- WARNING: LCD analysis timed out "
             "-------------------------------\n         While searching for all dependency chains"
-            " the analysis timed out.\n         Decrease the number of instructions or set the "
-            "timeout threshold with --lcd-timeout.\n         See --help for more "
-            "information.\n" + dashed_line
+            " the analysis timed out and might be\n         incomplete. Decrease the number of "
+            "instructions or set the timeout threshold\n         with --lcd-timeout. See --help"
+            " for more information.\n" + dashed_line
         )
         warnings = "\n"
         warnings += lcd_text if lcd_warning else ""
diff --git a/osaca/semantics/kernel_dg.py b/osaca/semantics/kernel_dg.py
index 2bff2be..97c3a62 100755
--- a/osaca/semantics/kernel_dg.py
+++ b/osaca/semantics/kernel_dg.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 
 import copy
+import os
+import signal
 import time
 from itertools import chain
 from multiprocessing import Manager, Process, cpu_count
@@ -140,7 +142,9 @@ class KernelDG(nx.DiGraph):
                         # terminate running processes
                         for p in processes:
                             if p.is_alive():
-                                p.kill()
+                                # Python 3.6 does not support Process.kill().
+                                # Can be changed to `p.kill()` after EoL (01/22) of Py3.6
+                                os.kill(p.pid, signal.SIGKILL)
                             p.join()
                 all_paths = list(all_paths)
         else: