From 43199278eb52eaeefcec516e70dbfd64aa0f5cf9 Mon Sep 17 00:00:00 2001
From: Andrew Leech <andrew.leech@planetinnovation.com.au>
Date: Wed, 25 Feb 2026 22:40:10 +1100
Subject: [PATCH] tests/run-tests.py: Ignore known-flaky test failures.

Reclassify failures of tests listed in flaky_tests_to_ignore as "ignored"
instead of retrying them. Ignored tests still run and their output is
reported, but they don't affect the exit code. The ci.sh --exclude lists
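for these tests are removed so they run normally.

The riscv64 QEMU job additionally excludes thread/stress_aes.py (around
180 seconds under emulation) and drops its MICROPY_TEST_TIMEOUT override.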

Signed-off-by: Andrew Leech <andrew.leech@planetinnovation.com.au>
---
 tests/run-tests.py  | 28 ++++++++++++++++++++++++++++
 tests/test_utils.py | 16 +++++++++++++++-
 tools/ci.sh         | 18 +++++-------------
 3 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/tests/run-tests.py b/tests/run-tests.py
index 2add13df21..84daf4cbbf 100755
--- a/tests/run-tests.py
+++ b/tests/run-tests.py
@@ -31,6 +31,7 @@ from test_utils import (
     get_test_instance,
     prepare_script_for_target,
     create_test_report,
+    FLAKY_REASON_PREFIX,
 )
 
 RV32_ARCH_FLAGS = {
@@ -193,6 +194,23 @@ platform_tests_to_skip = {
     ),
 }
 
+# Tests with known intermittent failures. These tests still run, but failures
+# are reclassified as "ignored" instead of "fail" so they don't affect the CI
+# exit code. Paths are relative to the tests/ directory (must match test_file
+# format used by run_one_test, which normalises backslashes to forward slashes).
+#
+# Values are (reason, platforms) tuples. platforms is None (ignore everywhere) or
+# a tuple of sys.platform values of the host running this script (not the target).
+flaky_tests_to_ignore = {
+    "thread/thread_gc1.py": ("GC race condition", None),
+    "thread/stress_schedule.py": ("intermittent crash under QEMU", None),
+    "thread/stress_recurse.py": ("stack overflow under emulation", None),
+    "thread/stress_heap.py": ("flaky on macOS", ("darwin",)),
+    "cmdline/repl_lock.py": ("REPL timing under QEMU", None),
+    "cmdline/repl_cont.py": ("REPL escaping on macOS", ("darwin",)),
+    "extmod/time_time_ns.py": ("CI runner clock precision", None),
+}
+
 # These tests don't test float explicitly but rather use it to perform the test.
 tests_requiring_float = (
     "extmod/asyncio_basic.py",
@@ -1062,6 +1080,16 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1):
             print(line)
         sys.exit(2)
 
+    # Reclassify known-flaky test failures as ignored.
+    # Safe to mutate: thread pool has joined.
+    results = test_results.value
+    for i, r in enumerate(results):
+        if r[1] == "fail":
+            reason, platforms = flaky_tests_to_ignore.get(r[0], (None, None))
+            if reason is not None:
+                if platforms is None or sys.platform in platforms:
+                    results[i] = (r[0], "ignored", "{}: {}".format(FLAKY_REASON_PREFIX, reason))
+
     # Return test results.
     return test_results.value, testcase_count.value
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 99b92ea7b3..7e43c4cae9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -22,6 +22,9 @@ def base_path(*p):
 sys.path.append(base_path("../tools"))
 import pyboard
 
+# Prefix used by run-tests.py to tag known-flaky test results.
+FLAKY_REASON_PREFIX = "flaky"
+
 # File with the test results.
 _RESULTS_FILE = "_results.json"
 
@@ -313,11 +316,12 @@ def create_test_report(args, test_results, testcase_count=None):
         r for r in test_results if r[1] == "skip" and r[2] == "too large"
     )
     failed_tests = list(r for r in test_results if r[1] == "fail")
+    ignored_tests = list(r for r in test_results if r[1] == "ignored")
     dry_run = getattr(args, "dry_run", False)
     if dry_run:
         found_tests = list(r for r in test_results if r[1] == "found")
 
-    num_tests_performed = len(passed_tests) + len(failed_tests)
+    num_tests_performed = len(passed_tests) + len(failed_tests) + len(ignored_tests)
 
     if dry_run:
         print("{} tests found".format(len(found_tests)))
@@ -329,6 +333,14 @@ def create_test_report(args, test_results, testcase_count=None):
 
         print("{} tests passed".format(len(passed_tests)))
 
+    if len(ignored_tests) > 0:
+        print(
+            "{} tests had known-flaky failures (ignored): {}".format(
+                len(ignored_tests),
+                " ".join("{} [{}]".format(t[0], t[2]) for t in ignored_tests),
+            )
+        )
+
     if len(skipped_tests) > 0:
         print(
             "{} tests skipped: {}".format(
@@ -365,6 +377,8 @@ def create_test_report(args, test_results, testcase_count=None):
                 "results": list(test for test in test_results),
                 # A list of failed tests.  This is deprecated, use the "results" above instead.
                 "failed_tests": [test[0] for test in failed_tests],
+                # A list of known-flaky tests whose failures were ignored.
+                "ignored_tests": [test[0] for test in ignored_tests],
             },
             f,
             default=to_json,
diff --git a/tools/ci.sh b/tools/ci.sh
index 588bb31638..6bc74bd54c 100755
--- a/tools/ci.sh
+++ b/tools/ci.sh
@@ -905,9 +905,7 @@ function ci_unix_macos_run_tests {
     # Issues with macOS tests:
     # - float_parse and float_parse_doubleprec parse/print floats out by a few mantissa bits
     # - ffi_callback crashes for an unknown reason
-    # - thread/stress_heap.py is flaky
-    # - thread/thread_gc1.py is flaky
-    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback|thread/stress_heap|thread/thread_gc1).py')
+    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback).py')
 }
 
 function ci_unix_qemu_mips_setup {
@@ -927,10 +925,8 @@ function ci_unix_qemu_mips_build {
 function ci_unix_qemu_mips_run_tests {
     # Issues with MIPS tests:
     # - thread/stress_aes.py takes around 90 seconds
-    # - thread/stress_recurse.py is flaky
-    # - thread/thread_gc1.py is flaky
     file ./ports/unix/build-coverage/micropython
-    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py')
+    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py)
 }
 
 function ci_unix_qemu_arm_setup {
@@ -950,10 +946,8 @@ function ci_unix_qemu_arm_build {
 function ci_unix_qemu_arm_run_tests {
     # Issues with ARM tests:
     # - thread/stress_aes.py takes around 70 seconds
-    # - thread/stress_recurse.py is flaky
-    # - thread/thread_gc1.py is flaky
     file ./ports/unix/build-coverage/micropython
-    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py')
+    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py)
 }
 
 function ci_unix_qemu_riscv64_setup {
@@ -976,12 +970,10 @@ function ci_unix_qemu_riscv64_build {
 
 function ci_unix_qemu_riscv64_run_tests {
     # Issues with RISCV-64 tests:
-    # - thread/stress_aes.py takes around 180 seconds
-    # - thread/stress_recurse.py is flaky
-    # - thread/thread_gc1.py is flaky
+    # - thread/stress_aes.py takes around 180 seconds, so exclude it to keep execution time down
     file ./ports/unix/build-coverage/micropython
     pushd tests
-    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=200 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py'
+    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-tests.py --exclude 'thread/stress_aes.py'
     MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-natmodtests.py extmod/btree*.py extmod/deflate*.py extmod/framebuf*.py extmod/heapq*.py extmod/random_basic*.py extmod/re*.py
     popd
 }