tests/run-tests.py: Ignore known-flaky test failures.

Reclassify failures of tests listed in flaky_tests_to_ignore as "ignored"
instead of retrying them. Ignored tests still run and their output is
reported, but they don't affect the exit code. The ci.sh --exclude lists
for these tests are removed so they run normally.

Signed-off-by: Andrew Leech <andrew.leech@planet-innovation.com>
This commit is contained in:
Andrew Leech
2026-02-25 22:40:10 +11:00
committed by Damien George
parent ad054fc520
commit 43199278eb
3 changed files with 48 additions and 14 deletions

View File

@@ -31,6 +31,7 @@ from test_utils import (
get_test_instance,
prepare_script_for_target,
create_test_report,
FLAKY_REASON_PREFIX,
)
RV32_ARCH_FLAGS = {
@@ -193,6 +194,23 @@ platform_tests_to_skip = {
),
}
# Tests with known intermittent failures. These tests still run, but failures
# are reclassified as "ignored" instead of "fail" so they don't affect the CI
# exit code. Paths are relative to the tests/ directory (must match test_file
# format used by run_one_test, which normalises backslashes to forward slashes).
#
# Values are (reason, platforms) tuples where platforms is None (all platforms)
# or a tuple of sys.platform strings to restrict ignoring to those platforms.
# The reason string is surfaced verbatim in the test report so a reader can
# tell why a given failure was ignored.
flaky_tests_to_ignore = {
    "thread/thread_gc1.py": ("GC race condition", None),
    "thread/stress_schedule.py": ("intermittent crash under QEMU", None),
    "thread/stress_recurse.py": ("stack overflow under emulation", None),
    # macOS-only: restricted via the ("darwin",) platforms tuple.
    "thread/stress_heap.py": ("flaky on macOS", ("darwin",)),
    "cmdline/repl_lock.py": ("REPL timing under QEMU", None),
    "cmdline/repl_cont.py": ("REPL escaping on macOS", ("darwin",)),
    "extmod/time_time_ns.py": ("CI runner clock precision", None),
}
# These tests don't test float explicitly but rather use it to perform the test.
tests_requiring_float = (
"extmod/asyncio_basic.py",
@@ -1062,6 +1080,16 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1):
print(line)
sys.exit(2)
# Reclassify known-flaky test failures as ignored.
# Safe to mutate: thread pool has joined.
results = test_results.value
for i, r in enumerate(results):
if r[1] == "fail":
reason, platforms = flaky_tests_to_ignore.get(r[0], (None, None))
if reason is not None:
if platforms is None or sys.platform in platforms:
results[i] = (r[0], "ignored", "{}: {}".format(FLAKY_REASON_PREFIX, reason))
# Return test results.
return test_results.value, testcase_count.value

View File

@@ -22,6 +22,9 @@ def base_path(*p):
sys.path.append(base_path("../tools"))
import pyboard
# Prefix used by run-tests.py to tag known-flaky test results; the full
# status string is formatted as "<prefix>: <reason>".
FLAKY_REASON_PREFIX = "flaky"

# File with the test results.
_RESULTS_FILE = "_results.json"
@@ -313,11 +316,12 @@ def create_test_report(args, test_results, testcase_count=None):
r for r in test_results if r[1] == "skip" and r[2] == "too large"
)
# Partition results by status. Each result is a (test_file, status, extra)
# tuple; "ignored" marks known-flaky failures that must not fail the run.
failed_tests = [r for r in test_results if r[1] == "fail"]
ignored_tests = [r for r in test_results if r[1] == "ignored"]
# getattr: older callers may pass an args namespace without dry_run.
dry_run = getattr(args, "dry_run", False)
if dry_run:
    found_tests = [r for r in test_results if r[1] == "found"]
# Ignored tests still executed, so count them as performed. (Diff residue
# fix: only the post-change assignment is kept; the stale pre-change
# assignment that omitted ignored_tests is dropped.)
num_tests_performed = len(passed_tests) + len(failed_tests) + len(ignored_tests)
if dry_run:
    print("{} tests found".format(len(found_tests)))
@@ -329,6 +333,14 @@ def create_test_report(args, test_results, testcase_count=None):
print("{} tests passed".format(len(passed_tests)))
if len(ignored_tests) > 0:
    # One summary line: each entry shows the test path plus its flaky reason.
    details = " ".join("{} [{}]".format(t[0], t[2]) for t in ignored_tests)
    message = "{} tests had known-flaky failures (ignored): {}".format(
        len(ignored_tests), details
    )
    print(message)
if len(skipped_tests) > 0:
print(
"{} tests skipped: {}".format(
@@ -365,6 +377,8 @@ def create_test_report(args, test_results, testcase_count=None):
"results": list(test for test in test_results),
# A list of failed tests. This is deprecated, use the "results" above instead.
"failed_tests": [test[0] for test in failed_tests],
# A list of known-flaky tests whose failures were ignored.
"ignored_tests": [test[0] for test in ignored_tests],
},
f,
default=to_json,

View File

@@ -905,9 +905,7 @@ function ci_unix_macos_run_tests {
# Issues with macOS tests:
# - float_parse and float_parse_doubleprec parse/print floats out by a few mantissa bits
# - ffi_callback crashes for an unknown reason
# - thread/stress_heap.py is flaky
# - thread/thread_gc1.py is flaky
(cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback|thread/stress_heap|thread/thread_gc1).py')
(cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback).py')
}
function ci_unix_qemu_mips_setup {
@@ -927,10 +925,8 @@ function ci_unix_qemu_mips_build {
function ci_unix_qemu_mips_run_tests {
    # Issues with MIPS tests:
    # - thread/stress_aes.py takes around 90 seconds
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py)
}
function ci_unix_qemu_arm_setup {
@@ -950,10 +946,8 @@ function ci_unix_qemu_arm_build {
function ci_unix_qemu_arm_run_tests {
    # Issues with ARM tests:
    # - thread/stress_aes.py takes around 70 seconds
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py)
}
function ci_unix_qemu_riscv64_setup {
@@ -976,12 +970,10 @@ function ci_unix_qemu_riscv64_build {
function ci_unix_qemu_riscv64_run_tests {
    # Issues with RISCV-64 tests:
    # - thread/stress_aes.py takes around 180 seconds, so exclude it to keep execution time down
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    pushd tests
    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-tests.py --exclude 'thread/stress_aes.py'
    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-natmodtests.py extmod/btree*.py extmod/deflate*.py extmod/framebuf*.py extmod/heapq*.py extmod/random_basic*.py extmod/re*.py
    popd
}