From 43199278eb52eaeefcec516e70dbfd64aa0f5cf9 Mon Sep 17 00:00:00 2001 From: Andrew Leech Date: Wed, 25 Feb 2026 22:40:10 +1100 Subject: [PATCH] tests/run-tests.py: Ignore known-flaky test failures. Reclassify failures of tests listed in flaky_tests_to_ignore as "ignored" instead of retrying them. Ignored tests still run and their output is reported, but they don't affect the exit code. The ci.sh --exclude lists for these tests are removed so they run normally. The riscv64 QEMU job additionally drops its MICROPY_TEST_TIMEOUT=200 override and now excludes thread/stress_aes.py (which takes around 180 seconds) to keep execution time down. Signed-off-by: Andrew Leech --- tests/run-tests.py | 28 ++++++++++++++++++++++++++++ tests/test_utils.py | 16 +++++++++++++++- tools/ci.sh | 18 +++++------------- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/tests/run-tests.py b/tests/run-tests.py index 2add13df21..84daf4cbbf 100755 --- a/tests/run-tests.py +++ b/tests/run-tests.py @@ -31,6 +31,7 @@ from test_utils import ( get_test_instance, prepare_script_for_target, create_test_report, + FLAKY_REASON_PREFIX, ) RV32_ARCH_FLAGS = { @@ -193,6 +194,23 @@ platform_tests_to_skip = { ), } +# Tests with known intermittent failures. These tests still run, but failures +# are reclassified as "ignored" instead of "fail" so they don't affect the CI +# exit code. Paths are relative to the tests/ directory (must match test_file +# format used by run_one_test, which normalises backslashes to forward slashes). +# +# Values are (reason, platforms) tuples where platforms is None (all platforms) +# or a tuple of sys.platform strings to restrict ignoring to those platforms. 
+flaky_tests_to_ignore = { + "thread/thread_gc1.py": ("GC race condition", None), + "thread/stress_schedule.py": ("intermittent crash under QEMU", None), + "thread/stress_recurse.py": ("stack overflow under emulation", None), + "thread/stress_heap.py": ("flaky on macOS", ("darwin",)), + "cmdline/repl_lock.py": ("REPL timing under QEMU", None), + "cmdline/repl_cont.py": ("REPL escaping on macOS", ("darwin",)), + "extmod/time_time_ns.py": ("CI runner clock precision", None), +} + # These tests don't test float explicitly but rather use it to perform the test. tests_requiring_float = ( "extmod/asyncio_basic.py", @@ -1062,6 +1080,16 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1): print(line) sys.exit(2) + # Reclassify known-flaky test failures as ignored. + # Safe to mutate: thread pool has joined. + results = test_results.value + for i, r in enumerate(results): + if r[1] == "fail": + reason, platforms = flaky_tests_to_ignore.get(r[0], (None, None)) + if reason is not None: + if platforms is None or sys.platform in platforms: + results[i] = (r[0], "ignored", "{}: {}".format(FLAKY_REASON_PREFIX, reason)) + # Return test results. return test_results.value, testcase_count.value diff --git a/tests/test_utils.py b/tests/test_utils.py index 99b92ea7b3..7e43c4cae9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,6 +22,9 @@ def base_path(*p): sys.path.append(base_path("../tools")) import pyboard +# Prefix used by run-tests.py to tag known-flaky test results. +FLAKY_REASON_PREFIX = "flaky" + # File with the test results. 
_RESULTS_FILE = "_results.json" @@ -313,11 +316,12 @@ def create_test_report(args, test_results, testcase_count=None): r for r in test_results if r[1] == "skip" and r[2] == "too large" ) failed_tests = list(r for r in test_results if r[1] == "fail") + ignored_tests = list(r for r in test_results if r[1] == "ignored") dry_run = getattr(args, "dry_run", False) if dry_run: found_tests = list(r for r in test_results if r[1] == "found") - num_tests_performed = len(passed_tests) + len(failed_tests) + num_tests_performed = len(passed_tests) + len(failed_tests) + len(ignored_tests) if dry_run: print("{} tests found".format(len(found_tests))) @@ -329,6 +333,14 @@ def create_test_report(args, test_results, testcase_count=None): print("{} tests passed".format(len(passed_tests))) + if len(ignored_tests) > 0: + print( + "{} tests had known-flaky failures (ignored): {}".format( + len(ignored_tests), + " ".join("{} [{}]".format(t[0], t[2]) for t in ignored_tests), + ) + ) + if len(skipped_tests) > 0: print( "{} tests skipped: {}".format( @@ -365,6 +377,8 @@ def create_test_report(args, test_results, testcase_count=None): "results": list(test for test in test_results), # A list of failed tests. This is deprecated, use the "results" above instead. "failed_tests": [test[0] for test in failed_tests], + # A list of known-flaky tests whose failures were ignored. 
+ "ignored_tests": [test[0] for test in ignored_tests], }, f, default=to_json, diff --git a/tools/ci.sh b/tools/ci.sh index 588bb31638..6bc74bd54c 100755 --- a/tools/ci.sh +++ b/tools/ci.sh @@ -905,9 +905,7 @@ function ci_unix_macos_run_tests { # Issues with macOS tests: # - float_parse and float_parse_doubleprec parse/print floats out by a few mantissa bits # - ffi_callback crashes for an unknown reason - # - thread/stress_heap.py is flaky - # - thread/thread_gc1.py is flaky - (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback|thread/stress_heap|thread/thread_gc1).py') + (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback).py') } function ci_unix_qemu_mips_setup { @@ -927,10 +925,8 @@ function ci_unix_qemu_mips_build { function ci_unix_qemu_mips_run_tests { # Issues with MIPS tests: # - thread/stress_aes.py takes around 90 seconds - # - thread/stress_recurse.py is flaky - # - thread/thread_gc1.py is flaky file ./ports/unix/build-coverage/micropython - (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py') + (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py) } function ci_unix_qemu_arm_setup { @@ -950,10 +946,8 @@ function ci_unix_qemu_arm_build { function ci_unix_qemu_arm_run_tests { # Issues with ARM tests: # - thread/stress_aes.py takes around 70 seconds - # - thread/stress_recurse.py is flaky - # - thread/thread_gc1.py is flaky file ./ports/unix/build-coverage/micropython - (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py') + (cd tests && 
MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py) } function ci_unix_qemu_riscv64_setup { @@ -976,12 +970,10 @@ function ci_unix_qemu_riscv64_build { function ci_unix_qemu_riscv64_run_tests { # Issues with RISCV-64 tests: - # - thread/stress_aes.py takes around 180 seconds - # - thread/stress_recurse.py is flaky - # - thread/thread_gc1.py is flaky + # - thread/stress_aes.py takes around 180 seconds, so exclude it to keep execution time down file ./ports/unix/build-coverage/micropython pushd tests - MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=200 ./run-tests.py --exclude 'thread/stress_recurse.py|thread/thread_gc1.py' + MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-tests.py --exclude 'thread/stress_aes.py' MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-natmodtests.py extmod/btree*.py extmod/deflate*.py extmod/framebuf*.py extmod/heapq*.py extmod/random_basic*.py extmod/re*.py popd }