tests/run-tests.py: Ignore known-flaky test failures.

Reclassify failures of tests listed in flaky_tests_to_ignore as "ignored"
instead of retrying them. Ignored tests still run and their output is
reported, but they don't affect the exit code. The ci.sh --exclude lists
for these tests are removed so they run normally.

Signed-off-by: Andrew Leech <andrew.leech@planet-innovation.com>
This commit is contained in:
Andrew Leech
2026-02-25 22:40:10 +11:00
committed by Damien George
parent ad054fc520
commit 43199278eb
3 changed files with 48 additions and 14 deletions

View File

@@ -31,6 +31,7 @@ from test_utils import (
get_test_instance,
prepare_script_for_target,
create_test_report,
FLAKY_REASON_PREFIX,
)
RV32_ARCH_FLAGS = {
@@ -193,6 +194,23 @@ platform_tests_to_skip = {
),
}
# Tests with known intermittent failures. These tests still run, but failures
# are reclassified as "ignored" instead of "fail" so they don't affect the CI
# exit code. Paths are relative to the tests/ directory (must match test_file
# format used by run_one_test, which normalises backslashes to forward slashes).
#
# Values are (reason, platforms) tuples where platforms is None (all platforms)
# or a tuple of sys.platform strings to restrict ignoring to those platforms.
# The reason string is surfaced verbatim in the test report so a reader can
# tell why a given failure was ignored.
flaky_tests_to_ignore = {
    "thread/thread_gc1.py": ("GC race condition", None),
    "thread/stress_schedule.py": ("intermittent crash under QEMU", None),
    "thread/stress_recurse.py": ("stack overflow under emulation", None),
    # macOS-only: restricted via the ("darwin",) platforms tuple.
    "thread/stress_heap.py": ("flaky on macOS", ("darwin",)),
    "cmdline/repl_lock.py": ("REPL timing under QEMU", None),
    "cmdline/repl_cont.py": ("REPL escaping on macOS", ("darwin",)),
    "extmod/time_time_ns.py": ("CI runner clock precision", None),
}
# These tests don't test float explicitly but rather use it to perform the test.
tests_requiring_float = (
"extmod/asyncio_basic.py",
@@ -1062,6 +1080,16 @@ def run_tests(pyb, tests, args, result_dir, num_threads=1):
print(line)
sys.exit(2)
# Reclassify known-flaky test failures as ignored.
# Safe to mutate: thread pool has joined.
results = test_results.value
for i, r in enumerate(results):
if r[1] == "fail":
reason, platforms = flaky_tests_to_ignore.get(r[0], (None, None))
if reason is not None:
if platforms is None or sys.platform in platforms:
results[i] = (r[0], "ignored", "{}: {}".format(FLAKY_REASON_PREFIX, reason))
# Return test results.
return test_results.value, testcase_count.value

View File

@@ -22,6 +22,9 @@ def base_path(*p):
sys.path.append(base_path("../tools"))
import pyboard
# Prefix used by run-tests.py to tag known-flaky test results; the full
# status string is formatted as "<prefix>: <reason>".
FLAKY_REASON_PREFIX = "flaky"

# File with the test results.
_RESULTS_FILE = "_results.json"
@@ -313,11 +316,12 @@ def create_test_report(args, test_results, testcase_count=None):
r for r in test_results if r[1] == "skip" and r[2] == "too large"
)
# Partition results by status. Each result is a (test_file, status, extra)
# tuple; "ignored" marks known-flaky failures that must not fail the run.
failed_tests = [r for r in test_results if r[1] == "fail"]
ignored_tests = [r for r in test_results if r[1] == "ignored"]
# getattr: older callers may pass an args namespace without dry_run.
dry_run = getattr(args, "dry_run", False)
if dry_run:
    found_tests = [r for r in test_results if r[1] == "found"]
# Ignored tests still executed, so count them as performed. (Diff residue
# fix: only the post-change assignment is kept; the stale pre-change
# assignment that omitted ignored_tests is dropped.)
num_tests_performed = len(passed_tests) + len(failed_tests) + len(ignored_tests)
if dry_run:
    print("{} tests found".format(len(found_tests)))
@@ -329,6 +333,14 @@ def create_test_report(args, test_results, testcase_count=None):
print("{} tests passed".format(len(passed_tests)))
if len(ignored_tests) > 0:
    # One summary line: each entry shows the test path plus its flaky reason.
    details = " ".join("{} [{}]".format(t[0], t[2]) for t in ignored_tests)
    message = "{} tests had known-flaky failures (ignored): {}".format(
        len(ignored_tests), details
    )
    print(message)
if len(skipped_tests) > 0:
print(
"{} tests skipped: {}".format(
@@ -365,6 +377,8 @@ def create_test_report(args, test_results, testcase_count=None):
"results": list(test for test in test_results),
# A list of failed tests. This is deprecated, use the "results" above instead.
"failed_tests": [test[0] for test in failed_tests],
# A list of known-flaky tests whose failures were ignored.
"ignored_tests": [test[0] for test in ignored_tests],
},
f,
default=to_json,

View File

@@ -905,9 +905,7 @@ function ci_unix_macos_run_tests {
# Issues with macOS tests:
# - float_parse and float_parse_doubleprec parse/print floats out by a few mantissa bits
# - ffi_callback crashes for an unknown reason
# - thread/stress_heap.py is flaky
# - thread/thread_gc1.py is flaky
(cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback|thread/stress_heap|thread/thread_gc1).py')
(cd tests && MICROPY_MICROPYTHON=../ports/unix/build-standard/micropython ./run-tests.py --exclude '(float_parse|float_parse_doubleprec|ffi_callback).py')
}
function ci_unix_qemu_mips_setup {
@@ -927,10 +925,8 @@ function ci_unix_qemu_mips_build {
function ci_unix_qemu_mips_run_tests {
    # Issues with MIPS tests:
    # - thread/stress_aes.py takes around 90 seconds
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=180 ./run-tests.py)
}
function ci_unix_qemu_arm_setup {
@@ -950,10 +946,8 @@ function ci_unix_qemu_arm_build {
function ci_unix_qemu_arm_run_tests {
    # Issues with ARM tests:
    # - thread/stress_aes.py takes around 70 seconds
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    (cd tests && MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython MICROPY_TEST_TIMEOUT=90 ./run-tests.py)
}
function ci_unix_qemu_riscv64_setup {
@@ -976,12 +970,10 @@ function ci_unix_qemu_riscv64_build {
function ci_unix_qemu_riscv64_run_tests {
    # Issues with RISCV-64 tests:
    # - thread/stress_aes.py takes around 180 seconds, so exclude it to keep execution time down
    # Flaky thread tests are no longer excluded here; run-tests.py ignores
    # their failures via its flaky_tests_to_ignore list.
    file ./ports/unix/build-coverage/micropython
    pushd tests
    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-tests.py --exclude 'thread/stress_aes.py'
    MICROPY_MICROPYTHON=../ports/unix/build-coverage/micropython ./run-natmodtests.py extmod/btree*.py extmod/deflate*.py extmod/framebuf*.py extmod/heapq*.py extmod/random_basic*.py extmod/re*.py
    popd
}