From eb59a12b3662eafdcc2a6fb963749566cd5811b1 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Thu, 11 Jun 2026 20:28:31 +0000 Subject: [PATCH] parallel-make-check: close the fail-fast race, contain callable errors Two fixes from the second Copilot review round: A process spawned between abort_others()' live_procs snapshot and its registration escaped the kill sweep, leaving that build/check running to completion after fail-fast had begun. Re-check stop_event right after registering the process and SIGTERM its process group if the abort already started: either the registration happened before the sweep's snapshot (the sweep kills it) or it happened after stop_event was set (the re-check sees it), so the window is closed. Exceptions from callable steps (user_settings staging, private-dir copies) used to escape the worker thread and crash the whole script with no summary. They are now recorded as that config's step failure with the exception written to its make-check.log, e.g. a bad "user_settings" path reports FAIL (stage ) while the other configs keep running; the fail-fast bookkeeping is shared with the nonzero-exit path via record_failure(). --- .github/scripts/parallel-make-check.py | 41 +++++++++++++++++++------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/.github/scripts/parallel-make-check.py b/.github/scripts/parallel-make-check.py index 57d0b61d3a..63d462cb9d 100755 --- a/.github/scripts/parallel-make-check.py +++ b/.github/scripts/parallel-make-check.py @@ -242,13 +242,32 @@ def run_config(cfg, opts): failed = None start = time.monotonic() log = bdir / "make-check.log" + + def record_failure(step): + # Classify a failed step, doing the fail-fast bookkeeping: the + # first failure wins and aborts everyone else; any failure after + # the abort began is reported as aborted instead. + if not opts.fail_fast: + return step + with fail_lock: + label = "aborted" if stop_event.is_set() else step + stop_event.set() + if label != "aborted": + abort_others() + return label + with open(log, "w") as logf: for step, cmd in steps: if opts.fail_fast and stop_event.is_set(): failed = "aborted" break if callable(cmd): - cmd() + try: + cmd() + except Exception as e: # one config's bug, not the run's + print(f"+ {step}: {e!r}", file=logf, flush=True) + failed = record_failure(step) + break continue print(f"+ {' '.join(cmd)}", file=logf, flush=True) # stdin=DEVNULL so a test that reads stdin sees EOF (as in CI) @@ -259,22 +278,22 @@ def run_config(cfg, opts): start_new_session=True) with procs_lock: live_procs.add(proc) + if opts.fail_fast and stop_event.is_set(): + # Close the race with abort_others(): if its sweep ran + # between our stop_event check above and the registration + # just now, this process escaped the sweep - kill it + # ourselves (the wait() below then reaps it quickly). + try: + os.killpg(proc.pid, signal.SIGTERM) + except (ProcessLookupError, PermissionError): + proc.terminate() try: rc = proc.wait() finally: with procs_lock: live_procs.discard(proc) if rc != 0: - if opts.fail_fast: - # The first failure wins; any nonzero exit after the - # abort began was most likely our SIGTERM. - with fail_lock: - failed = "aborted" if stop_event.is_set() else step - stop_event.set() - if failed != "aborted": - abort_others() - else: - failed = step + failed = record_failure(step) break minutes = (time.monotonic() - start) / 60 with print_lock: