From 1f6abed28ed0085f84eafb75a85cf87fd09bc572 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Thu, 11 Jun 2026 20:48:19 +0000 Subject: [PATCH] CI hardening: stamp under set -e, SIGKILL escalation for fail-fast Third Copilot review round: - Makefile.am: run the test-data stamp recipe body under set -e. A failed symlink mid-loop previously did not fail the compound command (only the last command's status counted), so a partially-populated build tree could be stamped complete. Now any failed setup command aborts the recipe and the stamp is not created. - parallel-make-check.py: fail-fast sent SIGTERM only, so a test that traps or ignores SIGTERM could keep the job alive until the workflow timeout. abort_others() now polls the swept processes and SIGKILLs whatever is still alive after a 10 s grace period, and the post-registration race-window kill escalates the same way (bounded wait, then SIGKILL). Verified with a config running "trap '' TERM; sleep 300": the run completes in ~10 s with the stubborn config reported as aborted and no surviving processes. --- .github/scripts/parallel-make-check.py | 44 +++++++++++++++++++------- Makefile.am | 7 ++-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/.github/scripts/parallel-make-check.py b/.github/scripts/parallel-make-check.py index 63d462cb9d..9395470e99 100755 --- a/.github/scripts/parallel-make-check.py +++ b/.github/scripts/parallel-make-check.py @@ -50,8 +50,9 @@ # directory; see --private-dir for the exception. # # The first failing config aborts the others (pending configs are skipped, -# in-flight ones are killed) so CI fails fast; pass --no-fail-fast to run -# everything and report every failure. +# in-flight ones get SIGTERM, then SIGKILL after a 10 s grace period) so CI +# fails fast; pass --no-fail-fast to run everything and report every +# failure. import argparse import json @@ -96,16 +97,33 @@ live_procs = set() procs_lock = threading.Lock() -def abort_others(): - # Every subprocess starts its own session, so killing the process +def kill_group(p, sig): + # Every subprocess starts its own session, so signalling the process # group takes down the whole make/test tree under it. + try: + os.killpg(p.pid, sig) + except (ProcessLookupError, PermissionError): + try: + p.send_signal(sig) + except ProcessLookupError: + pass + + +def abort_others(): with procs_lock: procs = list(live_procs) for p in procs: - try: - os.killpg(p.pid, signal.SIGTERM) - except (ProcessLookupError, PermissionError): - pass + kill_group(p, signal.SIGTERM) + # Bounded escalation: SIGKILL whatever ignored the SIGTERM, so + # fail-fast cannot hang behind a test that traps/ignores SIGTERM. + deadline = time.monotonic() + 10 + while any(p.poll() is None for p in procs): + if time.monotonic() > deadline: + for p in procs: + if p.poll() is None: + kill_group(p, signal.SIGKILL) + break + time.sleep(0.2) def nproc(): @@ -282,11 +300,13 @@ def run_config(cfg, opts): # Close the race with abort_others(): if its sweep ran # between our stop_event check above and the registration # just now, this process escaped the sweep - kill it - # ourselves (the wait() below then reaps it quickly). + # ourselves (the wait() below then reaps it), escalating + # like the sweep does if SIGTERM is ignored. + kill_group(proc, signal.SIGTERM) try: - os.killpg(proc.pid, signal.SIGTERM) - except (ProcessLookupError, PermissionError): - proc.terminate() + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + kill_group(proc, signal.SIGKILL) try: rc = proc.wait() finally: diff --git a/Makefile.am b/Makefile.am index 0215e0600a..4f3f8fce0c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -299,12 +299,15 @@ TESTS += $(check_SCRIPTS) # # The setup and the distclean cleanup use rm -rf: a --private-dir run of # .github/scripts/parallel-make-check.py replaces the certs symlink with a -# private directory copy, which rm -f would not remove. +# private directory copy, which rm -f would not remove. The recipe body +# runs under set -e so a failed symlink aborts the build instead of being +# stamped complete. ############################################################################## BUILT_SOURCES += wolfssl-test-data.stamp wolfssl-test-data.stamp: - $(AM_V_at)if test "$(abs_top_srcdir)" != "$(abs_top_builddir)"; then \ + $(AM_V_at)set -e; \ + if test "$(abs_top_srcdir)" != "$(abs_top_builddir)"; then \ $(MKDIR_P) tests scripts examples; \ for f in certs input quit; do \ rm -rf "$$f"; \