From c9d71d52f8919f77da6ffe32bd97f8b2d9e82c86 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Thu, 25 Jun 2026 09:35:13 +0000 Subject: [PATCH] parallel-make-check.py: add generic pool extensions for arbitrary commands Let any command ride the build/check pool, not just wolfSSL builds: build false skips configure/make/check (config is just prepare+run) netns true runs each command under 'bwrap --unshare-net --cap-add CAP_NET_ADMIN' (its own network namespace) so parallel network tests can't collide on ports and can configure that namespace shards fan a config out into N instances, each with $SHARD (1..N) and $SHARDS=N in its env and its own build-- dir, so a command can split its work N ways (the pool load-balances them) Error out, rather than silently degrade, on two misconfigurations that otherwise surface as confusing test failures: netns requested but bwrap missing (commands would share the host namespace and collide on ports), and config-name collisions after shard fan-out (two jobs would share a build dir and race). --- .github/scripts/parallel-make-check.py | 124 +++++++++++++++++++++---- 1 file changed, 107 insertions(+), 17 deletions(-) diff --git a/.github/scripts/parallel-make-check.py b/.github/scripts/parallel-make-check.py index 4e8febcf31..039b01648d 100755 --- a/.github/scripts/parallel-make-check.py +++ b/.github/scripts/parallel-make-check.py @@ -32,6 +32,20 @@ # checks, e.g. [["wolfcrypt/test/testwolfcrypt"]] # comment ignored; JSON has no comment syntax, so notes go here # +# The pool is not wolfSSL-specific; these keys let any command ride it: +# +# build false skips configure/make/check, so the config is just its +# prepare+run commands (default true). Use it to run an +# arbitrary command across the pool. +# netns true runs each command under "bwrap --unshare-net" (its own +# network namespace), so parallel network tests can't collide +# on ports (default false; needs bubblewrap). +# shards fan the config out into N instances run as separate jobs, +# each with $SHARD (1..N) and $SHARDS=N in its environment and +# its own build-- dir, so a command can split work +# N ways (default 1). The pool (--threads) still bounds how +# many run at once, so N>threads load-balances dynamically. +# # For example: # # [ @@ -71,7 +85,7 @@ import threading import time from collections.abc import Callable from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from pathlib import Path from typing import NoReturn @@ -94,9 +108,25 @@ class Config: # Whether "minutes" was given in the JSON (vs the 1.0 default); only an # explicit estimate is checked for >50% drift against the real time. minutes_provided: bool = False + # Generic-command extensions. Defaults keep a config behaving as a + # wolfSSL build. With build=false a config is just its prepare+run + # commands (no configure/make/check), so any command can ride the pool. + build: bool = True + # netns=true runs each command under "bwrap --unshare-net" so parallel + # network tests can't collide on ports (same isolation as the .test scripts). + netns: bool = False + # shards>1 fans the config out into that many instances, each run with + # $SHARD (1..N) and $SHARDS=N in its environment so the command can pick + # its slice of the work; each instance gets its own build-- dir. + shards: int = 1 + # Extra environment for the commands (set by the shard fan-out). + env: dict[str, str] = field(default_factory=dict) SRCDIR = Path(__file__).resolve().parents[2] ON_GITHUB = os.environ.get("GITHUB_ACTIONS") == "true" +# Used by configs with "netns": true to give each command its own network +# namespace (so parallel network tests can't collide on ports). +BWRAP = shutil.which("bwrap") print_lock = threading.Lock() # Fail-fast state: the first failure sets stop_event (under fail_lock, so @@ -162,7 +192,8 @@ def load_configs(opts: argparse.Namespace, error(f"{opts.json}: config entries must be objects: {entry!r}") unknown = set(entry) - {"name", "configure", "cc", "cflags", "ldflags", "minutes", "user_settings", - "check", "prepare", "run", "comment"} + "check", "prepare", "run", "comment", + "build", "netns", "shards"} if unknown: error(f"{opts.json}: unknown key(s) in {entry.get('name', entry)!r}: " f"{' '.join(sorted(unknown))}") @@ -198,6 +229,12 @@ def load_configs(opts: argparse.Namespace, check = entry.get("check", True) if not isinstance(check, bool): error(f"{opts.json}: \"check\" must be a boolean in {name!r}") + for key in ("build", "netns"): + if not isinstance(entry.get(key, False), bool): + error(f"{opts.json}: \"{key}\" must be a boolean in {name!r}") + shards = entry.get("shards", 1) + if isinstance(shards, bool) or not isinstance(shards, int) or shards < 1: + error(f"{opts.json}: \"shards\" must be an integer >= 1 in {name!r}") cc = entry.get("cc", opts.cc or "") if not isinstance(cc, str): error(f"{opts.json}: \"cc\" must be a string in {name!r}") @@ -215,7 +252,10 @@ def load_configs(opts: argparse.Namespace, float(minutes), user_settings, check, list(entry.get("prepare", [])), list(entry.get("run", [])), - minutes_provided="minutes" in entry)) + minutes_provided="minutes" in entry, + build=entry.get("build", True), + netns=entry.get("netns", False), + shards=shards)) if not configs: error(f"{opts.json}: no configs") return configs @@ -323,16 +363,23 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None, lambda: shutil.copy(SRCDIR / cfg.user_settings, bdir / "user_settings.h"))) steps += [(" ".join(cmd), cmd) for cmd in cfg.prepare] - steps += [("configure", configure), ("make", make)] - if cfg.check: - steps += [ - # Prebuild the check programs without running any tests so - # "make check" below is pure test execution. - ("make check TESTS=", make + ["check", "TESTS="]), - ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)), - ("make check", ["make"] + flags + ["check"]), - ] + if cfg.build: + steps += [("configure", configure), ("make", make)] + if cfg.check: + steps += [ + # Prebuild the check programs without running any tests so + # "make check" below is pure test execution. + ("make check TESTS=", make + ["check", "TESTS="]), + ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)), + ("make check", ["make"] + flags + ["check"]), + ] steps += [(" ".join(cmd), cmd) for cmd in cfg.run] + # With "netns", each command runs in its own network namespace; --chdir + # keeps the build dir as cwd inside the sandbox. CAP_NET_ADMIN lets the + # command configure that netns (bring interfaces up, add addresses). + netns = ([BWRAP, "--unshare-net", "--cap-add", "CAP_NET_ADMIN", + "--dev-bind", "/", "/", "--chdir", str(bdir)] + if cfg.netns and BWRAP else []) failed: str | None = None start = time.monotonic() log = bdir / "make-check.log" @@ -363,12 +410,14 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None, failed = record_failure(step) break continue + cmd = netns + cmd print(f"+ {' '.join(cmd)}", file=logf, flush=True) # stdin=DEVNULL so a test that reads stdin sees EOF (as in CI) # instead of blocking forever on an interactive/socket stdin. proc = subprocess.Popen(cmd, cwd=bdir, stdout=logf, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, + env={**os.environ, **cfg.env}, start_new_session=True) with procs_lock: live_procs.add(proc) @@ -438,14 +487,21 @@ def summarize(results: list[tuple[Config, str | None, float]], # (serial configure/link/test phases show up here). busy_min = sum(minutes for _, _, minutes in results) ncpu = nproc() + thread_min = wall_min * nthreads + cpu_avail = wall_min * ncpu + # Guard the ratios against a zero wall time (e.g. every job a no-op, which + # can happen when there are more shards than work) so the line never + # divides by zero. + occupancy = 100 * busy_min / thread_min if thread_min else 0 + cpu_util = 100 * cpu_min / cpu_avail if cpu_avail else 0 lines += [ "", f"{len(results)} configs in {wall_min:.1f} min on {nthreads} " f"threads / {ncpu} CPUs: " - f"thread occupancy {100 * busy_min / (wall_min * nthreads):.0f}% " - f"({busy_min:.1f} of {wall_min * nthreads:.1f} thread-min), " - f"CPU utilization {100 * cpu_min / (wall_min * ncpu):.0f}% " - f"({cpu_min:.1f} of {wall_min * ncpu:.1f} CPU-min)", + f"thread occupancy {occupancy:.0f}% " + f"({busy_min:.1f} of {thread_min:.1f} thread-min), " + f"CPU utilization {cpu_util:.0f}% " + f"({cpu_min:.1f} of {cpu_avail:.1f} CPU-min)", ] table = "\n".join(lines) print(table) @@ -455,6 +511,18 @@ def summarize(results: list[tuple[Config, str | None, float]], print(f"### make check\n\n{table}", file=f) +def shard_instances(cfg: Config) -> list[Config]: + # A config that asks for shards>1 becomes that many independent jobs: each + # gets its index as $SHARD (1..N) / $SHARDS=N and its own build-- + # dir, so its command can run one slice of the work. A config with the + # default shards=1 is left as a single unchanged job. + if cfg.shards <= 1: + return [cfg] + return [replace(cfg, name=f"{cfg.name}-{k}", shards=1, + env={**cfg.env, "SHARD": str(k), "SHARDS": str(cfg.shards)}) + for k in range(1, cfg.shards + 1)] + + def main() -> int: p = argparse.ArgumentParser( description="Build and make check every configuration from a JSON " @@ -537,6 +605,28 @@ def main() -> int: loads[i] += cfg.minutes selected = shards[k - 1] + # Replace each config with its shard instances (a no-op for shards=1), + # then re-sort so the pool still takes the longest jobs first. Done after + # --shard so a CI-level split and in-job fan-out compose. + expanded = [] + for cfg in selected: + expanded.extend(shard_instances(cfg)) + expanded.sort(key=lambda cfg: -cfg.minutes) + selected = expanded + + # A fanned-out name (-) could collide with another config's name, + # which would make two jobs share a build- dir and race. Catch it, + # like the duplicate-name check in load_configs. + names = [cfg.name for cfg in selected] + dups = sorted({n for n in names if names.count(n) > 1}) + if dups: + p.error(f"config names collide after shard fan-out: {' '.join(dups)}") + + if any(cfg.netns for cfg in selected) and not BWRAP: + p.error("netns requested but bwrap not found; install bubblewrap " + "(without it the commands share the host network namespace " + "and collide on ports)") + if opts.list: for cfg in selected: print(f"{cfg.name} [{cfg.minutes:g} min]: " @@ -546,7 +636,7 @@ def main() -> int: print(f"shard {opts.shard}: no configs to run") return 0 - if not (SRCDIR / "configure").exists(): + if any(cfg.build for cfg in selected) and not (SRCDIR / "configure").exists(): subprocess.run(["./autogen.sh"], cwd=SRCDIR, check=True) nthreads = max(1, min(opts.threads, len(selected)))