From c9d71d52f8919f77da6ffe32bd97f8b2d9e82c86 Mon Sep 17 00:00:00 2001
From: Juliusz Sosinowicz <juliusz@wolfssl.com>
Date: Thu, 25 Jun 2026 09:35:13 +0000
Subject: [PATCH] parallel-make-check.py: add generic pool extensions for
 arbitrary commands

Let any command ride the build/check pool, not just wolfSSL builds:
  build  false skips configure/make/check (config is just prepare+run)
  netns  true runs each command under 'bwrap --unshare-net --cap-add
         CAP_NET_ADMIN' (its own network namespace) so parallel network
         tests can't collide on ports and can configure that namespace
  shards fan a config out into N instances, each with $SHARD (1..N) and
         $SHARDS=N in its env and its own build-<name>-<k> dir, so a
         command can split its work N ways (the pool load-balances them)

Error out, rather than silently degrade, on two misconfigurations that
otherwise surface as confusing test failures: netns requested but bwrap
missing (commands would share the host namespace and collide on ports),
and config-name collisions after shard fan-out (two jobs would share a
build dir and race).
---
 .github/scripts/parallel-make-check.py | 124 +++++++++++++++++++++----
 1 file changed, 107 insertions(+), 17 deletions(-)
diff --git a/.github/scripts/parallel-make-check.py b/.github/scripts/parallel-make-check.py
index 4e8febcf31..039b01648d 100755
--- a/.github/scripts/parallel-make-check.py
+++ b/.github/scripts/parallel-make-check.py
@@ -32,6 +32,20 @@
 #              checks, e.g. [["wolfcrypt/test/testwolfcrypt"]]
 #   comment    ignored; JSON has no comment syntax, so notes go here
 #
+# The pool is not wolfSSL-specific; these keys let any command ride it:
+#
+#   build      false skips configure/make/check, so the config is just its
+#              prepare+run commands (default true). Use it to run an
+#              arbitrary command across the pool.
+#   netns      true runs each command under "bwrap --unshare-net" (its own
+#              network namespace), so parallel network tests can't collide
+#              on ports (default false; needs bubblewrap).
+#   shards     fan the config out into N instances run as separate jobs,
+#              each with $SHARD (1..N) and $SHARDS=N in its environment and
+#              its own build-<name>-<k> dir, so a command can split work
+#              N ways (default 1). The pool (--threads) still bounds how
+#              many run at once, so N>threads load-balances dynamically.
+#
 # For example:
 #
 #   [
@@ -71,7 +85,7 @@ import threading
 import time
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import NoReturn
 
@@ -94,9 +108,25 @@ class Config:
     # Whether "minutes" was given in the JSON (vs the 1.0 default); only an
     # explicit estimate is checked for >50% drift against the real time.
     minutes_provided: bool = False
+    # Generic-command extensions. Defaults keep a config behaving as a
+    # wolfSSL build. With build=false a config is just its prepare+run
+    # commands (no configure/make/check), so any command can ride the pool.
+    build: bool = True
+    # netns=true runs each command under "bwrap --unshare-net" so parallel
+    # network tests can't collide on ports (same isolation as the .test scripts).
+    netns: bool = False
+    # shards>1 fans the config out into that many instances, each run with
+    # $SHARD (1..N) and $SHARDS=N in its environment so the command can pick
+    # its slice of the work; each instance gets its own build-<name>-<k> dir.
+    shards: int = 1
+    # Extra environment for the commands (set by the shard fan-out).
+    env: dict[str, str] = field(default_factory=dict)
 
 SRCDIR = Path(__file__).resolve().parents[2]
 ON_GITHUB = os.environ.get("GITHUB_ACTIONS") == "true"
+# Used by configs with "netns": true to give each command its own network
+# namespace (so parallel network tests can't collide on ports).
+BWRAP = shutil.which("bwrap")
 print_lock = threading.Lock()
 
 # Fail-fast state: the first failure sets stop_event (under fail_lock, so
@@ -162,7 +192,8 @@ def load_configs(opts: argparse.Namespace,
             error(f"{opts.json}: config entries must be objects: {entry!r}")
         unknown = set(entry) - {"name", "configure", "cc", "cflags",
                                 "ldflags", "minutes", "user_settings",
-                                "check", "prepare", "run", "comment"}
+                                "check", "prepare", "run", "comment",
+                                "build", "netns", "shards"}
         if unknown:
             error(f"{opts.json}: unknown key(s) in {entry.get('name', entry)!r}: "
                   f"{' '.join(sorted(unknown))}")
@@ -198,6 +229,12 @@ def load_configs(opts: argparse.Namespace,
         check = entry.get("check", True)
         if not isinstance(check, bool):
             error(f"{opts.json}: \"check\" must be a boolean in {name!r}")
+        for key in ("build", "netns"):
+            if not isinstance(entry.get(key, False), bool):
+                error(f"{opts.json}: \"{key}\" must be a boolean in {name!r}")
+        shards = entry.get("shards", 1)
+        if isinstance(shards, bool) or not isinstance(shards, int) or shards < 1:
+            error(f"{opts.json}: \"shards\" must be an integer >= 1 in {name!r}")
         cc = entry.get("cc", opts.cc or "")
         if not isinstance(cc, str):
             error(f"{opts.json}: \"cc\" must be a string in {name!r}")
@@ -215,7 +252,10 @@ def load_configs(opts: argparse.Namespace,
                               float(minutes), user_settings, check,
                               list(entry.get("prepare", [])),
                               list(entry.get("run", [])),
-                              minutes_provided="minutes" in entry))
+                              minutes_provided="minutes" in entry,
+                              build=entry.get("build", True),
+                              netns=entry.get("netns", False),
+                              shards=shards))
     if not configs:
         error(f"{opts.json}: no configs")
     return configs
@@ -323,16 +363,23 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
                       lambda: shutil.copy(SRCDIR / cfg.user_settings,
                                           bdir / "user_settings.h")))
     steps += [(" ".join(cmd), cmd) for cmd in cfg.prepare]
-    steps += [("configure", configure), ("make", make)]
-    if cfg.check:
-        steps += [
-            # Prebuild the check programs without running any tests so
-            # "make check" below is pure test execution.
-            ("make check TESTS=", make + ["check", "TESTS="]),
-            ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
-            ("make check", ["make"] + flags + ["check"]),
-        ]
+    if cfg.build:
+        steps += [("configure", configure), ("make", make)]
+        if cfg.check:
+            steps += [
+                # Prebuild the check programs without running any tests so
+                # "make check" below is pure test execution.
+                ("make check TESTS=", make + ["check", "TESTS="]),
+                ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
+                ("make check", ["make"] + flags + ["check"]),
+            ]
     steps += [(" ".join(cmd), cmd) for cmd in cfg.run]
+    # With "netns", each command runs in its own network namespace; --chdir
+    # keeps the build dir as cwd inside the sandbox. CAP_NET_ADMIN lets the
+    # command configure that netns (bring interfaces up, add addresses).
+    netns = ([BWRAP, "--unshare-net", "--cap-add", "CAP_NET_ADMIN",
+              "--dev-bind", "/", "/", "--chdir", str(bdir)]
+             if cfg.netns and BWRAP else [])
     failed: str | None = None
     start = time.monotonic()
     log = bdir / "make-check.log"
@@ -363,12 +410,14 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
                     failed = record_failure(step)
                     break
                 continue
+            cmd = netns + cmd
             print(f"+ {' '.join(cmd)}", file=logf, flush=True)
             # stdin=DEVNULL so a test that reads stdin sees EOF (as in CI)
             # instead of blocking forever on an interactive/socket stdin.
             proc = subprocess.Popen(cmd, cwd=bdir, stdout=logf,
                                     stderr=subprocess.STDOUT,
                                     stdin=subprocess.DEVNULL,
+                                    env={**os.environ, **cfg.env},
                                     start_new_session=True)
             with procs_lock:
                 live_procs.add(proc)
@@ -438,14 +487,21 @@ def summarize(results: list[tuple[Config, str | None, float]],
     # (serial configure/link/test phases show up here).
     busy_min = sum(minutes for _, _, minutes in results)
     ncpu = nproc()
+    thread_min = wall_min * nthreads
+    cpu_avail = wall_min * ncpu
+    # Guard the ratios against a zero wall time (e.g. every job a no-op, which
+    # can happen when there are more shards than work) so the line never
+    # divides by zero.
+    occupancy = 100 * busy_min / thread_min if thread_min else 0
+    cpu_util = 100 * cpu_min / cpu_avail if cpu_avail else 0
     lines += [
         "",
         f"{len(results)} configs in {wall_min:.1f} min on {nthreads} "
         f"threads / {ncpu} CPUs: "
-        f"thread occupancy {100 * busy_min / (wall_min * nthreads):.0f}% "
-        f"({busy_min:.1f} of {wall_min * nthreads:.1f} thread-min), "
-        f"CPU utilization {100 * cpu_min / (wall_min * ncpu):.0f}% "
-        f"({cpu_min:.1f} of {wall_min * ncpu:.1f} CPU-min)",
+        f"thread occupancy {occupancy:.0f}% "
+        f"({busy_min:.1f} of {thread_min:.1f} thread-min), "
+        f"CPU utilization {cpu_util:.0f}% "
+        f"({cpu_min:.1f} of {cpu_avail:.1f} CPU-min)",
     ]
     table = "\n".join(lines)
     print(table)
@@ -455,6 +511,18 @@ def summarize(results: list[tuple[Config, str | None, float]],
             print(f"### make check\n\n{table}", file=f)
 
 
+def shard_instances(cfg: Config) -> list[Config]:
+    # A config that asks for shards>1 becomes that many independent jobs: each
+    # gets its index as $SHARD (1..N) / $SHARDS=N and its own build-<name>-<k>
+    # dir, so its command can run one slice of the work. A config with the
+    # default shards=1 is left as a single unchanged job.
+    if cfg.shards <= 1:
+        return [cfg]
+    return [replace(cfg, name=f"{cfg.name}-{k}", shards=1,
+                    env={**cfg.env, "SHARD": str(k), "SHARDS": str(cfg.shards)})
+            for k in range(1, cfg.shards + 1)]
+
+
 def main() -> int:
     p = argparse.ArgumentParser(
         description="Build and make check every configuration from a JSON "
@@ -537,6 +605,28 @@ def main() -> int:
             loads[i] += cfg.minutes
         selected = shards[k - 1]
 
+    # Replace each config with its shard instances (a no-op for shards=1),
+    # then re-sort so the pool still takes the longest jobs first. Done after
+    # --shard so a CI-level split and in-job fan-out compose.
+    expanded = []
+    for cfg in selected:
+        expanded.extend(shard_instances(cfg))
+    expanded.sort(key=lambda cfg: -cfg.minutes)
+    selected = expanded
+
+    # A fanned-out name (<name>-<k>) could collide with another config's name,
+    # which would make two jobs share a build-<name> dir and race. Catch it,
+    # like the duplicate-name check in load_configs.
+    names = [cfg.name for cfg in selected]
+    dups = sorted({n for n in names if names.count(n) > 1})
+    if dups:
+        p.error(f"config names collide after shard fan-out: {' '.join(dups)}")
+
+    if any(cfg.netns for cfg in selected) and not BWRAP:
+        p.error("netns requested but bwrap not found; install bubblewrap "
+                "(without it the commands share the host network namespace "
+                "and collide on ports)")
+
     if opts.list:
         for cfg in selected:
             print(f"{cfg.name} [{cfg.minutes:g} min]: "
@@ -546,7 +636,7 @@ def main() -> int:
         print(f"shard {opts.shard}: no configs to run")
         return 0
 
-    if not (SRCDIR / "configure").exists():
+    if any(cfg.build for cfg in selected) and not (SRCDIR / "configure").exists():
         subprocess.run(["./autogen.sh"], cwd=SRCDIR, check=True)
 
     nthreads = max(1, min(opts.threads, len(selected)))