Merge pull request #10771 from julek-wolfssl/socat-parallel-shards

socat CI: run the test suite as parallel shards via parallel-make-check.py
2026-07-05 11:50:52 +02:00 · 2026-06-25 08:00:59 -07:00
parent ac01707f55 044a477378
commit 10444189d4
2 changed files with 164 additions and 24 deletions
@@ -32,6 +32,20 @@
 #              checks, e.g. [["wolfcrypt/test/testwolfcrypt"]]
 #   comment    ignored; JSON has no comment syntax, so notes go here
 #
+# The pool is not wolfSSL-specific; these keys let any command ride it:
+#
+#   build      false skips configure/make/check, so the config is just its
+#              prepare+run commands (default true). Use it to run an
+#              arbitrary command across the pool.
+#   netns      true runs each command under "bwrap --unshare-net" (its own
+#              network namespace), so parallel network tests can't collide
+#              on ports (default false; needs bubblewrap).
+#   shards     fan the config out into N instances run as separate jobs,
+#              each with $SHARD (1..N) and $SHARDS=N in its environment and
+#              its own build-<name>-<k> dir, so a command can split work
+#              N ways (default 1). The pool (--threads) still bounds how
+#              many run at once, so N>threads load-balances dynamically.
+#
 # For example:
 #
 #   [
@@ -71,7 +85,7 @@ import threading
 import time
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import NoReturn

@@ -94,9 +108,25 @@ class Config:
    # Whether "minutes" was given in the JSON (vs the 1.0 default); only an
    # explicit estimate is checked for >50% drift against the real time.
    minutes_provided: bool = False
+    # Generic-command extensions. Defaults keep a config behaving as a
+    # wolfSSL build. With build=false a config is just its prepare+run
+    # commands (no configure/make/check), so any command can ride the pool.
+    build: bool = True
+    # netns=true runs each command under "bwrap --unshare-net" so parallel
+    # network tests can't collide on ports (same isolation as the .test scripts).
+    netns: bool = False
+    # shards>1 fans the config out into that many instances, each run with
+    # $SHARD (1..N) and $SHARDS=N in its environment so the command can pick
+    # its slice of the work; each instance gets its own build-<name>-<k> dir.
+    shards: int = 1
+    # Extra environment for the commands (set by the shard fan-out).
+    env: dict[str, str] = field(default_factory=dict)

 SRCDIR = Path(__file__).resolve().parents[2]
 ON_GITHUB = os.environ.get("GITHUB_ACTIONS") == "true"
+# Used by configs with "netns": true to give each command its own network
+# namespace (so parallel network tests can't collide on ports).
+BWRAP = shutil.which("bwrap")
 print_lock = threading.Lock()

 # Fail-fast state: the first failure sets stop_event (under fail_lock, so
@@ -162,7 +192,8 @@ def load_configs(opts: argparse.Namespace,
            error(f"{opts.json}: config entries must be objects: {entry!r}")
        unknown = set(entry) - {"name", "configure", "cc", "cflags",
                                "ldflags", "minutes", "user_settings",
-                                "check", "prepare", "run", "comment"}
+                                "check", "prepare", "run", "comment",
+                                "build", "netns", "shards"}
        if unknown:
            error(f"{opts.json}: unknown key(s) in {entry.get('name', entry)!r}: "
                  f"{' '.join(sorted(unknown))}")
@@ -198,6 +229,12 @@ def load_configs(opts: argparse.Namespace,
        check = entry.get("check", True)
        if not isinstance(check, bool):
            error(f"{opts.json}: \"check\" must be a boolean in {name!r}")
+        for key in ("build", "netns"):
+            if not isinstance(entry.get(key, False), bool):
+                error(f"{opts.json}: \"{key}\" must be a boolean in {name!r}")
+        shards = entry.get("shards", 1)
+        if isinstance(shards, bool) or not isinstance(shards, int) or shards < 1:
+            error(f"{opts.json}: \"shards\" must be an integer >= 1 in {name!r}")
        cc = entry.get("cc", opts.cc or "")
        if not isinstance(cc, str):
            error(f"{opts.json}: \"cc\" must be a string in {name!r}")
@@ -215,7 +252,10 @@ def load_configs(opts: argparse.Namespace,
                              float(minutes), user_settings, check,
                              list(entry.get("prepare", [])),
                              list(entry.get("run", [])),
-                              minutes_provided="minutes" in entry))
+                              minutes_provided="minutes" in entry,
+                              build=entry.get("build", True),
+                              netns=entry.get("netns", False),
+                              shards=shards))
    if not configs:
        error(f"{opts.json}: no configs")
    return configs
@@ -323,16 +363,23 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
                      lambda: shutil.copy(SRCDIR / cfg.user_settings,
                                          bdir / "user_settings.h")))
    steps += [(" ".join(cmd), cmd) for cmd in cfg.prepare]
-    steps += [("configure", configure), ("make", make)]
-    if cfg.check:
-        steps += [
-            # Prebuild the check programs without running any tests so
-            # "make check" below is pure test execution.
-            ("make check TESTS=", make + ["check", "TESTS="]),
-            ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
-            ("make check", ["make"] + flags + ["check"]),
-        ]
+    if cfg.build:
+        steps += [("configure", configure), ("make", make)]
+        if cfg.check:
+            steps += [
+                # Prebuild the check programs without running any tests so
+                # "make check" below is pure test execution.
+                ("make check TESTS=", make + ["check", "TESTS="]),
+                ("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
+                ("make check", ["make"] + flags + ["check"]),
+            ]
    steps += [(" ".join(cmd), cmd) for cmd in cfg.run]
+    # With "netns", each command runs in its own network namespace; --chdir
+    # keeps the build dir as cwd inside the sandbox. CAP_NET_ADMIN lets the
+    # command configure that netns (bring interfaces up, add addresses).
+    netns = ([BWRAP, "--unshare-net", "--cap-add", "CAP_NET_ADMIN",
+              "--dev-bind", "/", "/", "--chdir", str(bdir)]
+             if cfg.netns and BWRAP else [])
    failed: str | None = None
    start = time.monotonic()
    log = bdir / "make-check.log"
@@ -363,12 +410,14 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
                    failed = record_failure(step)
                    break
                continue
+            cmd = netns + cmd
            print(f"+ {' '.join(cmd)}", file=logf, flush=True)
            # stdin=DEVNULL so a test that reads stdin sees EOF (as in CI)
            # instead of blocking forever on an interactive/socket stdin.
            proc = subprocess.Popen(cmd, cwd=bdir, stdout=logf,
                                    stderr=subprocess.STDOUT,
                                    stdin=subprocess.DEVNULL,
+                                    env={**os.environ, **cfg.env},
                                    start_new_session=True)
            with procs_lock:
                live_procs.add(proc)
@@ -438,14 +487,21 @@ def summarize(results: list[tuple[Config, str | None, float]],
    # (serial configure/link/test phases show up here).
    busy_min = sum(minutes for _, _, minutes in results)
    ncpu = nproc()
+    thread_min = wall_min * nthreads
+    cpu_avail = wall_min * ncpu
+    # Guard the ratios against a zero wall time (e.g. every job a no-op, which
+    # can happen when there are more shards than work) so the line never
+    # divides by zero.
+    occupancy = 100 * busy_min / thread_min if thread_min else 0
+    cpu_util = 100 * cpu_min / cpu_avail if cpu_avail else 0
    lines += [
        "",
        f"{len(results)} configs in {wall_min:.1f} min on {nthreads} "
        f"threads / {ncpu} CPUs: "
-        f"thread occupancy {100 * busy_min / (wall_min * nthreads):.0f}% "
-        f"({busy_min:.1f} of {wall_min * nthreads:.1f} thread-min), "
-        f"CPU utilization {100 * cpu_min / (wall_min * ncpu):.0f}% "
-        f"({cpu_min:.1f} of {wall_min * ncpu:.1f} CPU-min)",
+        f"thread occupancy {occupancy:.0f}% "
+        f"({busy_min:.1f} of {thread_min:.1f} thread-min), "
+        f"CPU utilization {cpu_util:.0f}% "
+        f"({cpu_min:.1f} of {cpu_avail:.1f} CPU-min)",
    ]
    table = "\n".join(lines)
    print(table)
@@ -455,6 +511,18 @@ def summarize(results: list[tuple[Config, str | None, float]],
            print(f"### make check\n\n{table}", file=f)


+def shard_instances(cfg: Config) -> list[Config]:
+    # A config that asks for shards>1 becomes that many independent jobs: each
+    # gets its index as $SHARD (1..N) / $SHARDS=N and its own build-<name>-<k>
+    # dir, so its command can run one slice of the work. A config with the
+    # default shards=1 is left as a single unchanged job.
+    if cfg.shards <= 1:
+        return [cfg]
+    return [replace(cfg, name=f"{cfg.name}-{k}", shards=1,
+                    env={**cfg.env, "SHARD": str(k), "SHARDS": str(cfg.shards)})
+            for k in range(1, cfg.shards + 1)]
+
+
 def main() -> int:
    p = argparse.ArgumentParser(
        description="Build and make check every configuration from a JSON "
@@ -537,6 +605,36 @@ def main() -> int:
            loads[i] += cfg.minutes
        selected = shards[k - 1]

+    # Replace each config with its shard instances (a no-op for shards=1),
+    # then re-sort so the pool still takes the longest jobs first. Done after
+    # --shard so a CI-level split and in-job fan-out compose.
+    expanded = []
+    for cfg in selected:
+        expanded.extend(shard_instances(cfg))
+    expanded.sort(key=lambda cfg: -cfg.minutes)
+    selected = expanded
+
+    # A fanned-out name (<name>-<k>) could collide with another config's name,
+    # which would make two jobs share a build-<name> dir and race. Catch it,
+    # like the duplicate-name check in load_configs.
+    names = [cfg.name for cfg in selected]
+    dups = sorted({n for n in names if names.count(n) > 1})
+    if dups:
+        p.error(f"config names collide after shard fan-out: {' '.join(dups)}")
+
+    # netns needs bwrap; without it commands silently share the host network
+    # namespace and parallel network tests collide on ports. On CI that silent
+    # degradation is a misconfiguration, so fail loudly; locally just warn and
+    # let the run fall back to the shared namespace. --list needs neither bwrap
+    # nor a netns, so never block it.
+    if not opts.list and any(cfg.netns for cfg in selected) and not BWRAP:
+        msg = ("netns requested but bwrap not found; install bubblewrap "
+               "(without it commands share the host network namespace and "
+               "collide on ports)")
+        if ON_GITHUB:
+            p.error(msg)
+        warn(f"{msg}; falling back to the shared namespace")
+
    if opts.list:
        for cfg in selected:
            print(f"{cfg.name} [{cfg.minutes:g} min]: "
@@ -546,7 +644,7 @@ def main() -> int:
        print(f"shard {opts.shard}: no configs to run")
        return 0

-    if not (SRCDIR / "configure").exists():
+    if any(cfg.build for cfg in selected) and not (SRCDIR / "configure").exists():
        subprocess.run(["./autogen.sh"], cwd=SRCDIR, check=True)

    nthreads = max(1, min(opts.threads, len(selected)))
@@ -39,10 +39,11 @@ jobs:


  socat_check:
+    name: socat ${{ matrix.socat_version }}
    if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }}
    runs-on: ubuntu-24.04
-    # This should be a safe limit for the tests to run.
-    timeout-minutes: 30
+    # This should be a safe limit for the parallel tests to run.
+    timeout-minutes: 15
    needs: build_wolfssl
    strategy:
      fail-fast: false
@@ -56,13 +57,15 @@ jobs:
      - name: Checkout wolfSSL CI actions
        uses: actions/checkout@v5
        with:
-          sparse-checkout: .github/actions
+          sparse-checkout: |
+            .github/actions
+            .github/scripts
          fetch-depth: 1

      - name: Install prereqs
        uses: ./.github/actions/install-apt-deps
        with:
-          packages: build-essential autoconf libtool pkg-config clang libc++-dev
+          packages: build-essential autoconf libtool pkg-config clang libc++-dev bubblewrap
          ghcr-debs-tag: ubuntu-24.04-full

      - name: Download lib
@@ -91,9 +94,48 @@ jobs:
          ./configure --with-wolfssl=$GITHUB_WORKSPACE/build-dir --enable-default-ipv=4
          make -j

+      # Ubuntu 24.04 can restrict unprivileged user namespaces via AppArmor,
+      # which leaves CAP_NET_ADMIN ineffective inside bwrap's netns; the shards
+      # need it to re-create IPv6 loopback there. Relax the restriction.
+      - name: Allow unprivileged user namespaces (for bwrap)
+        run: sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true
+
      - name: Run socat tests
-        working-directory: ./socat-${{ matrix.socat_version }}
+        env:
+          SOCAT_SRC: ${{ github.workspace }}/socat-${{ matrix.socat_version }}
+          EXPECT_FAIL: ${{ matrix.expect_fail }}
        run: |
          export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build-dir/lib:$LD_LIBRARY_PATH
-          export SHELL=/bin/bash
-          SOCAT=$GITHUB_WORKSPACE/socat-${{ matrix.socat_version }}/socat ./test.sh -t 1.0 --expect-fail ${{ matrix.expect_fail }}
+          # The socat suite is sleep-bound, so run it as parallel shards via the
+          # shared parallel runner. The work is almost all waiting (timeouts and
+          # sleeps; only ~16% CPU even when packed), so oversubscribe: ~6 shards
+          # per CPU below, run 2 per CPU at once (--threads), so several overlap
+          # their waits (bigger runners get proportionally more). Each shard runs
+          # a round-robin slice of the tests ($SHARD/$SHARDS) in its own bwrap
+          # network namespace (no port collisions) and its own build-dir copy.
+          # ${tests:-0} keeps a shard that drew no test numbers a no-op (test 0
+          # matches nothing) instead of letting test.sh fall back to running the
+          # whole suite.
+          #
+          # bwrap --unshare-net gives each shard a fresh netns with loopback up
+          # but IPv4-only; re-create IPv6 loopback (CAP_NET_ADMIN is granted by
+          # the runner) so the suite's ::1 / dual-stack tests work as in the host
+          # namespace. fc00::1 and 192.0.2.1 are non-loopback placeholders so
+          # glibc's AI_ADDRCONFIG still returns IPv6/IPv4: with only loopback
+          # configured it drops the family, and socat's getaddrinfo then fails on
+          # numeric non-loopback addresses (e.g. the multicast tests). Best-effort
+          # (|| true), errors left visible so a runner without IPv6 still runs the
+          # IPv4 tests and any failure stays diagnosable in the log.
+          cat > socat-configs.json <<'EOF'
+          [{
+            "name": "socat", "build": false, "netns": true, "shards": __SHARDS__,
+            "run": [["bash", "-c", "set -e; ip link set lo up || true; sysctl -wq net.ipv6.conf.lo.disable_ipv6=0 || true; ip addr add ::1/128 dev lo || true; ip addr add fc00::1/128 dev lo || true; ip addr add 192.0.2.1/32 dev lo || true; sysctl -wq net.ipv6.bindv6only=0 || true; cp -a \"$SOCAT_SRC/.\" .; tests=$(seq \"$SHARD\" \"$SHARDS\" 999); SOCAT=\"$PWD/socat\" SHELL=/bin/bash ./test.sh -t 1.0 --expect-fail \"$EXPECT_FAIL\" ${tests:-0}"]]
+          }]
+          EOF
+          sed -i "s/__SHARDS__/$(( 6 * $(nproc) ))/" socat-configs.json
+          # Run 2 shards per CPU at once: the per-shard netns isolates ports, so
+          # the only real cost of overlap is CPU, and the suite barely uses any
+          # (mostly waiting), so this just overlaps the waits. fail-fast (the
+          # default) aborts the rest on the first failure.
+          .github/scripts/parallel-make-check.py \
+            --threads "$(( 2 * $(nproc) ))" socat-configs.json