mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2026-07-05 11:50:52 +02:00
Merge pull request #10771 from julek-wolfssl/socat-parallel-shards
socat CI: run the test suite as parallel shards via parallel-make-check.py
This commit is contained in:
@@ -32,6 +32,20 @@
|
||||
# checks, e.g. [["wolfcrypt/test/testwolfcrypt"]]
|
||||
# comment ignored; JSON has no comment syntax, so notes go here
|
||||
#
|
||||
# The pool is not wolfSSL-specific; these keys let any command ride it:
|
||||
#
|
||||
# build false skips configure/make/check, so the config is just its
|
||||
# prepare+run commands (default true). Use it to run an
|
||||
# arbitrary command across the pool.
|
||||
# netns true runs each command under "bwrap --unshare-net" (its own
|
||||
# network namespace), so parallel network tests can't collide
|
||||
# on ports (default false; needs bubblewrap).
|
||||
# shards fan the config out into N instances run as separate jobs,
|
||||
# each with $SHARD (1..N) and $SHARDS=N in its environment and
|
||||
# its own build-<name>-<k> dir, so a command can split work
|
||||
# N ways (default 1). The pool (--threads) still bounds how
|
||||
# many run at once, so N>threads load-balances dynamically.
|
||||
#
|
||||
# For example:
|
||||
#
|
||||
# [
|
||||
@@ -71,7 +85,7 @@ import threading
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass, field, replace
|
||||
from pathlib import Path
|
||||
from typing import NoReturn
|
||||
|
||||
@@ -94,9 +108,25 @@ class Config:
|
||||
# Whether "minutes" was given in the JSON (vs the 1.0 default); only an
|
||||
# explicit estimate is checked for >50% drift against the real time.
|
||||
minutes_provided: bool = False
|
||||
# Generic-command extensions. Defaults keep a config behaving as a
|
||||
# wolfSSL build. With build=false a config is just its prepare+run
|
||||
# commands (no configure/make/check), so any command can ride the pool.
|
||||
build: bool = True
|
||||
# netns=true runs each command under "bwrap --unshare-net" so parallel
|
||||
# network tests can't collide on ports (same isolation as the .test scripts).
|
||||
netns: bool = False
|
||||
# shards>1 fans the config out into that many instances, each run with
|
||||
# $SHARD (1..N) and $SHARDS=N in its environment so the command can pick
|
||||
# its slice of the work; each instance gets its own build-<name>-<k> dir.
|
||||
shards: int = 1
|
||||
# Extra environment for the commands (set by the shard fan-out).
|
||||
env: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
SRCDIR = Path(__file__).resolve().parents[2]
|
||||
ON_GITHUB = os.environ.get("GITHUB_ACTIONS") == "true"
|
||||
# Used by configs with "netns": true to give each command its own network
|
||||
# namespace (so parallel network tests can't collide on ports).
|
||||
BWRAP = shutil.which("bwrap")
|
||||
print_lock = threading.Lock()
|
||||
|
||||
# Fail-fast state: the first failure sets stop_event (under fail_lock, so
|
||||
@@ -162,7 +192,8 @@ def load_configs(opts: argparse.Namespace,
|
||||
error(f"{opts.json}: config entries must be objects: {entry!r}")
|
||||
unknown = set(entry) - {"name", "configure", "cc", "cflags",
|
||||
"ldflags", "minutes", "user_settings",
|
||||
"check", "prepare", "run", "comment"}
|
||||
"check", "prepare", "run", "comment",
|
||||
"build", "netns", "shards"}
|
||||
if unknown:
|
||||
error(f"{opts.json}: unknown key(s) in {entry.get('name', entry)!r}: "
|
||||
f"{' '.join(sorted(unknown))}")
|
||||
@@ -198,6 +229,12 @@ def load_configs(opts: argparse.Namespace,
|
||||
check = entry.get("check", True)
|
||||
if not isinstance(check, bool):
|
||||
error(f"{opts.json}: \"check\" must be a boolean in {name!r}")
|
||||
for key in ("build", "netns"):
|
||||
if not isinstance(entry.get(key, False), bool):
|
||||
error(f"{opts.json}: \"{key}\" must be a boolean in {name!r}")
|
||||
shards = entry.get("shards", 1)
|
||||
if isinstance(shards, bool) or not isinstance(shards, int) or shards < 1:
|
||||
error(f"{opts.json}: \"shards\" must be an integer >= 1 in {name!r}")
|
||||
cc = entry.get("cc", opts.cc or "")
|
||||
if not isinstance(cc, str):
|
||||
error(f"{opts.json}: \"cc\" must be a string in {name!r}")
|
||||
@@ -215,7 +252,10 @@ def load_configs(opts: argparse.Namespace,
|
||||
float(minutes), user_settings, check,
|
||||
list(entry.get("prepare", [])),
|
||||
list(entry.get("run", [])),
|
||||
minutes_provided="minutes" in entry))
|
||||
minutes_provided="minutes" in entry,
|
||||
build=entry.get("build", True),
|
||||
netns=entry.get("netns", False),
|
||||
shards=shards))
|
||||
if not configs:
|
||||
error(f"{opts.json}: no configs")
|
||||
return configs
|
||||
@@ -323,16 +363,23 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
|
||||
lambda: shutil.copy(SRCDIR / cfg.user_settings,
|
||||
bdir / "user_settings.h")))
|
||||
steps += [(" ".join(cmd), cmd) for cmd in cfg.prepare]
|
||||
steps += [("configure", configure), ("make", make)]
|
||||
if cfg.check:
|
||||
steps += [
|
||||
# Prebuild the check programs without running any tests so
|
||||
# "make check" below is pure test execution.
|
||||
("make check TESTS=", make + ["check", "TESTS="]),
|
||||
("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
|
||||
("make check", ["make"] + flags + ["check"]),
|
||||
]
|
||||
if cfg.build:
|
||||
steps += [("configure", configure), ("make", make)]
|
||||
if cfg.check:
|
||||
steps += [
|
||||
# Prebuild the check programs without running any tests so
|
||||
# "make check" below is pure test execution.
|
||||
("make check TESTS=", make + ["check", "TESTS="]),
|
||||
("private dirs", lambda: privatize_dirs(bdir, opts.private_dir)),
|
||||
("make check", ["make"] + flags + ["check"]),
|
||||
]
|
||||
steps += [(" ".join(cmd), cmd) for cmd in cfg.run]
|
||||
# With "netns", each command runs in its own network namespace; --chdir
|
||||
# keeps the build dir as cwd inside the sandbox. CAP_NET_ADMIN lets the
|
||||
# command configure that netns (bring interfaces up, add addresses).
|
||||
netns = ([BWRAP, "--unshare-net", "--cap-add", "CAP_NET_ADMIN",
|
||||
"--dev-bind", "/", "/", "--chdir", str(bdir)]
|
||||
if cfg.netns and BWRAP else [])
|
||||
failed: str | None = None
|
||||
start = time.monotonic()
|
||||
log = bdir / "make-check.log"
|
||||
@@ -363,12 +410,14 @@ def run_config(cfg: Config, opts: argparse.Namespace) -> tuple[str | None,
|
||||
failed = record_failure(step)
|
||||
break
|
||||
continue
|
||||
cmd = netns + cmd
|
||||
print(f"+ {' '.join(cmd)}", file=logf, flush=True)
|
||||
# stdin=DEVNULL so a test that reads stdin sees EOF (as in CI)
|
||||
# instead of blocking forever on an interactive/socket stdin.
|
||||
proc = subprocess.Popen(cmd, cwd=bdir, stdout=logf,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdin=subprocess.DEVNULL,
|
||||
env={**os.environ, **cfg.env},
|
||||
start_new_session=True)
|
||||
with procs_lock:
|
||||
live_procs.add(proc)
|
||||
@@ -438,14 +487,21 @@ def summarize(results: list[tuple[Config, str | None, float]],
|
||||
# (serial configure/link/test phases show up here).
|
||||
busy_min = sum(minutes for _, _, minutes in results)
|
||||
ncpu = nproc()
|
||||
thread_min = wall_min * nthreads
|
||||
cpu_avail = wall_min * ncpu
|
||||
# Guard the ratios against a zero wall time (e.g. every job a no-op, which
|
||||
# can happen when there are more shards than work) so the line never
|
||||
# divides by zero.
|
||||
occupancy = 100 * busy_min / thread_min if thread_min else 0
|
||||
cpu_util = 100 * cpu_min / cpu_avail if cpu_avail else 0
|
||||
lines += [
|
||||
"",
|
||||
f"{len(results)} configs in {wall_min:.1f} min on {nthreads} "
|
||||
f"threads / {ncpu} CPUs: "
|
||||
f"thread occupancy {100 * busy_min / (wall_min * nthreads):.0f}% "
|
||||
f"({busy_min:.1f} of {wall_min * nthreads:.1f} thread-min), "
|
||||
f"CPU utilization {100 * cpu_min / (wall_min * ncpu):.0f}% "
|
||||
f"({cpu_min:.1f} of {wall_min * ncpu:.1f} CPU-min)",
|
||||
f"thread occupancy {occupancy:.0f}% "
|
||||
f"({busy_min:.1f} of {thread_min:.1f} thread-min), "
|
||||
f"CPU utilization {cpu_util:.0f}% "
|
||||
f"({cpu_min:.1f} of {cpu_avail:.1f} CPU-min)",
|
||||
]
|
||||
table = "\n".join(lines)
|
||||
print(table)
|
||||
@@ -455,6 +511,18 @@ def summarize(results: list[tuple[Config, str | None, float]],
|
||||
print(f"### make check\n\n{table}", file=f)
|
||||
|
||||
|
||||
def shard_instances(cfg: Config) -> list[Config]:
    # Fan one config out into its per-shard jobs.
    #
    # With shards <= 1 there is nothing to split: the very same Config
    # object is handed back as the only job, with its name and environment
    # untouched.
    #
    # Otherwise N = cfg.shards copies are produced. Copy k (1..N) is named
    # "<name>-<k>", carries SHARD=<k> and SHARDS=<N> (as strings) on top of
    # the config's existing env, and has shards reset to 1 so it is not
    # fanned out a second time.
    total = cfg.shards
    if total <= 1:
        return [cfg]

    jobs = []
    for index in range(1, total + 1):
        # Copy the env per instance so the shards never share one dict.
        shard_env = dict(cfg.env)
        shard_env["SHARD"] = str(index)
        shard_env["SHARDS"] = str(total)
        jobs.append(replace(cfg, name=f"{cfg.name}-{index}", shards=1,
                            env=shard_env))
    return jobs
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser(
|
||||
description="Build and make check every configuration from a JSON "
|
||||
@@ -537,6 +605,36 @@ def main() -> int:
|
||||
loads[i] += cfg.minutes
|
||||
selected = shards[k - 1]
|
||||
|
||||
# Replace each config with its shard instances (a no-op for shards=1),
|
||||
# then re-sort so the pool still takes the longest jobs first. Done after
|
||||
# --shard so a CI-level split and in-job fan-out compose.
|
||||
expanded = []
|
||||
for cfg in selected:
|
||||
expanded.extend(shard_instances(cfg))
|
||||
expanded.sort(key=lambda cfg: -cfg.minutes)
|
||||
selected = expanded
|
||||
|
||||
# A fanned-out name (<name>-<k>) could collide with another config's name,
|
||||
# which would make two jobs share a build-<name> dir and race. Catch it,
|
||||
# like the duplicate-name check in load_configs.
|
||||
names = [cfg.name for cfg in selected]
|
||||
dups = sorted({n for n in names if names.count(n) > 1})
|
||||
if dups:
|
||||
p.error(f"config names collide after shard fan-out: {' '.join(dups)}")
|
||||
|
||||
# netns needs bwrap; without it commands silently share the host network
|
||||
# namespace and parallel network tests collide on ports. On CI that silent
|
||||
# degradation is a misconfiguration, so fail loudly; locally just warn and
|
||||
# let the run fall back to the shared namespace. --list needs neither bwrap
|
||||
# nor a netns, so never block it.
|
||||
if not opts.list and any(cfg.netns for cfg in selected) and not BWRAP:
|
||||
msg = ("netns requested but bwrap not found; install bubblewrap "
|
||||
"(without it commands share the host network namespace and "
|
||||
"collide on ports)")
|
||||
if ON_GITHUB:
|
||||
p.error(msg)
|
||||
warn(f"{msg}; falling back to the shared namespace")
|
||||
|
||||
if opts.list:
|
||||
for cfg in selected:
|
||||
print(f"{cfg.name} [{cfg.minutes:g} min]: "
|
||||
@@ -546,7 +644,7 @@ def main() -> int:
|
||||
print(f"shard {opts.shard}: no configs to run")
|
||||
return 0
|
||||
|
||||
if not (SRCDIR / "configure").exists():
|
||||
if any(cfg.build for cfg in selected) and not (SRCDIR / "configure").exists():
|
||||
subprocess.run(["./autogen.sh"], cwd=SRCDIR, check=True)
|
||||
|
||||
nthreads = max(1, min(opts.threads, len(selected)))
|
||||
|
||||
@@ -39,10 +39,11 @@ jobs:
|
||||
|
||||
|
||||
socat_check:
|
||||
name: socat ${{ matrix.socat_version }}
|
||||
if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }}
|
||||
runs-on: ubuntu-24.04
|
||||
# This should be a safe limit for the tests to run.
|
||||
timeout-minutes: 30
|
||||
# This should be a safe limit for the parallel tests to run.
|
||||
timeout-minutes: 15
|
||||
needs: build_wolfssl
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -56,13 +57,15 @@ jobs:
|
||||
- name: Checkout wolfSSL CI actions
|
||||
uses: actions/checkout@v5
|
||||
with:
|
||||
sparse-checkout: .github/actions
|
||||
sparse-checkout: |
|
||||
.github/actions
|
||||
.github/scripts
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install prereqs
|
||||
uses: ./.github/actions/install-apt-deps
|
||||
with:
|
||||
packages: build-essential autoconf libtool pkg-config clang libc++-dev
|
||||
packages: build-essential autoconf libtool pkg-config clang libc++-dev bubblewrap
|
||||
ghcr-debs-tag: ubuntu-24.04-full
|
||||
|
||||
- name: Download lib
|
||||
@@ -91,9 +94,48 @@ jobs:
|
||||
./configure --with-wolfssl=$GITHUB_WORKSPACE/build-dir --enable-default-ipv=4
|
||||
make -j
|
||||
|
||||
# Ubuntu 24.04 can restrict unprivileged user namespaces via AppArmor,
|
||||
# which leaves CAP_NET_ADMIN ineffective inside bwrap's netns; the shards
|
||||
# need it to re-create IPv6 loopback there. Relax the restriction.
|
||||
- name: Allow unprivileged user namespaces (for bwrap)
|
||||
run: sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true
|
||||
|
||||
- name: Run socat tests
|
||||
working-directory: ./socat-${{ matrix.socat_version }}
|
||||
env:
|
||||
SOCAT_SRC: ${{ github.workspace }}/socat-${{ matrix.socat_version }}
|
||||
EXPECT_FAIL: ${{ matrix.expect_fail }}
|
||||
run: |
|
||||
export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/build-dir/lib:$LD_LIBRARY_PATH
|
||||
export SHELL=/bin/bash
|
||||
SOCAT=$GITHUB_WORKSPACE/socat-${{ matrix.socat_version }}/socat ./test.sh -t 1.0 --expect-fail ${{ matrix.expect_fail }}
|
||||
# The socat suite is sleep-bound, so run it as parallel shards via the
|
||||
# shared parallel runner. The work is almost all waiting (timeouts and
|
||||
# sleeps; only ~16% CPU even when packed), so oversubscribe: ~6 shards
|
||||
# per CPU below, run 2 per CPU at once (--threads), so several overlap
|
||||
# their waits (bigger runners get proportionally more). Each shard runs
|
||||
# a round-robin slice of the tests ($SHARD/$SHARDS) in its own bwrap
|
||||
# network namespace (no port collisions) and its own build-dir copy.
|
||||
# ${tests:-0} keeps a shard that drew no test numbers a no-op (test 0
|
||||
# matches nothing) instead of letting test.sh fall back to running the
|
||||
# whole suite.
|
||||
#
|
||||
# bwrap --unshare-net gives each shard a fresh netns with loopback up
|
||||
# but IPv4-only; re-create IPv6 loopback (CAP_NET_ADMIN is granted by
|
||||
# the runner) so the suite's ::1 / dual-stack tests work as in the host
|
||||
# namespace. fc00::1 and 192.0.2.1 are non-loopback placeholders so
|
||||
# glibc's AI_ADDRCONFIG still returns IPv6/IPv4: with only loopback
|
||||
# configured it drops the family, and socat's getaddrinfo then fails on
|
||||
# numeric non-loopback addresses (e.g. the multicast tests). Best-effort
|
||||
# (|| true), errors left visible so a runner without IPv6 still runs the
|
||||
# IPv4 tests and any failure stays diagnosable in the log.
|
||||
cat > socat-configs.json <<'EOF'
|
||||
[{
|
||||
"name": "socat", "build": false, "netns": true, "shards": __SHARDS__,
|
||||
"run": [["bash", "-c", "set -e; ip link set lo up || true; sysctl -wq net.ipv6.conf.lo.disable_ipv6=0 || true; ip addr add ::1/128 dev lo || true; ip addr add fc00::1/128 dev lo || true; ip addr add 192.0.2.1/32 dev lo || true; sysctl -wq net.ipv6.bindv6only=0 || true; cp -a \"$SOCAT_SRC/.\" .; tests=$(seq \"$SHARD\" \"$SHARDS\" 999); SOCAT=\"$PWD/socat\" SHELL=/bin/bash ./test.sh -t 1.0 --expect-fail \"$EXPECT_FAIL\" ${tests:-0}"]]
|
||||
}]
|
||||
EOF
|
||||
sed -i "s/__SHARDS__/$(( 6 * $(nproc) ))/" socat-configs.json
|
||||
# Run 2 shards per CPU at once: the per-shard netns isolates ports, so
|
||||
# the only real cost of overlap is CPU, and the suite barely uses any
|
||||
# (mostly waiting), so this just overlaps the waits. fail-fast (the
|
||||
# default) aborts the rest on the first failure.
|
||||
.github/scripts/parallel-make-check.py \
|
||||
--threads "$(( 2 * $(nproc) ))" socat-configs.json
|
||||
|
||||
Reference in New Issue
Block a user