From 092d9407194d0e646ae40097c40cb7caaa9d51ce Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 19 Mar 2026 06:16:40 +0000 Subject: [PATCH 1/5] feat/nv ci test --- .ci/README.md | 171 ++++++++++++++++++++++++++++ .ci/build.py | 210 +++++++++++++++++++++++++++++++++++ .ci/config.yaml | 36 ++++++ .ci/images/ascend/Dockerfile | 31 ++++++ .ci/images/nvidia/Dockerfile | 26 +++++ .ci/run.py | 195 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- tests/test_add.py | 71 ++++++------ tests/test_rms_norm.py | 11 +- 9 files changed, 718 insertions(+), 35 deletions(-) create mode 100644 .ci/README.md create mode 100644 .ci/build.py create mode 100644 .ci/config.yaml create mode 100644 .ci/images/ascend/Dockerfile create mode 100644 .ci/images/nvidia/Dockerfile create mode 100644 .ci/run.py diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..59ee101 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,171 @@ +# .ci — CI 镜像与流水线 + +本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 + +## 目录结构 + +``` +.ci/ +├── config.yaml # 统一配置(registry、镜像、job 定义) +├── build.py # 镜像构建脚本 +├── run.py # CI 流水线执行脚本 +├── README.md +└── images/ + ├── nvidia/Dockerfile # NVIDIA 平台镜像 + └── ascend/Dockerfile # 昇腾平台镜像 +``` + +## 前置依赖 + +- Docker +- Python 3.10+ +- pyyaml (`pip install pyyaml`) + +## 配置文件 `config.yaml` + +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # Harbor 地址,本地开发时留空 + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable # stable | latest | 具体 commit hash + platform: nvidia + resources: + gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + setup: pip 
install .[dev] + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml +``` + +- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 +- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 +- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 +- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 + +## 镜像构建 `build.py` + +```bash +python .ci/build.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | +| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | +| `--push` | — | 构建后推送到 registry | +| `--force` | — | 跳过变更检测,强制构建 | +| `--dry-run` | — | 仅打印命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force + +# 构建全部平台并推送到 registry +python .ci/build.py --push --force + +# 预览实际执行的 docker 命令 +python .ci/build.py --platform nvidia --force --dry-run +``` + +### 构建流程 + +1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) +2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag +3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 +4. 
若指定 `--push`,将两个 tag 推送到 registry + +### 产物 + +| Tag | 说明 | +|---|---| +| `infiniops-ci/:` | 精确追溯到某次构建 | +| `infiniops-ci/:latest` | 最近一次构建 | + +## 流水线执行 `run.py` + +```bash +python .ci/run.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--job` | 配置中第一个 job | 要执行的 job 名称 | +| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | +| `--stage` | 全部 | 仅运行指定 stage | +| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | +| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--dry-run` | — | 仅打印 docker 命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 运行默认 job +python .ci/run.py + +# 指定分支和镜像版本 +python .ci/run.py --branch feature-xxx --image-tag latest + +# 只用 GPU 0 运行 +python .ci/run.py --gpu-id 0 + +# 用 GPU 0 和 2 运行 +python .ci/run.py --gpu-id 0,2 + +# 使用全部 GPU +python .ci/run.py --gpu-id all + +# 只跑 test stage +python .ci/run.py --stage test + +# 预览 docker 命令 +python .ci/run.py --dry-run +``` + +### 执行流程 + +1. 解析 job 配置,拉取对应镜像 +2. `docker run` 启动容器(自动挂载 GPU、限制内存) +3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 +4. 依次执行各 stage,汇总结果 + +## 代理配置 + +如果网络环境需要代理,在宿主机设置环境变量后即可: + +```bash +export http_proxy=http://localhost:9991 +export https_proxy=http://localhost:9991 +``` + +- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 +- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 diff --git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..489ebf0 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) + sys.exit(1) + + return result.stdout.strip() + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + return bool(result.stdout.strip()) + + +def build_image_tag(registry_url, project, platform, tag): + if registry_url: + return f"{registry_url}/{project}/{platform}:{tag}" + + return f"{project}-ci/{platform}:{tag}" + + +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): + """Build a single platform image. 
Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: + sdk_url = private_sdk.get("source", "") + if sdk_url.startswith("${") and sdk_url.endswith("}"): + env_var = sdk_url[2:-1] + sdk_url = os.environ.get(env_var, "") + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {' '.join(build_cmd)}") + if push: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + 
return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + ) + if not ok: + failed = True + + if failed: + 
sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..fea3f7c --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,36 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # TODO: Harbor not ready yet + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: # TODO: Ascend image is not ready yet + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable + platform: nvidia + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + + setup: pip install .[dev] + + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..87f7c91 --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,31 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..d89ea91 --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,26 @@ +ARG BASE_IMAGE +FROM 
${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..0421a56 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +export https_proxy=http://localhost:9991 +set -e +cd /workspace +git clone "$REPO_URL" repo +cd repo +git checkout "$BRANCH" +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + eval "$cmd" || failed=1 +done +echo "========== Summary ==========" +exit $failed +""" + + +def build_docker_args( + config, job_name, repo_url, branch, stages, workdir, image_tag_override, + 
gpu_id_override=None, +): + job = config["jobs"][job_name] + platform = job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "stable") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_cmd = job.get("setup", "pip install .[dev]") + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + ] + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s['run']}") + + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) + gpu_count = resources.get("gpu_count", 0) + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + + memory = resources.get("memory") + if memory: + mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + args.extend(["--memory", mem]) + + timeout_sec = resources.get("timeout") + if timeout_sec: + args.extend(["--stop-timeout", str(timeout_sec)]) + + args.append(image) + args.append("bash") + args.append("-c") + args.append(build_runner_script().strip()) + + return args + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument("--branch", type=str, help="Override repo branch") + parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) 
+ parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "dev-infra") + + jobs = config.get("jobs", {}) + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + workdir = "/workspace" + docker_args = build_docker_args( + config, job_name, repo_url, branch, stages, workdir, args.image_tag, + gpu_id_override=args.gpu_id, + ) + + if args.dry_run: + print(" ".join(docker_args)) + + return + + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 765b90a..3dbc186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] install-dir = "infini" diff --git a/tests/test_add.py b/tests/test_add.py index 1c98d91..61d6715 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -4,15 +4,39 @@ from tests.utils import Payload, 
empty_strided, randint_strided, randn_strided -_INT_DTYPES = ( - torch.int16, - torch.uint16, - torch.int32, - torch.uint32, - torch.int64, - torch.uint64, +_INT_DTYPES = tuple( + d + for d in ( + torch.int16, + torch.int32, + torch.int64, + ) + if d is not None ) +_UINT_DTYPES = tuple( + d + for d in ( + getattr(torch, "uint16", None), + getattr(torch, "uint32", None), + getattr(torch, "uint64", None), + ) + if d is not None +) + +def _dtype_parametrize(): + candidates = [ + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + (torch.int16, 0, 0), + (torch.int32, 0, 0), + (getattr(torch, "uint32", None), 0, 0), + (torch.int64, 0, 0), + (getattr(torch, "uint64", None), 0, 0), + ] + return tuple((d, r, a) for (d, r, a) in candidates if d is not None) + @pytest.mark.auto_act_and_assert @pytest.mark.parametrize( @@ -32,30 +56,11 @@ ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ), ) -@pytest.mark.parametrize( - ("dtype", "rtol", "atol"), - ( - (torch.float32, 1e-7, 1e-7), - (torch.float16, 1e-3, 1e-3), - (torch.bfloat16, 1e-2, 5e-3), - (torch.int16, 0, 0), - (torch.uint16, 0, 0), - (torch.int32, 0, 0), - (torch.uint32, 0, 0), - (torch.int64, 0, 0), - (torch.uint64, 0, 0), - ), -) -def test_add( - shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol -): - if dtype in _INT_DTYPES: - input = randint_strided( - 0, 100, shape, input_strides, dtype=dtype, device=device - ) - other = randint_strided( - 0, 100, shape, other_strides, dtype=dtype, device=device - ) +@pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize()) +def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol): + if dtype in _INT_DTYPES or dtype in _UINT_DTYPES: + input = randint_strided(0, 100, shape, input_strides, dtype=dtype, device=device) + other = randint_strided(0, 100, shape, other_strides, dtype=dtype, device=device) else: input = randn_strided(shape, 
input_strides, dtype=dtype, device=device) other = randn_strided(shape, other_strides, dtype=dtype, device=device) @@ -72,10 +77,10 @@ def _add(input, other, out): def _torch_add(input, other, out): - if input.dtype in (torch.uint16, torch.uint32, torch.uint64): + if input.dtype in _UINT_DTYPES: input = input.to(torch.int64) - if other.dtype in (torch.uint16, torch.uint32, torch.uint64): + if other.dtype in _UINT_DTYPES: other = other.to(torch.int64) res = torch.add(input, other) diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py index f447091..b0c9c5d 100644 --- a/tests/test_rms_norm.py +++ b/tests/test_rms_norm.py @@ -59,4 +59,13 @@ def _rms_norm(input, weight, *, eps=1e-6, out=None): def _torch_rms_norm(input, weight, *, eps=1e-6, out=None): - return torch.nn.functional.rms_norm(input, input.shape[-1:], weight=weight, eps=eps) + rms_norm_fn = getattr(torch.nn.functional, "rms_norm", None) + if rms_norm_fn is not None: + return rms_norm_fn(input, input.shape[-1:], weight=weight, eps=eps) + # Fallback for PyTorch < 2.3: RMS norm = (x / sqrt(mean(x^2) + eps)) * weight + rms = torch.sqrt(torch.mean(input * input, dim=-1, keepdim=True) + eps) + result = (input / rms) * weight + if out is not None: + out.copy_(result) + return out + return result From f15e113ff43b566059e35ed91bcc44dd29e85540 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 07:24:55 +0000 Subject: [PATCH 2/5] feat: ci sys for nv platform --- .ci/README.md | 155 +++++------------- .ci/build.py | 103 ++++++++++-- .ci/config.yaml | 17 +- .ci/images/ascend/Dockerfile | 8 + .ci/images/nvidia/Dockerfile | 5 + .ci/run.py | 117 ++++++++++++-- .ci/tests/__init__.py | 0 .ci/tests/conftest.py | 42 +++++ .ci/tests/test_build.py | 186 ++++++++++++++++++++++ .ci/tests/test_run.py | 298 +++++++++++++++++++++++++++++++++++ 10 files changed, 775 insertions(+), 156 deletions(-) create mode 100644 .ci/tests/__init__.py create mode 100644 .ci/tests/conftest.py create mode 100644 
.ci/tests/test_build.py create mode 100644 .ci/tests/test_run.py diff --git a/.ci/README.md b/.ci/README.md index 59ee101..0bd59bd 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,25 +1,18 @@ # .ci — CI 镜像与流水线 -本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 - -## 目录结构 - ``` .ci/ -├── config.yaml # 统一配置(registry、镜像、job 定义) -├── build.py # 镜像构建脚本 -├── run.py # CI 流水线执行脚本 -├── README.md +├── config.yaml # 统一配置(镜像、job 定义) +├── build.py # 镜像构建 +├── run.py # CI 流水线执行 └── images/ - ├── nvidia/Dockerfile # NVIDIA 平台镜像 - └── ascend/Dockerfile # 昇腾平台镜像 + ├── nvidia/Dockerfile + └── ascend/Dockerfile ``` -## 前置依赖 +**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` -- Docker -- Python 3.10+ -- pyyaml (`pip install pyyaml`) +--- ## 配置文件 `config.yaml` @@ -28,144 +21,72 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # Harbor 地址,本地开发时留空 - project: infiniops - credentials_env: REGISTRY_TOKEN - images: nvidia: dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source: "${PRIVATE_SDK_URL}" jobs: nvidia_gpu: - image: stable # stable | latest | 具体 commit hash + image: latest # latest | platform: nvidia resources: - gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" - gpu_type: A100 + gpu_ids: "0" # "0" | "0,2" | "all" memory: 32GB - timeout: 3600 + shm_size: 16g # 避免 PyTorch SHMEM 不足 + timeout: 3600 # 容器内脚本最大运行秒数 setup: pip install .[dev] + env: # 可选,注入容器环境变量 + MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml ``` -- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 -- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 -- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 -- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 
`"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 +--- ## 镜像构建 `build.py` -```bash -python .ci/build.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | -| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | -| `--push` | — | 构建后推送到 registry | -| `--force` | — | 跳过变更检测,强制构建 | -| `--dry-run` | — | 仅打印命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--force` | 跳过 Dockerfile 变更检测 | +| `--dry-run` | 打印命令不执行 | ```bash -# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +# 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia # 强制构建 python .ci/build.py --platform nvidia --force - -# 构建全部平台并推送到 registry -python .ci/build.py --push --force - -# 预览实际执行的 docker 命令 -python .ci/build.py --platform nvidia --force --dry-run ``` -### 构建流程 +构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 +代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 -1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) -2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag -3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 -4. 
若指定 `--push`,将两个 tag 推送到 registry +> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 -### 产物 - -| Tag | 说明 | -|---|---| -| `infiniops-ci/:` | 精确追溯到某次构建 | -| `infiniops-ci/:latest` | 最近一次构建 | +--- ## 流水线执行 `run.py` -```bash -python .ci/run.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--job` | 配置中第一个 job | 要执行的 job 名称 | -| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | -| `--stage` | 全部 | 仅运行指定 stage | -| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | -| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | -| `--dry-run` | — | 仅打印 docker 命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--branch` | 覆盖克隆分支 | +| `--stage` | 只运行指定 stage | +| `--image-tag` | 覆盖镜像 tag | +| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | +| `--dry-run` | 打印 docker 命令不执行 | ```bash # 运行默认 job -python .ci/run.py - -# 指定分支和镜像版本 -python .ci/run.py --branch feature-xxx --image-tag latest - -# 只用 GPU 0 运行 -python .ci/run.py --gpu-id 0 - -# 用 GPU 0 和 2 运行 -python .ci/run.py --gpu-id 0,2 - -# 使用全部 GPU -python .ci/run.py --gpu-id all - -# 只跑 test stage -python .ci/run.py --stage test +python .ci/run.py --branch feat/my-feature --results-dir ./ci-results -# 预览 docker 命令 -python .ci/run.py --dry-run -``` - -### 执行流程 - -1. 解析 job 配置,拉取对应镜像 -2. `docker run` 启动容器(自动挂载 GPU、限制内存) -3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 -4. 
依次执行各 stage,汇总结果 - -## 代理配置 - -如果网络环境需要代理,在宿主机设置环境变量后即可: - -```bash -export http_proxy=http://localhost:9991 -export https_proxy=http://localhost:9991 +# 只跑 test stage,预览命令 +python .ci/run.py --stage test --dry-run ``` -- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 -- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 +容器内执行流程:`git clone` → `checkout` → `setup` → stages。 +代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 diff --git a/.ci/build.py b/.ci/build.py index 489ebf0..2339319 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -4,6 +4,7 @@ import argparse import json import os +import shlex import subprocess import sys from pathlib import Path @@ -28,6 +29,7 @@ def get_git_commit(ref="HEAD"): capture_output=True, text=True, ) + if result.returncode != 0: print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) sys.exit(1) @@ -43,9 +45,61 @@ def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): text=True, ) + if result.returncode != 0: + print( + "warning: git diff failed (shallow clone or initial commit?);" + " assuming Dockerfile changed", + file=sys.stderr, + ) + return True + return bool(result.stdout.strip()) +def docker_login(registry_cfg, dry_run): + """Log in to the registry using `credentials_env` token. + + Returns True on success. + + NOTE: Registry support is currently unused (`config.yaml` has no registry + section). Retained for future integration with an external image management + system. 
+ """ + credentials_env = registry_cfg.get("credentials_env") + registry_url = registry_cfg.get("url", "") + + if not credentials_env or not registry_url: + return True + + token = os.environ.get(credentials_env) + + if not token: + print( + f"error: {credentials_env} not set, cannot login", + file=sys.stderr, + ) + return False + + if dry_run: + print( + f"[dry-run] echo | docker login {registry_url}" + " --username token --password-stdin" + ) + return True + + result = subprocess.run( + ["docker", "login", registry_url, "--username", "token", "--password-stdin"], + input=token, + text=True, + ) + + if result.returncode != 0: + print("error: docker login failed", file=sys.stderr) + return False + + return True + + def build_image_tag(registry_url, project, platform, tag): if registry_url: return f"{registry_url}/{project}/{platform}:{tag}" @@ -53,46 +107,53 @@ def build_image_tag(registry_url, project, platform, tag): return f"{project}-ci/{platform}:{tag}" -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): """Build a single platform image. 
Returns True on success.""" registry_url = registry_cfg.get("url", "") project = registry_cfg.get("project", "infiniops") dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) latest_tag = build_image_tag(registry_url, project, platform, "latest") build_args_cfg = platform_cfg.get("build_args", {}) build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): build_cmd.extend(["--build-arg", f"{key}={value}"]) - for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + if proxy_val: build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: - sdk_url = private_sdk.get("source", "") - if sdk_url.startswith("${") and sdk_url.endswith("}"): - env_var = sdk_url[2:-1] - sdk_url = os.environ.get(env_var, "") + source_env = private_sdk.get("source_env", "") + sdk_url = os.environ.get(source_env, "") if source_env else "" + if sdk_url: build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) if dry_run: - print(f"[dry-run] {' '.join(build_cmd)}") + print(f"[dry-run] {shlex.join(build_cmd)}") + if push: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") + if not logged_in: + print("[dry-run] (skipping push: docker login failed)") + else: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") return True print(f"==> building {platform}: {commit_tag}", file=sys.stderr) result = subprocess.run(build_cmd) + if result.returncode != 0: error = { "stage": "build", @@ 
-105,9 +166,14 @@ def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): return False if push: + if not logged_in: + print("error: docker login failed, cannot push", file=sys.stderr) + return False + for tag in (commit_tag, latest_tag): print(f"==> pushing {tag}", file=sys.stderr) push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: error = { "stage": "push", @@ -145,7 +211,7 @@ def main(): parser.add_argument( "--push", action="store_true", - help="Push images to registry after building", + help="Push images to registry after building (requires registry in config)", ) parser.add_argument( "--force", @@ -179,6 +245,7 @@ def main(): platforms = [args.platform] commit = get_git_commit(args.commit) + logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True failed = False for platform in platforms: @@ -187,7 +254,8 @@ def main(): if not Path(dockerfile_dir).is_dir(): print( - f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + f"warning: dockerfile directory `{dockerfile_dir}` does not exist," + f" skipping {platform}", file=sys.stderr, ) continue @@ -197,8 +265,15 @@ def main(): continue ok = build_image( - platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + platform, + platform_cfg, + registry_cfg, + commit, + args.push, + args.dry_run, + logged_in=logged_in, ) + if not ok: failed = True diff --git a/.ci/config.yaml b/.ci/config.yaml index fea3f7c..c80c47d 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,12 +2,7 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # TODO: Harbor not ready yet - project: infiniops - credentials_env: REGISTRY_TOKEN - -images: +images: nvidia: dockerfile: .ci/images/nvidia/ build_args: @@ -17,20 +12,22 @@ images: build_args: BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 private_sdk: - source: "${PRIVATE_SDK_URL}" + source_env: 
PRIVATE_SDK_URL jobs: nvidia_gpu: - image: stable + image: latest platform: nvidia resources: gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - gpu_type: A100 memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 timeout: 3600 setup: pip install .[dev] + # env: # 可选,注入容器环境变量 + # MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile index 87f7c91..66392eb 100644 --- a/.ci/images/ascend/Dockerfile +++ b/.ci/images/ascend/Dockerfile @@ -3,11 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ curl \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index d89ea91..74ccfd1 100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -3,14 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY ARG http_proxy ARG https_proxy +ARG no_proxy RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/run.py b/.ci/run.py index 0421a56..3f25afa 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -2,8 +2,11 @@ """Standalone Docker CI runner: clone repo, setup, run stages. 
Output to stdout.""" import argparse +import os +import shlex import subprocess import sys +from datetime import datetime from pathlib import Path try: @@ -20,8 +23,35 @@ def load_config(path): return yaml.safe_load(f) +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() + + +def build_results_dir(base, platform, stages, commit): + """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" + stage_names = "+".join(s["name"] for s in stages) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + dirname = f"{platform}_{stage_names}_{commit}_{timestamp}" + + return Path(base) / dirname + + def resolve_image(config, platform, image_tag): - """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + """Resolve an image reference to a full image name. + + Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config + contains a registry section, returns a registry-prefixed URL. Otherwise + returns a local tag (current default). 
+ """ registry = config.get("registry", {}) registry_url = registry.get("url", "") project = registry.get("project", "infiniops") @@ -34,9 +64,9 @@ def resolve_image(config, platform, image_tag): def build_runner_script(): return r""" -export https_proxy=http://localhost:9991 set -e cd /workspace +mkdir -p /workspace/results git clone "$REPO_URL" repo cd repo git checkout "$BRANCH" @@ -58,15 +88,27 @@ def build_runner_script(): def build_docker_args( - config, job_name, repo_url, branch, stages, workdir, image_tag_override, + config, + job_name, + repo_url, + branch, + stages, + workdir, + image_tag_override, gpu_id_override=None, + results_dir=None, ): job = config["jobs"][job_name] platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "stable") + image_tag = image_tag_override or job.get("image", "latest") image = resolve_image(config, platform, image_tag) resources = job.get("resources", {}) - setup_cmd = job.get("setup", "pip install .[dev]") + setup_raw = job.get("setup", "pip install .[dev]") + + if isinstance(setup_raw, list): + setup_cmd = "\n".join(setup_raw) + else: + setup_cmd = setup_raw args = [ "docker", @@ -86,6 +128,20 @@ def build_docker_args( "-e", f"NUM_STAGES={len(stages)}", ] + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + args.extend(["-e", f"{proxy_var}={proxy_val}"]) + args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) + + for key, value in job.get("env", {}).items(): + args.extend(["-e", f"{key}={value}"]) + + if results_dir: + args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) + for i, s in enumerate(stages): args.append("-e") args.append(f"STAGE_{i + 1}_NAME={s['name']}") @@ -94,6 +150,7 @@ def build_docker_args( gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) + if gpu_id: if gpu_id == "all": 
args.extend(["--gpus", "all"]) @@ -103,20 +160,28 @@ def build_docker_args( args.extend(["--gpus", f"count={gpu_count}"]) memory = resources.get("memory") + if memory: - mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + mem = str(memory).lower().replace("gb", "g").replace("mb", "m") + if not mem.endswith("g") and not mem.endswith("m"): mem = f"{mem}g" + args.extend(["--memory", mem]) + shm_size = resources.get("shm_size") + + if shm_size: + args.extend(["--shm-size", str(shm_size)]) + timeout_sec = resources.get("timeout") + args.append(image) + if timeout_sec: - args.extend(["--stop-timeout", str(timeout_sec)]) + # Requires coreutils `timeout` inside the container image. + args.extend(["timeout", str(timeout_sec)]) - args.append(image) - args.append("bash") - args.append("-c") - args.append(build_runner_script().strip()) + args.extend(["bash", "-c", build_runner_script().strip()]) return args @@ -146,6 +211,12 @@ def main(): type=str, help='GPU device IDs to use, e.g. "0", "0,2", "all"', ) + parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + help="Base directory for test results (default: ./ci-results)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -156,38 +227,54 @@ def main(): config = load_config(args.config) repo = config.get("repo", {}) repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "dev-infra") + branch = args.branch or repo.get("branch", "master") jobs = config.get("jobs", {}) + if not jobs: print("error: no jobs in config", file=sys.stderr) sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: print(f"error: job {job_name!r} not in config", file=sys.stderr) sys.exit(1) job = jobs[job_name] all_stages = job.get("stages", []) + if args.stage: stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: print(f"error: stage {args.stage!r} not found", file=sys.stderr) sys.exit(1) 
else: stages = all_stages + platform = job.get("platform", "nvidia") + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, platform, stages, commit) + workdir = "/workspace" docker_args = build_docker_args( - config, job_name, repo_url, branch, stages, workdir, args.image_tag, + config, + job_name, + repo_url, + branch, + stages, + workdir, + args.image_tag, gpu_id_override=args.gpu_id, + results_dir=results_dir, ) if args.dry_run: - print(" ".join(docker_args)) - + print(shlex.join(docker_args)) return + results_dir.mkdir(parents=True, exist_ok=True) sys.exit(subprocess.run(docker_args).returncode) diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py new file mode 100644 index 0000000..98079cd --- /dev/null +++ b/.ci/tests/conftest.py @@ -0,0 +1,42 @@ +import sys +from pathlib import Path + +# Allow `import run` and `import build` directly. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import pytest + + +@pytest.fixture +def minimal_config(): + return { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "images": { + "nvidia": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + }, + "jobs": { + "nvidia_gpu": { + "image": "latest", + "platform": "nvidia", + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "setup": "pip install .[dev]", + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], + } + }, + } diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py new file mode 100644 index 0000000..fa2f292 --- /dev/null +++ b/.ci/tests/test_build.py @@ -0,0 +1,186 @@ +import build + + +# --------------------------------------------------------------------------- +# build_image_tag +# 
--------------------------------------------------------------------------- + + +def test_build_image_tag_with_registry(): + tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") + assert tag == "localhost:5000/infiniops/nvidia:latest" + + +def test_build_image_tag_without_registry(): + tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") + assert tag == "infiniops-ci/nvidia:abc1234" + + +def test_build_image_tag_commit_hash(): + tag = build.build_image_tag( + "registry.example.com:5000", "proj", "ascend", "deadbeef" + ) + assert tag == "registry.example.com:5000/proj/ascend:deadbeef" + + +# --------------------------------------------------------------------------- +# has_dockerfile_changed +# --------------------------------------------------------------------------- + + +def test_has_dockerfile_changed_true_when_stdout_nonempty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout="Dockerfile\n"), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +def test_has_dockerfile_changed_false_when_stdout_empty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is False + + +def test_has_dockerfile_changed_true_on_git_error(mocker): + # Shallow clone or initial commit: `git diff` returns non-zero. 
+ mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=128, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +# --------------------------------------------------------------------------- +# docker_login +# --------------------------------------------------------------------------- + + +def test_docker_login_no_credentials_env(mocker): + run_mock = mocker.patch("subprocess.run") + result = build.docker_login({"url": "localhost:5000"}, dry_run=False) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_token_not_set(mocker, monkeypatch, capsys): + monkeypatch.delenv("REGISTRY_TOKEN", raising=False) + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is False + run_mock.assert_not_called() + + +def test_docker_login_dry_run_does_not_call_subprocess(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=True) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_success(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is True + run_mock.assert_called_once() + cmd = run_mock.call_args[0][0] + assert "docker" in cmd + assert "login" in cmd + + +# --------------------------------------------------------------------------- +# build_image — dry_run and proxy +# --------------------------------------------------------------------------- + + +def _platform_cfg(): + return { + "dockerfile": ".ci/images/nvidia/", + 
"build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + + +def _registry_cfg(): + return {"url": "localhost:5000", "project": "infiniops"} + + +def test_build_image_dry_run_no_subprocess(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + run_mock = mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + run_mock.assert_not_called() + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +def test_build_image_dry_run_output_contains_image_tag(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + captured = capsys.readouterr() + assert "abc1234" in captured.out + + +def test_build_image_proxy_in_build_args(mocker, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + called_cmd = run_mock.call_args[0][0] + joined = " ".join(called_cmd) + assert "HTTP_PROXY=http://proxy.test:3128" in joined + assert "http_proxy=http://proxy.test:3128" in joined + + +def test_build_image_returns_false_on_docker_error(mocker, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=1), + ) + result = build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + assert result is False diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py new file mode 100644 index 0000000..075546e --- /dev/null +++ 
b/.ci/tests/test_run.py @@ -0,0 +1,298 @@ +from pathlib import Path + +import pytest + +import run + + +# --------------------------------------------------------------------------- +# resolve_image +# --------------------------------------------------------------------------- + + +def test_resolve_image_with_registry(): + cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} + img = run.resolve_image(cfg, "nvidia", "latest") + assert img == "localhost:5000/infiniops/nvidia:latest" + + +def test_resolve_image_without_registry(minimal_config): + img = run.resolve_image(minimal_config, "nvidia", "abc1234") + assert img == "infiniops-ci/nvidia:abc1234" + + +# --------------------------------------------------------------------------- +# build_runner_script +# --------------------------------------------------------------------------- + + +def test_runner_script_contains_git_clone(): + script = run.build_runner_script() + assert "git clone" in script + + +def test_runner_script_contains_setup_cmd(): + script = run.build_runner_script() + assert "SETUP_CMD" in script + + +def test_runner_script_exits_on_failure(): + script = run.build_runner_script() + assert "exit $failed" in script + + +def test_runner_script_creates_results_dir(): + script = run.build_runner_script() + assert "mkdir -p /workspace/results" in script + + +# --------------------------------------------------------------------------- +# build_docker_args — basic structure +# --------------------------------------------------------------------------- + + +def test_docker_args_basic_structure(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert args[0] == "docker" + assert "run" in args + assert "--rm" in args + + +def test_docker_args_correct_image(minimal_config): + args = run.build_docker_args( + minimal_config, + 
"nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "infiniops-ci/nvidia:latest" in args + + +def test_docker_args_image_tag_override(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + "abc1234", + ) + assert "infiniops-ci/nvidia:abc1234" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — proxy passthrough +# --------------------------------------------------------------------------- + + +def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "-e" in args + assert "HTTP_PROXY=http://proxy.example.com:8080" in args + assert "http_proxy=http://proxy.example.com:8080" in args + + +def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("http_proxy", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.delenv("https_proxy", raising=False) + monkeypatch.delenv("NO_PROXY", raising=False) + monkeypatch.delenv("no_proxy", raising=False) + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + + for arg in args: + assert not arg.startswith("HTTP_PROXY=") + assert not arg.startswith("http_proxy=") + assert not arg.startswith("HTTPS_PROXY=") + assert not arg.startswith("https_proxy=") + assert not 
arg.startswith("NO_PROXY=") + assert not arg.startswith("no_proxy=") + + +def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "HTTP_PROXY=http://lowercase.proxy:3128" in args + assert "http_proxy=http://lowercase.proxy:3128" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — GPU flags +# --------------------------------------------------------------------------- + + +def _make_args(config, gpu_id_override=None): + return run.build_docker_args( + config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + gpu_id_override=gpu_id_override, + ) + + +def test_docker_args_gpu_device(minimal_config): + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert "device=0" in args[idx + 1] + + +def test_docker_args_gpu_all(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert args[idx + 1] == "all" + + +def test_docker_args_no_gpu(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "" + minimal_config["jobs"]["nvidia_gpu"]["resources"].pop("gpu_count", None) + args = _make_args(minimal_config) + assert "--gpus" not in args + + +def test_docker_args_gpu_override(minimal_config): + args = _make_args(minimal_config, gpu_id_override="2,3") + idx = args.index("--gpus") + assert "2,3" in args[idx + 1] + + +# --------------------------------------------------------------------------- +# build_docker_args — memory format +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("32GB", "32g"), + ("512MB", "512m"), + ("8", "8g"), + ("16gb", "16g"), + ("256mb", "256m"), + ], +) +def test_docker_args_memory_format(minimal_config, raw, expected): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw + args = _make_args(minimal_config) + idx = args.index("--memory") + assert args[idx + 1] == expected + + +# --------------------------------------------------------------------------- +# build_docker_args — stages encoding +# --------------------------------------------------------------------------- + + +def test_docker_args_num_stages(minimal_config): + args = _make_args(minimal_config) + assert "NUM_STAGES=1" in args + + +def test_docker_args_stage_name_cmd(minimal_config): + args = _make_args(minimal_config) + assert "STAGE_1_NAME=test" in args + assert any(a.startswith("STAGE_1_CMD=") for a in args) + + +def test_docker_args_multiple_stages(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ + {"name": "lint", "run": "ruff check ."}, + {"name": "test", "run": "pytest tests/"}, + ] + args = _make_args(minimal_config) + assert "NUM_STAGES=2" in args + assert "STAGE_1_NAME=lint" in args + assert "STAGE_2_NAME=test" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — results_dir mount +# --------------------------------------------------------------------------- + + +def test_docker_args_results_dir(minimal_config, tmp_path): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + results_dir=tmp_path, + ) + joined = " ".join(str(a) for a in args) + assert "-v" in args + assert "/workspace/results" in joined + + +# 
--------------------------------------------------------------------------- +# build_results_dir +# --------------------------------------------------------------------------- + + +def test_build_results_dir_contains_platform(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "nvidia" in d.name + + +def test_build_results_dir_contains_commit(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "abc1234" in d.name + + +def test_build_results_dir_contains_stage_names(): + stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "lint+test" in d.name + + +def test_build_results_dir_under_base(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") + assert d.parent == Path("/tmp/my-results") From 63dbafca7019902ab35f7b47a8ed72c68655c613 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 08:00:22 +0000 Subject: [PATCH 3/5] fix(ci): fix results dir permissions and reduce parallel workers - Pass host UID/GID into container and `chown` results after tests, so mounted `ci-results/` is accessible by the host user. - Limit `pytest-xdist` workers from `-n auto` to `-n 8` to prevent OOM worker crashes on high-core-count machines. 
Co-Authored-By: Claude Opus 4.6 --- .ci/config.yaml | 2 +- .ci/run.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.ci/config.yaml b/.ci/config.yaml index c80c47d..a86174a 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -30,4 +30,4 @@ jobs: stages: - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/run.py b/.ci/run.py index 3f25afa..0c8d648 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -83,6 +83,9 @@ def build_runner_script(): eval "$cmd" || failed=1 done echo "========== Summary ==========" +if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then + chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true +fi exit $failed """ @@ -127,6 +130,10 @@ def build_docker_args( f"SETUP_CMD={setup_cmd}", "-e", f"NUM_STAGES={len(stages)}", + "-e", + f"HOST_UID={os.getuid()}", + "-e", + f"HOST_GID={os.getgid()}", ] for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): From 497b2552b1077d03ee7c1a7b7e1c0e5832dcd7f6 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 03:27:06 +0000 Subject: [PATCH 4/5] refactor(ci): Refactor code structure for improved readability and maintainability --- .ci/README.md | 207 ++++++- .ci/agent.py | 971 ++++++++++++++++++++++++++++++++ .ci/build.py | 27 +- .ci/ci_resource.py | 241 ++++++++ .ci/config.yaml | 89 ++- .ci/github_status.py | 98 ++++ .ci/images/iluvatar/Dockerfile | 53 ++ .ci/images/nvidia/Dockerfile | 21 +- .ci/run.py | 56 +- .ci/tests/conftest.py | 44 +- .ci/tests/test_agent.py | 503 +++++++++++++++++ .ci/tests/test_github_status.py | 144 +++++ .ci/tests/test_resource.py | 324 +++++++++++ .ci/tests/test_utils.py | 90 +++ .ci/utils.py | 101 ++++ 15 files changed, 2833 insertions(+), 136 deletions(-) create mode 100644 .ci/agent.py create mode 100644 .ci/ci_resource.py create mode 100644 .ci/github_status.py create mode 100644 
.ci/images/iluvatar/Dockerfile create mode 100644 .ci/tests/test_agent.py create mode 100644 .ci/tests/test_github_status.py create mode 100644 .ci/tests/test_resource.py create mode 100644 .ci/tests/test_utils.py create mode 100644 .ci/utils.py diff --git a/.ci/README.md b/.ci/README.md index 0bd59bd..33841ca 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -2,11 +2,16 @@ ``` .ci/ -├── config.yaml # 统一配置(镜像、job 定义) +├── config.yaml # 统一配置(镜像、job、Agent 定义) +├── utils.py # 共享工具(load_config、get_git_commit) +├── agent.py # Runner Agent(调度、Webhook、远程触发) ├── build.py # 镜像构建 -├── run.py # CI 流水线执行 +├── run.py # CI 流水线执行(Docker 层) +├── ci_resource.py # GPU/内存资源检测与分配 +├── github_status.py # GitHub Commit Status 上报 └── images/ ├── nvidia/Dockerfile + ├── iluvatar/Dockerfile └── ascend/Dockerfile ``` @@ -16,41 +21,88 @@ ## 配置文件 `config.yaml` +配置以 **platform** 为顶级结构,每个平台包含镜像定义、平台级默认值和 job 列表。 +加载时自动展平为 `{platform}_{job}` 格式(如 `nvidia_gpu`)。 + ```yaml repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: +platforms: nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - -jobs: - nvidia_gpu: - image: latest # latest | - platform: nvidia - resources: - gpu_ids: "0" # "0" | "0,2" | "all" - memory: 32GB - shm_size: 16g # 避免 PyTorch SHMEM 不足 - timeout: 3600 # 容器内脚本最大运行秒数 + image: # 镜像定义 + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] # 平台级默认值,job 可覆盖 + jobs: + gpu: # 展平后为 nvidia_gpu + resources: + gpu_ids: "0" # "0" | "0,2" | "all" + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: # 平台级 docker 参数,所有 job 继承 + - 
"--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules setup: pip install .[dev] - env: # 可选,注入容器环境变量 - MY_VAR: value - stages: - - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + jobs: + gpu: # 展平后为 iluvatar_gpu + resources: + gpu_ids: "0" + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml ``` +### 配置层级说明 + +| 层级 | 字段 | 说明 | +|---|---|---| +| **平台级** | `image` | 镜像定义(dockerfile、build_args) | +| | `image_tag` | 默认镜像 tag(默认 `latest`) | +| | `docker_args` | 额外 docker run 参数(如 `--privileged`) | +| | `volumes` | 额外挂载卷 | +| | `setup` | 容器内 setup 命令 | +| | `env` | 注入容器环境变量 | +| **Job 级** | `resources.gpu_ids` | GPU 设备 ID | +| | `resources.gpu_style` | GPU 透传方式:`nvidia`(默认)或 `none` | +| | `resources.memory` | 容器内存限制 | +| | `resources.shm_size` | 共享内存大小 | +| | `resources.timeout` | 容器内脚本最大运行秒数 | +| | `stages` | 执行阶段列表 | +| | 以上平台级字段 | Job 可覆盖任意平台级默认值 | + --- ## 镜像构建 `build.py` | 参数 | 说明 | |---|---| -| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -58,8 +110,11 @@ jobs: # 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia -# 强制构建 -python .ci/build.py --platform nvidia --force +# 构建 Iluvatar 镜像 +python .ci/build.py --platform iluvatar --force + +# 强制构建全部 +python .ci/build.py --force ``` 构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 @@ -73,20 +128,116 @@ python .ci/build.py --platform nvidia --force | 参数 | 说明 | |---|---| +| `--job` | 指定 job 名称(默认第一个) | | `--branch` | 覆盖克隆分支 | | `--stage` | 只运行指定 stage | | `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--gpu-id` | 覆盖 GPU 设备 ID(仅 nvidia 
gpu_style) | | `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | | `--dry-run` | 打印 docker 命令不执行 | ```bash -# 运行默认 job -python .ci/run.py --branch feat/my-feature --results-dir ./ci-results +# 运行 NVIDIA job +python .ci/run.py --job nvidia_gpu --branch master + +# 运行 Iluvatar job +python .ci/run.py --job iluvatar_gpu --branch feat/ci-nvidia # 只跑 test stage,预览命令 -python .ci/run.py --stage test --dry-run +python .ci/run.py --job iluvatar_gpu --stage test --dry-run ``` 容器内执行流程:`git clone` → `checkout` → `setup` → stages。 代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 + +--- + +## 平台差异 + +| 平台 | GPU 透传方式 | 基础镜像 | 备注 | +|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | +| Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | +| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善 | + +--- + +## Runner Agent `agent.py` + +Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感知的动态调度,以及跨机器远程触发。 + +### CLI 手动执行 + +```bash +# 运行所有 job(本地 + 远程 Agent) +python .ci/agent.py run --branch master + +# 运行指定 job +python .ci/agent.py run --branch master --job nvidia_gpu + +# 按平台运行 +python .ci/agent.py run --branch master --platform nvidia + +# 预览命令 +python .ci/agent.py run --branch master --dry-run --no-status +``` + +| 参数 | 说明 | +|---|---| +| `--branch` | 测试分支(必填) | +| `--job` | 指定 job 名称 | +| `--platform` | 按平台过滤 job | +| `--commit` | 覆盖 commit SHA | +| `--image-tag` | 覆盖镜像 tag | +| `--results-dir` | 结果目录(默认 `ci-results`) | +| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | +| `--no-status` | 跳过 GitHub Status 上报 | +| `--dry-run` | 预览模式 | + +### Webhook 服务 + +每台平台机器部署一个 Agent 实例: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +| 端点 | 方法 | 说明 | +|---|---|---| +| `/webhook` | POST | GitHub Webhook(push/pull_request) | +| `/api/run` | POST | 远程触发 job | +| `/api/job/{id}` | GET 
| 查询 job 状态 | +| `/health` | GET | 健康检查 | +| `/status` | GET | 队列 + 资源状态 | + +Webhook 支持 `X-Hub-Signature-256` 签名验证,通过 `--webhook-secret` 或 `WEBHOOK_SECRET` 环境变量配置。 + +### 远程 Agent 配置 + +在 `config.yaml` 中配置各平台 Agent 地址,CLI 执行时自动将远程 job 分发到对应 Agent: + +```yaml +agents: + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 +``` + +### 资源调度 + +Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: +- GPU 利用率 < 阈值(默认 10%)且未被 Agent 分配 → 可用 +- 资源不足时 job 自动排队,已完成 job 释放资源后自动调度排队任务 + +### GitHub Status + +设置 `GITHUB_TOKEN` 环境变量后,Agent 会自动上报 commit status: +- `pending` — job 开始执行 +- `success` / `failure` — job 执行完成 + +Status context 格式:`ci/infiniops/{job_name}` diff --git a/.ci/agent.py b/.ci/agent.py new file mode 100644 index 0000000..3696ce2 --- /dev/null +++ b/.ci/agent.py @@ -0,0 +1,971 @@ +#!/usr/bin/env python3 +"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. + +Usage: + # Run jobs locally (or dispatch to remote agents) + python .ci/agent.py run --branch master + python .ci/agent.py run --branch master --job nvidia_gpu --dry-run + + # Start webhook server + python .ci/agent.py serve --platform nvidia --port 8080 +""" + +import argparse +import collections +import hashlib +import hmac +import json +import os +import shlex +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request +import uuid +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + +import ci_resource as res +import github_status as gh +import run + +# Maximum POST body size (1 MB) to prevent memory exhaustion +MAX_CONTENT_LENGTH = 1 * 1024 * 1024 + +# Job states +STATE_QUEUED = "queued" +STATE_RUNNING = "running" +STATE_PENDING = "pending" +STATE_SUCCESS = "success" +STATE_FAILURE = "failure" +STATE_ERROR = "error" + +# urllib helpers (module-level for easier mocking in tests) +urllib_request = urllib.request.Request +urllib_urlopen = urllib.request.urlopen + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +class JobRequest: + """Describes a CI job to be executed.""" + + def __init__(self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None): + self.job_id = str(uuid.uuid4())[:8] + self.job_name = job_name + self.branch = branch + self.commit_sha = commit_sha + self.config = config + self.image_tag = image_tag + self.results_dir = results_dir or Path("ci-results") + self.created_at = datetime.now().isoformat() + + job = config["jobs"][job_name] + self.platform = job.get("platform", "nvidia") + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "branch": self.branch, + "commit_sha": self.commit_sha, + "platform": self.platform, + "created_at": self.created_at, + } + + +class JobResult: + """Outcome of a completed job.""" + + def __init__(self, job_id, job_name, commit_sha, returncode, results_dir, duration): + self.job_id = job_id + self.job_name = job_name + self.commit_sha = commit_sha + self.returncode = returncode + self.results_dir = results_dir + self.duration = duration + + self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "commit_sha": self.commit_sha, + "state": self.state, + 
"returncode": self.returncode, + "results_dir": str(self.results_dir), + "duration_seconds": round(self.duration, 1), + } + + +# --------------------------------------------------------------------------- +# Job selection and routing +# --------------------------------------------------------------------------- + + +def select_jobs(config, platform=None, job_name=None): + """Return list of job names to run.""" + jobs = config.get("jobs", {}) + + if job_name: + if job_name not in jobs: + raise ValueError(f"job {job_name!r} not in config") + + return [job_name] + + if platform: + return [ + name for name, job in jobs.items() if job.get("platform") == platform + ] + + return list(jobs.keys()) + + +def route_jobs(config, job_names, local_platform=None): + """Split jobs into local and remote. + + Returns (local_jobs, remote_jobs) where remote_jobs is a list of + (job_name, agent_url) tuples. + """ + agents = config.get("agents", {}) + jobs = config.get("jobs", {}) + local = [] + remote = [] + + for name in job_names: + job = jobs.get(name, {}) + platform = job.get("platform", "") + + if not local_platform: + local.append(name) + elif platform == local_platform: + local.append(name) + elif platform in agents: + remote.append((name, agents[platform].get("url", ""))) + else: + local.append(name) + + return local, remote + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +class Scheduler: + """Resource-aware job scheduler with dynamic parallelism.""" + + def __init__( + self, + config, + platform, + resource_pool, + results_dir=None, + max_workers=4, + no_status=False, + dry_run=False, + ): + self._config = config + self._platform = platform + self._resource_pool = resource_pool + self._results_dir = results_dir or Path("ci-results") + self._no_status = no_status + self._dry_run = dry_run + self._queue = collections.deque() + self._jobs: dict[str, 
dict] = {} # job_id -> {request, result, state, gpu_ids} + self._executor = ThreadPoolExecutor(max_workers=max_workers) + self._lock = threading.Lock() + self._done_event = threading.Event() + + # GitHub config + github_cfg = config.get("github", {}) + self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") + repo = config.get("repo", {}) + repo_url = repo.get("url", "") + self._owner, self._repo = gh.parse_repo_url(repo_url) + + def submit(self, job_request): + """Add a job to the queue and attempt to schedule it. + + Returns the job_id. + """ + with self._lock: + self._jobs[job_request.job_id] = { + "request": job_request, + "result": None, + "state": STATE_QUEUED, + "gpu_ids": [], + } + self._queue.append(job_request) + + self._try_schedule() + return job_request.job_id + + def get_job(self, job_id): + """Get job info by ID.""" + with self._lock: + entry = self._jobs.get(job_id) + + if not entry: + return None + + info = entry["request"].to_dict() + info["state"] = entry["state"] + + if entry["result"]: + info.update(entry["result"].to_dict()) + + return info + + def get_status(self): + """Return scheduler status for the /status endpoint.""" + with self._lock: + queued = [ + self._jobs[r.job_id]["request"].to_dict() + for r in self._queue + ] + running = [] + completed = [] + + for entry in self._jobs.values(): + state = entry["state"] + + if state == STATE_RUNNING: + running.append({**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}) + elif state in (STATE_SUCCESS, STATE_FAILURE): + completed.append(entry["result"].to_dict()) + + return { + "queued": queued, + "running": running, + "completed": completed[-20:], # Last 20 + "resources": self._resource_pool.get_status(), + } + + def wait_all(self): + """Block until all submitted jobs are done. 
Returns list of JobResult.""" + while True: + with self._lock: + pending = any( + e["state"] in (STATE_QUEUED, STATE_RUNNING) for e in self._jobs.values() + ) + + if not pending: + break + + self._done_event.wait(timeout=2.0) + self._done_event.clear() + + with self._lock: + return [ + e["result"] + for e in self._jobs.values() + if e["result"] is not None + ] + + def _try_schedule(self): + """Try to run queued jobs that have enough resources. + + Resource allocation and job submission are split: allocation decisions + are made under the lock, but executor.submit() happens outside to + prevent deadlock when the thread pool is saturated. + """ + to_launch = [] # [(req, gpu_ids), ...] + + with self._lock: + remaining = collections.deque() + + while self._queue: + req = self._queue.popleft() + job_cfg = self._config["jobs"].get(req.job_name, {}) + gpu_count = res.parse_gpu_requirement(job_cfg) + memory_mb = res.parse_memory_requirement(job_cfg) + + if self._dry_run: + # In dry-run mode, skip resource checks + gpu_ids, ok = [], True + else: + gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) + + if ok: + self._jobs[req.job_id]["state"] = STATE_RUNNING + self._jobs[req.job_id]["gpu_ids"] = gpu_ids + to_launch.append((req, gpu_ids)) + else: + remaining.append(req) + + self._queue = remaining + + # Submit outside the lock to avoid deadlock with ThreadPoolExecutor + for req, gpu_ids in to_launch: + self._executor.submit(self._run_job, req, gpu_ids) + + def _run_job(self, req, gpu_ids): + """Execute a single job in a worker thread. + + Wrapped in try/finally to guarantee GPU resources are always released + and job state is updated even on unexpected exceptions. 
+ """ + context = gh.build_status_context(self._status_prefix, req.job_name) + result = None + + try: + # Post pending status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_PENDING, + context, + f"Running {req.job_name}...", + ) + + job_cfg = self._config["jobs"][req.job_name] + all_stages = job_cfg.get("stages", []) + repo_url = self._config.get("repo", {}).get("url", "") + commit_short = req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + results_dir = run.build_results_dir( + req.results_dir, req.platform, all_stages, commit_short + ) + + gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None + docker_args = run.build_docker_args( + self._config, + req.job_name, + repo_url, + req.branch, + all_stages, + "/workspace", + req.image_tag, + gpu_id_override=gpu_id_str, + results_dir=results_dir, + ) + + start = time.monotonic() + + if self._dry_run: + print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") + returncode = 0 + else: + results_dir.mkdir(parents=True, exist_ok=True) + proc = subprocess.run(docker_args) + returncode = proc.returncode + + duration = time.monotonic() - start + + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=returncode, + results_dir=results_dir, + duration=duration, + ) + + # Post final status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + result.state, + context, + f"{req.job_name}: {result.state} in {duration:.0f}s", + ) + except Exception as e: + print(f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr) + + if result is None: + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=-1, + results_dir=req.results_dir, + duration=0, + ) + + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_ERROR, + context, + 
f"{req.job_name}: internal error", + ) + finally: + # Always release resources and update state + self._resource_pool.release(gpu_ids) + + with self._lock: + self._jobs[req.job_id]["result"] = result + self._jobs[req.job_id]["state"] = result.state if result else STATE_FAILURE + + self._done_event.set() + self._try_schedule() + + return result + + +# --------------------------------------------------------------------------- +# Webhook server +# --------------------------------------------------------------------------- + + +def verify_signature(secret, body, signature_header): + """Verify GitHub webhook HMAC-SHA256 signature.""" + if not signature_header: + return False + + expected = "sha256=" + hmac.new( + secret.encode("utf-8"), body, hashlib.sha256 + ).hexdigest() + return hmac.compare_digest(expected, signature_header) + + +def _verify_api_token(handler): + """Check Bearer token for /api/run authentication. + + Returns True if authenticated, False (and sends 401) if not. + When no api_token is configured on the server, all requests are allowed. 
+ """ + api_token = getattr(handler.server, "api_token", None) + + if not api_token: + return True + + auth_header = handler.headers.get("Authorization", "") + + if auth_header == f"Bearer {api_token}": + return True + + handler._respond_json(401, {"error": "unauthorized"}) + return False + + +class WebhookHandler(BaseHTTPRequestHandler): + """HTTP handler for GitHub webhooks and API endpoints.""" + + def log_message(self, format, *args): + print(f"[agent] {args[0]}", file=sys.stderr) + + def do_GET(self): + if self.path == "/health": + self._respond_json(200, {"status": "ok", "platform": self.server.platform}) + elif self.path == "/status": + status = self.server.scheduler.get_status() + self._respond_json(200, status) + elif self.path.startswith("/api/job/"): + self._handle_api_job() + else: + self._respond_json(404, {"error": "not found"}) + + def do_POST(self): + content_length = int(self.headers.get("Content-Length", 0)) + + if content_length > MAX_CONTENT_LENGTH: + self._respond_json(413, {"error": "payload too large"}) + return + + body = self.rfile.read(content_length) + + if self.path == "/webhook": + self._handle_webhook(body) + elif self.path == "/api/run": + self._handle_api_run(body) + else: + self._respond_json(404, {"error": "not found"}) + + def _handle_webhook(self, body): + # Verify signature if secret is configured + if self.server.webhook_secret: + sig = self.headers.get("X-Hub-Signature-256", "") + + if not verify_signature(self.server.webhook_secret, body, sig): + self._respond_json(401, {"error": "invalid signature"}) + return + + event_type = self.headers.get("X-GitHub-Event", "") + + if event_type == "ping": + self._respond_json(200, {"msg": "pong"}) + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + if event_type == "push": + branch, sha = self._parse_push(payload) + elif event_type == "pull_request": + action = payload.get("action", "") + + if 
action not in ("opened", "synchronize"): + self._respond_json(200, {"msg": f"ignored PR action: {action}"}) + return + + branch, sha = self._parse_pull_request(payload) + else: + self._respond_json(200, {"msg": f"ignored event: {event_type}"}) + return + + if not branch or not sha: + self._respond_json(400, {"error": "could not extract branch/sha"}) + return + + job_ids = self._submit_jobs(branch, sha) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_run(self, body): + """Handle /api/run: remote job trigger (requires Bearer token auth).""" + if not _verify_api_token(self): + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + branch = payload.get("branch", "") + sha = payload.get("commit_sha", "") + job_name = payload.get("job") + image_tag = payload.get("image_tag") + + if not branch: + self._respond_json(400, {"error": "branch is required"}) + return + + if not sha: + sha = run.get_git_commit() + + job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_job(self): + """Handle GET /api/job/{id}.""" + parts = self.path.split("/") + + if len(parts) < 4: + self._respond_json(400, {"error": "missing job_id"}) + return + + job_id = parts[3] + info = self.server.scheduler.get_job(job_id) + + if info is None: + self._respond_json(404, {"error": f"job {job_id} not found"}) + else: + self._respond_json(200, info) + + def _parse_push(self, payload): + branch = payload.get("ref", "").removeprefix("refs/heads/") + sha = payload.get("after", "") + return branch, sha + + def _parse_pull_request(self, payload): + pr = payload.get("pull_request", {}) + head = pr.get("head", {}) + branch = head.get("ref", "") + sha = head.get("sha", "") + return branch, sha + + def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): + config = 
self.server.config + job_names = select_jobs(config, platform=self.server.platform, job_name=job_name) + job_ids = [] + + for name in job_names: + req = JobRequest( + job_name=name, + branch=branch, + commit_sha=sha, + config=config, + image_tag=image_tag, + results_dir=self.server.results_dir, + ) + jid = self.server.scheduler.submit(req) + job_ids.append(jid) + + return job_ids + + def _respond_json(self, status_code, data): + body = json.dumps(data, indent=2).encode("utf-8") + self.send_response(status_code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class AgentServer(HTTPServer): + """HTTP server with scheduler and config context.""" + + def __init__( + self, + host, + port, + config, + scheduler, + platform, + webhook_secret=None, + api_token=None, + results_dir=None, + ): + super().__init__((host, port), WebhookHandler) + self.config = config + self.scheduler = scheduler + self.platform = platform + self.webhook_secret = webhook_secret + self.api_token = api_token + self.results_dir = results_dir or Path("ci-results") + + +# --------------------------------------------------------------------------- +# Remote job dispatch (for CLI triggering remote agents) +# --------------------------------------------------------------------------- + + +def dispatch_remote_job(agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None): + """Send a job to a remote agent via HTTP API. 
Returns job_id or None.""" + url = f"{agent_url.rstrip('/')}/api/run" + body = { + "branch": branch, + "commit_sha": commit_sha, + "job": job_name, + } + + if image_tag: + body["image_tag"] = image_tag + + data = json.dumps(body).encode("utf-8") + headers = {"Content-Type": "application/json"} + + if api_token: + headers["Authorization"] = f"Bearer {api_token}" + + req = urllib_request(url, data=data, headers=headers, method="POST") + + try: + with urllib_urlopen(req, timeout=30) as resp: + result = json.loads(resp.read()) + job_ids = result.get("job_ids", []) + return job_ids[0] if job_ids else None + except Exception as e: + print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr) + return None + + +def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): + """Poll a remote agent for job completion. Returns final state dict or None.""" + url = f"{agent_url.rstrip('/')}/api/job/{job_id}" + deadline = time.monotonic() + timeout + + while time.monotonic() < deadline: + try: + req = urllib_request(url) + + with urllib_urlopen(req, timeout=10) as resp: + info = json.loads(resp.read()) + + state = info.get("state", "") + + if state in (STATE_SUCCESS, STATE_FAILURE): + return info + except Exception: + pass + + time.sleep(interval) + + return None + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def cmd_run(args): + """Handle 'run' subcommand: execute jobs locally and/or remotely.""" + config = run.load_config(args.config) + commit_sha = args.commit or run.get_git_commit(short=False) + + # Determine which jobs to run + try: + job_names = select_jobs(config, platform=args.platform, job_name=args.job) + except ValueError as e: + print(f"error: {e}", file=sys.stderr) + sys.exit(1) + + if not job_names: + print("error: no matching jobs found", file=sys.stderr) + sys.exit(1) + + # Detect local platform (if running serve on 
this machine, use that; otherwise guess) + local_platform = args.platform + local_jobs, remote_jobs = route_jobs(config, job_names, local_platform) + + # Run local jobs + local_results = [] + + if local_jobs: + pool = res.ResourcePool( + local_platform or "unknown", + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + local_platform or "unknown", + pool, + results_dir=args.results_dir, + no_status=args.no_status, + dry_run=args.dry_run, + ) + + for name in local_jobs: + req = JobRequest( + job_name=name, + branch=args.branch, + commit_sha=commit_sha, + config=config, + image_tag=args.image_tag, + results_dir=args.results_dir, + ) + scheduler.submit(req) + + local_results = scheduler.wait_all() + + # Dispatch remote jobs + remote_results = [] + api_token = os.environ.get("AGENT_API_TOKEN", "") + + if remote_jobs and not args.dry_run: + # Dispatch all remote jobs first, then poll concurrently + dispatched = [] # [(name, agent_url, job_id)] + + for name, agent_url in remote_jobs: + if not agent_url: + print(f"warning: no agent URL for {name}, skipping", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + continue + + print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) + job_id = dispatch_remote_job( + agent_url, name, args.branch, commit_sha, args.image_tag, + api_token=api_token or None, + ) + + if job_id: + print(f" job_id: {job_id}", file=sys.stderr) + dispatched.append((name, agent_url, job_id)) + else: + print(f" failed to dispatch {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + + # Poll all dispatched jobs concurrently + if dispatched: + with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: + futures = { + executor.submit(poll_remote_job, url, jid): (name, url, jid) + for name, url, jid in dispatched + } + + for future in futures: + name, _, _ = futures[future] + result = future.result() + + if result: + 
remote_results.append(result) + else: + print(f" timeout waiting for {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "timeout"}) + + elif remote_jobs and args.dry_run: + for name, agent_url in remote_jobs: + print(f"[dry-run] dispatch {name} to {agent_url}") + + # Summary + print("\n========== Results ==========") + all_ok = True + + for r in local_results: + status = "PASS" if r.returncode == 0 else "FAIL" + + if r.returncode != 0: + all_ok = False + + print(f" {status} {r.job_name} ({r.duration:.0f}s) {r.results_dir}") + + for r in remote_results: + state = r.get("state", "unknown") + name = r.get("job_name", "?") + status = "PASS" if state == STATE_SUCCESS else "FAIL" + + if state != STATE_SUCCESS: + all_ok = False + + duration = r.get("duration_seconds", 0) + print(f" {status} {name} ({duration:.0f}s) [remote]") + + if not all_ok: + sys.exit(1) + + +def cmd_serve(args): + """Handle 'serve' subcommand: start webhook server.""" + config = run.load_config(args.config) + + pool = res.ResourcePool( + args.platform, + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + args.platform, + pool, + results_dir=args.results_dir, + ) + + webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "") + api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "") + + if not webhook_secret: + print( + "WARNING: No webhook secret configured. Webhook endpoint accepts " + "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.", + file=sys.stderr, + ) + + if not api_token: + print( + "WARNING: No API token configured. /api/run endpoint is unauthenticated. 
" + "Set --api-token or AGENT_API_TOKEN for production.", + file=sys.stderr, + ) + + server = AgentServer( + args.host, + args.port, + config, + scheduler, + args.platform, + webhook_secret=webhook_secret or None, + api_token=api_token or None, + results_dir=args.results_dir, + ) + + print( + f"Agent serving on {args.host}:{args.port} (platform={args.platform})", + file=sys.stderr, + ) + print(f" POST /webhook — GitHub webhook", file=sys.stderr) + print(f" POST /api/run — remote job trigger", file=sys.stderr) + print(f" GET /health — health check", file=sys.stderr) + print(f" GET /status — queue & resource status", file=sys.stderr) + print(f" GET /api/job/{{id}} — job status", file=sys.stderr) + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down...", file=sys.stderr) + server.shutdown() + + +def main(): + parser = argparse.ArgumentParser( + description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", + ) + subparsers = parser.add_subparsers(dest="command") + + # --- run subcommand --- + run_parser = subparsers.add_parser("run", help="Run CI jobs") + run_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + run_parser.add_argument("--branch", type=str, required=True, help="Branch to test") + run_parser.add_argument("--job", type=str, help="Specific job name") + run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") + run_parser.add_argument("--image-tag", type=str, help="Override image tag") + run_parser.add_argument("--commit", type=str, help="Override commit SHA") + run_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + run_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + help="GPU utilization threshold (%%) to consider free (default: 10)", + ) + run_parser.add_argument("--no-status", action="store_true", help="Skip GitHub status") + 
run_parser.add_argument("--dry-run", action="store_true") + + # --- serve subcommand --- + serve_parser = subparsers.add_parser("serve", help="Start webhook server") + serve_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + serve_parser.add_argument( + "--platform", + type=str, + required=True, + help="Platform this agent handles (nvidia, iluvatar, etc.)", + ) + serve_parser.add_argument("--port", type=int, default=8080) + serve_parser.add_argument("--host", type=str, default="0.0.0.0") + serve_parser.add_argument("--webhook-secret", type=str) + serve_parser.add_argument( + "--api-token", + type=str, + help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", + ) + serve_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + serve_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + ) + + args = parser.parse_args() + + if args.command == "run": + cmd_run(args) + elif args.command == "serve": + cmd_serve(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/build.py b/.ci/build.py index 2339319..7953209 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -9,32 +9,7 @@ import sys from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) - sys.exit(1) - - return result.stdout.strip() +from utils import get_git_commit, load_config def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py new file mode 100644 index 0000000..f3dbfb1 --- /dev/null +++ b/.ci/ci_resource.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +"""Resource detection and allocation for CI Runner Agent.""" + +import os +import subprocess +import threading +from dataclasses import dataclass, field + +# GPU passthrough styles +GPU_STYLE_NVIDIA = "nvidia" +GPU_STYLE_NONE = "none" + + +@dataclass +class GpuInfo: + index: int + memory_used_mb: float + memory_total_mb: float + utilization_pct: float + + +@dataclass +class SystemResources: + total_memory_mb: float + available_memory_mb: float + cpu_count: int + + +class ResourcePool: + """Thread-safe GPU and system resource manager. + + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi) + and tracks allocations to enable dynamic parallel scheduling. 
+ """ + + GPU_QUERY_TOOLS = { + "nvidia": "nvidia-smi", + "iluvatar": "ixsmi", + } + + def __init__(self, platform, utilization_threshold=10): + self._platform = platform + self._utilization_threshold = utilization_threshold + self._allocated: set[int] = set() + self._lock = threading.Lock() + + @property + def platform(self): + return self._platform + + @property + def allocated(self): + with self._lock: + return set(self._allocated) + + def detect_gpus(self) -> list[GpuInfo]: + """Query GPU status via platform-specific CLI tool.""" + tool = self.GPU_QUERY_TOOLS.get(self._platform) + + if not tool: + return [] + + try: + result = subprocess.run( + [ + tool, + "--query-gpu=index,memory.used,memory.total,utilization.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + + if len(parts) < 4: + continue + + try: + gpus.append( + GpuInfo( + index=int(parts[0]), + memory_used_mb=float(parts[1]), + memory_total_mb=float(parts[2]), + utilization_pct=float(parts[3]), + ) + ) + except (ValueError, IndexError): + continue + + return gpus + + def detect_system_resources(self) -> SystemResources: + """Read system memory from /proc/meminfo and CPU count.""" + total_mb = 0.0 + available_mb = 0.0 + + try: + with open("/proc/meminfo", encoding="utf-8") as f: + for line in f: + if line.startswith("MemTotal:"): + total_mb = float(line.split()[1]) / 1024 + elif line.startswith("MemAvailable:"): + available_mb = float(line.split()[1]) / 1024 + except OSError: + pass + + return SystemResources( + total_memory_mb=total_mb, + available_memory_mb=available_mb, + cpu_count=os.cpu_count() or 1, + ) + + def get_free_gpus(self) -> list[int]: + """Return GPU indices with utilization below threshold.""" + gpus = 
self.detect_gpus() + return [ + g.index + for g in gpus + if g.utilization_pct < self._utilization_threshold + ] + + def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: + """Try to allocate GPUs and check memory. + + Returns (allocated_gpu_ids, success). On failure returns ([], False). + GPU detection and memory checks run outside the lock to avoid blocking + other threads while subprocess.run (nvidia-smi) executes. + """ + if gpu_count <= 0: + if memory_mb > 0: + sys_res = self.detect_system_resources() + + if sys_res.available_memory_mb < memory_mb: + return ([], False) + + return ([], True) + + # Detect GPUs and memory outside the lock (subprocess.run can block) + free_gpus = set(self.get_free_gpus()) + sys_res = self.detect_system_resources() if memory_mb > 0 else None + + with self._lock: + available = free_gpus - self._allocated + + if len(available) < gpu_count: + return ([], False) + + if sys_res is not None and sys_res.available_memory_mb < memory_mb: + return ([], False) + + selected = sorted(available)[:gpu_count] + self._allocated.update(selected) + return (selected, True) + + def release(self, gpu_ids): + """Return GPUs to the free pool.""" + with self._lock: + self._allocated -= set(gpu_ids) + + def get_status(self) -> dict: + """Return current resource status for API endpoints.""" + gpus = self.detect_gpus() + sys_res = self.detect_system_resources() + + with self._lock: + allocated = sorted(self._allocated) + + return { + "platform": self._platform, + "gpus": [ + { + "index": g.index, + "memory_used_mb": g.memory_used_mb, + "memory_total_mb": g.memory_total_mb, + "utilization_pct": g.utilization_pct, + "allocated_by_agent": g.index in allocated, + } + for g in gpus + ], + "allocated_gpu_ids": allocated, + "system": { + "total_memory_mb": round(sys_res.total_memory_mb, 1), + "available_memory_mb": round(sys_res.available_memory_mb, 1), + "cpu_count": sys_res.cpu_count, + }, + "utilization_threshold": self._utilization_threshold, + } 
+ + +def parse_gpu_requirement(job_config) -> int: + """Extract GPU count requirement from a job config.""" + resources = job_config.get("resources", {}) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NONE: + return 0 + + gpu_ids = str(resources.get("gpu_ids", "")) + + if not gpu_ids: + return resources.get("gpu_count", 0) + + if gpu_ids == "all": + return 0 # "all" means use all available, don't reserve specific count + + return len(gpu_ids.split(",")) + + +def parse_memory_requirement(job_config) -> float: + """Extract memory requirement in MB from a job config.""" + resources = job_config.get("resources", {}) + memory = str(resources.get("memory", "")) + + if not memory: + return 0 + + memory = memory.lower().strip() + + if memory.endswith("gb"): + return float(memory[:-2]) * 1024 + elif memory.endswith("g"): + return float(memory[:-1]) * 1024 + elif memory.endswith("mb"): + return float(memory[:-2]) + elif memory.endswith("m"): + return float(memory[:-1]) + + try: + return float(memory) * 1024 # Default: GB + except ValueError: + return 0 diff --git a/.ci/config.yaml b/.ci/config.yaml index a86174a..e62bc07 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,32 +2,69 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: - nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: # TODO: Ascend image is not ready yet - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source_env: PRIVATE_SDK_URL +github: + status_context_prefix: "ci/infiniops" # GitHub Commit Status context 前缀 + +# agents: # 远程 Agent 地址(CLI 跨机器触发用) +# nvidia: +# url: http://nvidia-host:8080 +# iluvatar: +# url: http://iluvatar-host:8080 -jobs: - nvidia_gpu: - image: latest - platform: nvidia - resources: - gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - memory: 32GB - shm_size: 16g # 
避免 PyTorch 默认 64MB SHMEM 不足 - timeout: 3600 +platforms: + nvidia: + image: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + timeout: 3600 + # env: # 可选,注入容器环境变量 + # MY_VAR: value + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - setup: pip install .[dev] - # env: # 可选,注入容器环境变量 - # MY_VAR: value + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 通过 CUDA_VISIBLE_DEVICES 控制可见 GPU + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载透传 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet + image: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source_env: PRIVATE_SDK_URL diff --git a/.ci/github_status.py b/.ci/github_status.py new file mode 100644 index 0000000..a7abb8f --- /dev/null +++ b/.ci/github_status.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""GitHub Commit Status API wrapper using urllib (zero external dependencies).""" + +import json +import os +import re +import sys +import urllib.error +import 
def parse_repo_url(url):
    """Return (owner, repo) parsed from a GitHub remote URL.

    Supports the HTTPS form (https://github.com/Owner/Repo[.git]) and the
    SSH form (git@github.com:Owner/Repo[.git]); any other input yields
    ("", "").
    """
    patterns = (
        r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$",  # HTTPS remotes
        r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$",  # SSH remotes
    )

    for pattern in patterns:
        match = re.match(pattern, url)

        if match:
            return match.group(1), match.group(2)

    return "", ""


def build_status_context(prefix, job_name):
    """Compose a status context string, e.g. 'ci/infiniops/nvidia_gpu'."""
    return f"{prefix}/{job_name}"


def post_commit_status(
    owner,
    repo,
    sha,
    state,
    context,
    description,
    target_url=None,
    token=None,
):
    """Send a single commit status to the GitHub REST API.

    Args:
        owner: Repository owner (user or organization).
        repo: Repository name.
        sha: Commit SHA the status attaches to.
        state: One of 'pending', 'success', 'failure', 'error'.
        context: Status context label shown in the GitHub UI.
        description: Human-readable summary (truncated to 140 chars,
            the API limit).
        target_url: Optional link attached to the status.
        token: GitHub token; falls back to the GITHUB_TOKEN env var.

    Returns:
        True when GitHub accepted the status, False otherwise (missing
        token/coordinates, or any HTTP/network failure — each case is
        reported as a warning on stderr rather than raised).
    """
    auth_token = token or os.environ.get("GITHUB_TOKEN", "")

    if not auth_token:
        print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr)
        return False

    if not (owner and repo and sha):
        print("warning: missing owner/repo/sha, skipping status update", file=sys.stderr)
        return False

    payload = {
        "state": state,
        "context": context,
        "description": description[:140],
    }

    if target_url:
        payload["target_url"] = target_url

    request = urllib.request.Request(
        f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"token {auth_token}",
            "Accept": "application/vnd.github.v3+json",
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as e:
        print(
            f"warning: GitHub status API returned {e.code}: {e.reason}",
            file=sys.stderr,
        )
        return False
    except urllib.error.URLError as e:
        print(f"warning: GitHub status API error: {e.reason}", file=sys.stderr)
        return False
100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -10,7 +10,11 @@ ARG http_proxy ARG https_proxy ARG no_proxy -RUN apt-get update && \ +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ @@ -19,13 +23,24 @@ RUN apt-get update && \ libclang-dev \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir \ + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ scikit-build-core \ pybind11 \ libclang \ pytest \ pytest-cov \ pytest-xdist \ - pyyaml + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed torch to prevent pip from replacing it with a different version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 0c8d648..2575781 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,31 +9,8 @@ from datetime import datetime from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - return "unknown" - - return result.stdout.strip() +from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE +from utils import get_git_commit, load_config def build_results_dir(base, platform, stages, commit): @@ -155,16 +132,29 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_CMD={s['run']}") + # Platform-specific device access + for flag in job.get("docker_args", []): + args.append(flag) + + for vol in job.get("volumes", []): + args.extend(["-v", vol]) + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) - - if gpu_id: - if gpu_id == "all": - args.extend(["--gpus", "all"]) - else: - args.extend(["--gpus", f'"device={gpu_id}"']) - elif gpu_count and gpu_count > 0: - args.extend(["--gpus", f"count={gpu_count}"]) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NVIDIA: + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + elif gpu_style == GPU_STYLE_NONE and gpu_id and gpu_id != "all": + # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, + # control visible GPUs via CUDA_VISIBLE_DEVICES. 
+ args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) memory = resources.get("memory") diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py index 98079cd..38ed716 100644 --- a/.ci/tests/conftest.py +++ b/.ci/tests/conftest.py @@ -6,37 +6,41 @@ import pytest +from utils import normalize_config + @pytest.fixture def minimal_config(): - return { + """Minimal platform-centric config, normalized to flat format.""" + raw = { "repo": { "url": "https://github.com/InfiniTensor/InfiniOps.git", "branch": "master", }, - "images": { + "platforms": { "nvidia": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - }, - "jobs": { - "nvidia_gpu": { - "image": "latest", - "platform": "nvidia", - "resources": { - "gpu_ids": "0", - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, }, "setup": "pip install .[dev]", - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], } - ], + }, } }, } + return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py new file mode 100644 index 0000000..5741385 --- /dev/null +++ b/.ci/tests/test_agent.py @@ -0,0 +1,503 @@ +import hashlib +import hmac +import json +import threading +import time +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +import agent +import ci_resource as res +from utils import normalize_config + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def agent_config(): + raw = { + "repo": { + "url": 
"https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "github": { + "status_context_prefix": "ci/infiniops", + }, + "agents": { + "nvidia": {"url": "http://nvidia-host:8080"}, + "iluvatar": {"url": "http://iluvatar-host:8080"}, + }, + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + "iluvatar": { + "image": { + "dockerfile": ".ci/images/iluvatar/", + "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "gpu_style": "none", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + }, + } + return normalize_config(raw) + + +@pytest.fixture +def mock_resource_pool(): + pool = MagicMock(spec=res.ResourcePool) + pool.platform = "nvidia" + pool.allocate.return_value = ([0], True) + pool.release.return_value = None + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + return pool + + +# --------------------------------------------------------------------------- +# select_jobs +# --------------------------------------------------------------------------- + + +def test_select_jobs_by_name(agent_config): + jobs = agent.select_jobs(agent_config, job_name="nvidia_gpu") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform(agent_config): + jobs = agent.select_jobs(agent_config, platform="nvidia") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform_iluvatar(agent_config): + jobs = agent.select_jobs(agent_config, platform="iluvatar") + assert jobs == 
["iluvatar_gpu"] + + +def test_select_jobs_all(agent_config): + jobs = agent.select_jobs(agent_config) + assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} + + +def test_select_jobs_invalid_name(agent_config): + with pytest.raises(ValueError, match="not_exist"): + agent.select_jobs(agent_config, job_name="not_exist") + + +# --------------------------------------------------------------------------- +# route_jobs +# --------------------------------------------------------------------------- + + +def test_route_jobs_local(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu"], local_platform="nvidia") + assert local == ["nvidia_gpu"] + assert remote == [] + + +def test_route_jobs_remote(agent_config): + local, remote = agent.route_jobs(agent_config, ["iluvatar_gpu"], local_platform="nvidia") + assert local == [] + assert len(remote) == 1 + assert remote[0][0] == "iluvatar_gpu" + assert remote[0][1] == "http://iluvatar-host:8080" + + +def test_route_jobs_mixed(agent_config): + local, remote = agent.route_jobs( + agent_config, ["nvidia_gpu", "iluvatar_gpu"], local_platform="nvidia" + ) + assert local == ["nvidia_gpu"] + assert len(remote) == 1 + + +def test_route_jobs_no_platform(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu", "iluvatar_gpu"]) + assert len(local) == 2 + assert remote == [] + + +# --------------------------------------------------------------------------- +# verify_signature +# --------------------------------------------------------------------------- + + +def test_verify_signature_valid(): + secret = "my-secret" + body = b'{"action": "push"}' + sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + assert agent.verify_signature(secret, body, sig) is True + + +def test_verify_signature_invalid(): + assert agent.verify_signature("secret", b"body", "sha256=wrong") is False + + +def test_verify_signature_empty(): + assert agent.verify_signature("secret", b"body", "") is False 
+ + +# --------------------------------------------------------------------------- +# JobRequest / JobResult +# --------------------------------------------------------------------------- + + +def test_job_request_fields(agent_config): + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + assert req.job_name == "nvidia_gpu" + assert req.platform == "nvidia" + assert req.commit_sha == "abc123" + assert len(req.job_id) == 8 + d = req.to_dict() + assert d["job_name"] == "nvidia_gpu" + + +def test_job_result_success(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) + assert r.state == "success" + + +def test_job_result_failure(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) + assert r.state == "failure" + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + results_dir=Path("/tmp/test-results"), + no_status=True, dry_run=True, + ) + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config, + results_dir=Path("/tmp/test-results")) + jid = scheduler.submit(req) + results = scheduler.wait_all() + assert len(results) == 1 + assert results[0].state == "success" + + +def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): + pool = MagicMock(spec=res.ResourcePool) + pool.allocate.return_value = ([], False) + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + + scheduler = agent.Scheduler( + agent_config, "nvidia", pool, + no_status=True, dry_run=False, + ) + + req = 
agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + scheduler.submit(req) + + info = scheduler.get_job(req.job_id) + assert info["state"] == "queued" + + +def test_scheduler_get_status(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + + status = scheduler.get_status() + assert "queued" in status + assert "running" in status + assert "completed" in status + assert "resources" in status + + +# --------------------------------------------------------------------------- +# WebhookHandler — push event parsing +# --------------------------------------------------------------------------- + + +def test_webhook_parse_push(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} + branch, sha = handler._parse_push(payload) + assert branch == "feat/test" + assert sha == "abc123def456" + + +def test_webhook_parse_pr(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = { + "pull_request": { + "head": { + "ref": "feat/pr-branch", + "sha": "def789", + } + } + } + branch, sha = handler._parse_pull_request(payload) + assert branch == "feat/pr-branch" + assert sha == "def789" + + +# --------------------------------------------------------------------------- +# Integration-style: webhook HTTP test +# --------------------------------------------------------------------------- + + +def _urlopen_no_proxy(url_or_req, **kwargs): + """urlopen that bypasses any HTTP_PROXY.""" + import urllib.request + + opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) + return opener.open(url_or_req, **kwargs) + + +def test_health_endpoint(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + ) + port = 
server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + try: + resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) + data = json.loads(resp.read()) + assert data["status"] == "ok" + assert data["platform"] == "nvidia" + finally: + server.server_close() + + +def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + assert len(data["job_ids"]) >= 1 + finally: + server.server_close() + + +def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + secret = "test-secret" + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret=secret, + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + payload = json.dumps({ + "ref": "refs/heads/master", + "after": "abc123def456", + }).encode() + sig = "sha256=" + 
hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() + + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": sig, + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() + + +def test_webhook_invalid_signature(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret="real-secret", + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + payload = b'{"ref": "refs/heads/master", "after": "abc"}' + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": "sha256=invalid", + }, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +# --------------------------------------------------------------------------- +# API token authentication +# --------------------------------------------------------------------------- + + +def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run rejects requests without valid token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + 
results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run accepts requests with correct Bearer token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer my-secret-token", + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py new file mode 100644 index 0000000..0efa36e --- /dev/null +++ b/.ci/tests/test_github_status.py @@ -0,0 +1,144 @@ +import json +from unittest.mock import MagicMock, patch + +import pytest + +import github_status as gh + + +# 
--------------------------------------------------------------------------- +# parse_repo_url +# --------------------------------------------------------------------------- + + +def test_parse_repo_url_https(): + owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") + assert owner == "InfiniTensor" + assert repo == "InfiniOps" + + +def test_parse_repo_url_https_no_git(): + owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_ssh(): + owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_invalid(): + owner, repo = gh.parse_repo_url("not-a-url") + assert owner == "" + assert repo == "" + + +# --------------------------------------------------------------------------- +# build_status_context +# --------------------------------------------------------------------------- + + +def test_build_status_context(): + ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") + assert ctx == "ci/infiniops/nvidia_gpu" + + +# --------------------------------------------------------------------------- +# post_commit_status +# --------------------------------------------------------------------------- + + +def test_post_status_no_token(monkeypatch): + monkeypatch.delenv("GITHUB_TOKEN", raising=False) + result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") + assert result is False + + +def test_post_status_missing_owner(): + result = gh.post_commit_status("", "repo", "abc123", "success", "ctx", "desc", token="tok") + assert result is False + + +def test_post_status_success(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured_req = {} + + def mock_urlopen(req, **kwargs): + captured_req["url"] = 
req.full_url + captured_req["data"] = json.loads(req.data) + captured_req["headers"] = dict(req.headers) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "InfiniTensor", + "InfiniOps", + "abc123def", + "success", + "ci/infiniops/nvidia_gpu", + "Tests passed", + token="ghp_test_token", + ) + + assert result is True + assert "abc123def" in captured_req["url"] + assert captured_req["data"]["state"] == "success" + assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" + assert "ghp_test_token" in captured_req["headers"]["Authorization"] + + +def test_post_status_http_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.HTTPError( + url="", code=422, msg="Unprocessable", hdrs=None, fp=None + ) + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_url_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.URLError("connection refused") + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_truncates_description(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured = {} + + def mock_urlopen(req, **kwargs): + captured["data"] = json.loads(req.data) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + long_desc = "x" * 200 + gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") + + assert len(captured["data"]["description"]) == 140 diff --git 
a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py new file mode 100644 index 0000000..b75043c --- /dev/null +++ b/.ci/tests/test_resource.py @@ -0,0 +1,324 @@ +import threading + +import pytest + +import ci_resource as res + + +# --------------------------------------------------------------------------- +# GpuInfo / SystemResources +# --------------------------------------------------------------------------- + + +def test_gpu_info_fields(): + g = res.GpuInfo(index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50) + assert g.index == 0 + assert g.memory_total_mb == 8000 + + +def test_system_resources_fields(): + s = res.SystemResources(total_memory_mb=32000, available_memory_mb=16000, cpu_count=8) + assert s.cpu_count == 8 + + +# --------------------------------------------------------------------------- +# detect_gpus +# --------------------------------------------------------------------------- + + +def test_detect_gpus_nvidia_parses_csv(monkeypatch): + csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + gpus = pool.detect_gpus() + assert len(gpus) == 2 + assert gpus[0].index == 0 + assert gpus[0].memory_used_mb == 512 + assert gpus[0].utilization_pct == 5 + assert gpus[1].index == 1 + assert gpus[1].utilization_pct == 80 + + +def test_detect_gpus_empty_on_failure(monkeypatch): + def mock_run(cmd, **kwargs): + class R: + returncode = 1 + stdout = "" + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_unknown_platform(): + pool = res.ResourcePool("unknown_platform") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_file_not_found(monkeypatch): + def mock_run(cmd, **kwargs): + raise FileNotFoundError("nvidia-smi not found") + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +# --------------------------------------------------------------------------- +# detect_system_resources +# --------------------------------------------------------------------------- + + +def test_detect_system_resources(monkeypatch, tmp_path): + meminfo = tmp_path / "meminfo" + meminfo.write_text( + "MemTotal: 32000000 kB\n" + "MemFree: 10000000 kB\n" + "MemAvailable: 20000000 kB\n" + ) + + import io + _real_open = open + + def fake_open(path, **kw): + if str(path) == "/proc/meminfo": + return _real_open(str(meminfo), **kw) + return _real_open(path, **kw) + + monkeypatch.setattr("builtins.open", fake_open) + + pool = res.ResourcePool("nvidia") + sys_res = pool.detect_system_resources() + assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 + assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 + assert sys_res.cpu_count > 0 + + +# --------------------------------------------------------------------------- +# get_free_gpus +# --------------------------------------------------------------------------- + + +def test_get_free_gpus_filters_by_utilization(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + free = pool.get_free_gpus() + assert 0 in free + assert 2 in free + assert 1 not in free + + +# --------------------------------------------------------------------------- +# allocate / release +# --------------------------------------------------------------------------- + + +def test_allocate_success(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(1) + assert ok is True + assert len(gpu_ids) == 1 + assert gpu_ids[0] in (0, 1) + + +def test_allocate_insufficient_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(3) + assert ok is False + assert gpu_ids == [] + + +def test_allocate_zero_gpus(): + pool = res.ResourcePool("unknown") + gpu_ids, ok = pool.allocate(0) + assert ok is True + assert gpu_ids == [] + + +def test_release_frees_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(2) + assert ok is True + assert len(gpu_ids) == 2 + + # All GPUs allocated, next allocation should fail + _, ok2 = pool.allocate(1) + assert ok2 is False + + # Release one + pool.release([gpu_ids[0]]) + gpu_ids2, ok3 = pool.allocate(1) + assert ok3 is True + assert gpu_ids2 == [gpu_ids[0]] + + +def test_allocate_excludes_allocated(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids1, _ = pool.allocate(1) + gpu_ids2, _ = pool.allocate(1) + + assert gpu_ids1 != gpu_ids2 + assert set(gpu_ids1 + gpu_ids2) == {0, 1} + + +def test_thread_safety(monkeypatch): + csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" + + def mock_run(cmd, 
**kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=50) + allocated_all = [] + lock = threading.Lock() + + def allocate_one(): + ids, ok = pool.allocate(1) + + if ok: + with lock: + allocated_all.extend(ids) + + threads = [threading.Thread(target=allocate_one) for _ in range(4)] + + for t in threads: + t.start() + + for t in threads: + t.join() + + assert len(allocated_all) == 4 + assert len(set(allocated_all)) == 4 + + +# --------------------------------------------------------------------------- +# get_status +# --------------------------------------------------------------------------- + + +def test_get_status(monkeypatch): + csv_output = "0, 512, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + status = pool.get_status() + assert status["platform"] == "nvidia" + assert len(status["gpus"]) == 1 + assert "system" in status + + +# --------------------------------------------------------------------------- +# parse_gpu_requirement / parse_memory_requirement +# --------------------------------------------------------------------------- + + +def test_parse_gpu_requirement_nvidia(): + job = {"resources": {"gpu_ids": "0,1", "gpu_style": "nvidia"}} + assert res.parse_gpu_requirement(job) == 2 + + +def test_parse_gpu_requirement_none(): + job = {"resources": {"gpu_style": "none"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_all(): + job = {"resources": {"gpu_ids": "all"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_default(): + job = {"resources": {"gpu_ids": "0"}} + assert res.parse_gpu_requirement(job) == 1 + + +def test_parse_memory_requirement_gb(): + assert res.parse_memory_requirement({"resources": {"memory": 
"32GB"}}) == 32 * 1024 + + +def test_parse_memory_requirement_mb(): + assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 + + +def test_parse_memory_requirement_empty(): + assert res.parse_memory_requirement({"resources": {}}) == 0 diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py new file mode 100644 index 0000000..2a930d3 --- /dev/null +++ b/.ci/tests/test_utils.py @@ -0,0 +1,90 @@ +from utils import normalize_config + + +def test_normalize_creates_flat_jobs(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "platforms": { + "nvidia": { + "image": {"dockerfile": ".ci/images/nvidia/"}, + "setup": "pip install .", + "docker_args": ["--gpus", "all"], + "jobs": { + "gpu": { + "resources": {"gpu_ids": "0"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + "multi_gpu": { + "resources": {"gpu_ids": "0,1"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + }, + }, + }, + } + config = normalize_config(raw) + + assert "nvidia_gpu" in config["jobs"] + assert "nvidia_multi_gpu" in config["jobs"] + assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" + assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." 
+ assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] + assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" + assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" + + +def test_normalize_extracts_images(): + raw = { + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "pytorch:latest"}, + }, + "jobs": {}, + }, + }, + } + config = normalize_config(raw) + assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" + assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" + + +def test_normalize_job_overrides_platform_defaults(): + raw = { + "platforms": { + "nvidia": { + "setup": "default setup", + "jobs": { + "special": { + "setup": "custom setup", + "stages": [], + }, + }, + }, + }, + } + config = normalize_config(raw) + assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" + + +def test_normalize_preserves_top_level_keys(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "github": {"status_context_prefix": "ci/test"}, + "agents": {"nvidia": {"url": "http://host:8080"}}, + "platforms": {}, + } + config = normalize_config(raw) + assert config["repo"]["url"] == "https://github.com/org/repo.git" + assert config["github"]["status_context_prefix"] == "ci/test" + assert config["agents"]["nvidia"]["url"] == "http://host:8080" + + +def test_normalize_passthrough_flat_config(): + """Old flat format without 'platforms' key is returned as-is.""" + flat = { + "images": {"nvidia": {}}, + "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, + } + assert normalize_config(flat) is flat diff --git a/.ci/utils.py b/.ci/utils.py new file mode 100644 index 0000000..7932ba6 --- /dev/null +++ b/.ci/utils.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Shared utilities for the CI toolchain.""" + +import subprocess +import sys + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def normalize_config(raw): + """Convert platform-centric config to flat images/jobs format. + + Input (new format): + platforms: + nvidia: + image: {dockerfile: ..., build_args: ...} + setup: pip install .[dev] + jobs: + gpu: {resources: ..., stages: ...} + + Output (flat format consumed by run.py / build.py / agent.py): + images: + nvidia: {dockerfile: ..., build_args: ...} + jobs: + nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} + + If the config already uses the flat format (no 'platforms' key), returns as-is. + """ + if "platforms" not in raw: + return raw + + config = {} + + for key in ("repo", "github", "agents"): + if key in raw: + config[key] = raw[key] + + config["images"] = {} + config["jobs"] = {} + + for platform, pcfg in raw.get("platforms", {}).items(): + # Image config + if "image" in pcfg: + config["images"][platform] = pcfg["image"] + + # Platform-level defaults inherited by jobs + defaults = {} + + for key in ("image_tag", "docker_args", "volumes", "setup", "env"): + if key in pcfg: + defaults[key] = pcfg[key] + + # Flatten jobs: {platform}_{job_name} + for job_name, job_cfg in pcfg.get("jobs", {}).items(): + full_name = f"{platform}_{job_name}" + flat = { + "platform": platform, + "image": defaults.get("image_tag", "latest"), + } + + # Apply platform defaults + for key in ("docker_args", "volumes", "setup", "env"): + if key in defaults: + flat[key] = defaults[key] + + # Job-level overrides + flat.update(job_cfg) + + config["jobs"][full_name] = flat + + return config + + +def load_config(path): + """Load a YAML config file and normalize to flat format.""" + with open(path, encoding="utf-8") as f: + raw = yaml.safe_load(f) + + return normalize_config(raw) + + +def get_git_commit(ref="HEAD", short=True): + """Get git commit SHA. 
Returns 'unknown' on failure.""" + cmd = ["git", "rev-parse"] + + if short: + cmd.append("--short") + + cmd.append(ref) + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() From 8da3bc0643e26d82acc15c35c52c4cc525ab97fe Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 06:03:23 +0000 Subject: [PATCH 5/5] docs: add multi-machine deployment guide for NVIDIA and Iluvatar platform --- .ci/README.md | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/.ci/README.md b/.ci/README.md index 33841ca..4e826e8 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -241,3 +241,154 @@ Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: - `success` / `failure` — job 执行完成 Status context 格式:`ci/infiniops/{job_name}` + +--- + +## 多机部署指南 + +以 NVIDIA + Iluvatar 双平台为例,说明如何在两台机器上部署 Agent 并实现跨平台并行测试。 + +### 前置条件(两台机器共同) + +```bash +# 1. Python 3.10+ 和依赖 +pip install pyyaml + +# 2. Docker 已安装 +docker --version + +# 3. 克隆仓库 +git clone https://github.com/InfiniTensor/InfiniOps.git +cd InfiniOps +``` + +### NVIDIA 机器配置 + +```bash +# 1. 安装 NVIDIA Container Toolkit +# 参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html + +# 2. 验证 GPU 可见 +nvidia-smi + +# 3. 构建 CI 镜像 +python .ci/build.py --platform nvidia +``` + +### Iluvatar 机器配置 + +```bash +# 1. 确认 CoreX 运行时已安装 +ixsmi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep corex # 应有 corex:qs_pj20250825 + +# 3. 
构建 CI 镜像 +python .ci/build.py --platform iluvatar +``` + +### 启动 Agent 服务 + +在各自机器上启动 Agent: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +验证连通性: + +```bash +curl http://<NVIDIA机器IP>:8080/health +curl http://<Iluvatar机器IP>:8080/health +``` + +### 配置远程 Agent 地址 + +在触发端的 `config.yaml` 中添加 `agents` 段: + +```yaml +agents: + nvidia: + url: http://<NVIDIA机器IP>:8080 + iluvatar: + url: http://<Iluvatar机器IP>:8080 +``` + +### 触发跨平台测试 + +```bash +# 一键运行所有平台的 job +python .ci/agent.py run --branch master + +# 预览模式(不实际执行) +python .ci/agent.py run --branch master --dry-run --no-status + +# 只运行指定平台 +python .ci/agent.py run --branch master --platform nvidia +``` + +### 可选配置 + +#### GitHub Status 上报 + +两台机器均设置环境变量,各自上报所属平台的测试状态: + +```bash +export GITHUB_TOKEN=ghp_xxxxxxxxxxxx +``` + +#### API Token 认证 + +Agent 暴露在非可信网络时,建议启用 Token 认证: + +```bash +# 启动 Agent 时指定 token +python .ci/agent.py serve --platform nvidia --port 8080 --api-token <token> + +# 或通过环境变量 +export API_TOKEN=<token> +``` + +#### GitHub Webhook 自动触发 + +在 GitHub repo → Settings → Webhooks 中为每台机器添加 Webhook: + +| 字段 | 值 | +|---|---| +| Payload URL | `http://<机器IP>:8080/webhook` | +| Content type | `application/json` | +| Secret | 与 `--webhook-secret` 一致 | +| Events | `push` 和 `pull_request` | + +启动时配置 secret: + +```bash +python .ci/agent.py serve --platform nvidia --port 8080 --webhook-secret <secret> + +# 或通过环境变量 +export WEBHOOK_SECRET=<secret> +``` + +### 验证清单 + +```bash +# 1. 各机器单独 dry-run +python .ci/agent.py run --branch master --platform nvidia --dry-run --no-status +python .ci/agent.py run --branch master --platform iluvatar --dry-run --no-status + +# 2. 健康检查 +curl http://<NVIDIA机器IP>:8080/health +curl http://<Iluvatar机器IP>:8080/health + +# 3. 查看资源状态 +curl http://<NVIDIA机器IP>:8080/status +curl http://<Iluvatar机器IP>:8080/status + +# 4. 跨平台一键测试 +python .ci/agent.py run --branch master +```