diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e84fc0da5..6890126cf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -503,6 +503,38 @@ dsr1-fp8-mi355x-atom-mtp: search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } +# Eval-only: fp8 disagg WITHOUT DPA — isolates DPA as variable +dsr1-fp8-mi355x-sglang-disagg-nodpa-eval: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 @@ -814,7 +846,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1022,7 +1054,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index e2cda146b..4da79d5cd 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -77,6 +77,20 @@ on: required: false type: string default: "[]" + run-eval: + type: boolean + required: false + default: false + eval-only: + description: "Run only evals (skip throughput benchmark)" + type: boolean + required: false + default: false + eval-conc: + description: "Concurrency to use for eval requests (overrides default max-of-conc-list)" + type: string + required: false + default: "" ref: description: "Git ref (branch/sha) to checkout" required: false @@ -96,6 +110,9 @@ env: CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} + RUN_EVAL: ${{ inputs.run-eval }} + EVAL_ONLY: ${{ inputs.eval-only }} + EVAL_CONC: ${{ inputs.eval-conc }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache @@ -116,7 +133,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}" + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Slurm cleanup (pre-run) @@ -146,9 +163,17 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | @@ -159,16 +184,26 @@ jobs: export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh - # Check if at least one result file was created - if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then - echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV - echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + if [ "${{ inputs.eval-only }}" = "true" ]; then + echo "Eval-only mode: skipping benchmark result file check" + # Verify eval produced results + if ! ls results*.json 1>/dev/null 2>&1; then + echo "Eval-only run failed: no results*.json files found." >&2 + exit 1 + fi else - echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 - exit 1 + # Check if at least one result file was created + if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then + echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV + echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + else + echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 + exit 1 + fi fi - name: Process results + if: ${{ !inputs.eval-only }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | @@ -189,11 +224,34 @@ jobs: done - name: Upload results + if: ${{ !inputs.eval-only }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json + - name: Upload eval results (if any) + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: | + meta_env.json + results*.json + sample*.jsonl + if-no-files-found: ignore + + - name: Verify eval scores + if: ${{ inputs.eval-only }} + run: python3 utils/evals/validate_scores.py + + - name: Cleanup eval outputs (post-upload) + if: ${{ always() && (inputs.run-eval || inputs.eval-only) }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Upload logs if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 797505eec..25bec61ee 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -140,6 +140,13 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index d6ecf76b0..487a4a0c3 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -38,6 +38,7 @@ jobs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} eval-config: ${{ steps.get-jobs.outputs.eval-config }} + multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -55,11 +56,13 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))") - MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))") + MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))") EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") + MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT echo "eval-config=$EVALS" >> $GITHUB_OUTPUT + echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT test-sweep-multi-node: needs: get-jobs @@ -97,6 +100,48 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-multi-node-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.multi-node-eval-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} ref: ${{ inputs.ref }} test-sweep-single-node: @@ -162,15 +207,15 @@ jobs: collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() }} + if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped') }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: result-prefix: "bmk" collect-evals: - needs: [test-sweep-evals] - if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }} + needs: [test-sweep-evals, test-sweep-multi-node-evals] + if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 4d61a918c..e3eaf1c3b 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -105,6 +105,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: false sweep-multi-node-8k1k: needs: setup @@ -189,6 +190,44 @@ jobs: run-eval: true eval-only: true + sweep-multi-node-evals: + needs: setup + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} + collect-results: needs: [ @@ -205,8 +244,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [sweep-evals, setup] - if: ${{ always() && needs.setup.result != 'skipped' && needs.sweep-evals.result != 'skipped' }} + needs: [sweep-evals, sweep-multi-node-evals, setup] + if: ${{ always() && needs.setup.result != 'skipped' && (needs.sweep-evals.result != 'skipped' || needs.sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/AGENTS.md b/AGENTS.md index 94c28e334..e64a903cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,8 +37,9 @@ InferenceX is an open-source, automated benchmarking system that continuously tr │ ├── workflows/ # GitHub Actions CI/CD │ │ ├── run-sweep.yml # Main performance sweep │ │ ├── e2e-tests.yml # End-to-end testing -│ │ ├── benchmark-tmpl.yml # Benchmark job template -│ │ └── collect-evals.yml # Eval results collection +│ │ ├── benchmark-tmpl.yml # Single-node benchmark job template +│ │ ├── benchmark-multinode-tmpl.yml # Multi-node benchmark job template +│ │ └── collect-evals.yml # Eval results collection │ └── configs/ # Master configuration files │ ├── nvidia-master.yaml │ ├── amd-master.yaml @@ -299,14 +300,27 @@ Evals run optional accuracy checks to ensure model outputs aren't degraded by in ### When Evals Run -Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run at two concurrency levels per configuration group: +Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval. -- **Highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) -- **Lower-median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +**Single-node** eval selection: +- All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +- Only on `8k1k` sequence length + +**Multi-node** eval selection: +- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) +- Only `8k1k` sequence length This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. -**Note**: Evals only run on `8k1k` sequence length. +**Workflow separation**: Eval jobs are independent from benchmark jobs: +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- `e2e-tests.yml`: `test-sweep-evals` and `test-sweep-multi-node-evals` +- Both use their respective benchmark templates with `eval-only: true` +- `collect-evals` depends only on eval jobs, not benchmark jobs + +**Multi-node eval infrastructure**: +- AMD (MI355X): `server.sh` skips `bench.sh` when `EVAL_ONLY=true`, runs lm-eval directly +- NVIDIA (GB200/GB300): Uses srt-slurm `infmax-eval` benchmark type with expanded `eval_context_length` ### Eval Framework: lm-eval @@ -336,13 +350,13 @@ All benchmark scripts in `benchmarks/` follow one of two flows: ```bash # Combined mode (benchmark + eval): -# 1. Start server +# 1. Start server (with --context-length expansion if EVAL_ONLY=true) # 2. wait_for_server_ready -# 3. run_benchmark_serving (throughput) -# 4. Conditionally run evals: +# 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) +# 4. Run evals: if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary + append_lm_eval_summary # Writes meta_env.json and moves artifacts fi # Eval-only mode (EVAL_ONLY=true): @@ -353,6 +367,16 @@ fi # 5. run_eval + append_lm_eval_summary ``` +**Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`): +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Copies eval artifacts to `/run_logs/slurm_job-*/eval_results/` + +**Multi-node NVIDIA** (GB200/GB300 via srt-slurm): +- Uses `benchmark.type: "infmax-eval"` in srt-slurm config +- `benchmark.eval_context_length` expands server context for eval +- `infmax-eval` benchmark runner sources `benchmark_lib.sh` from `INFMAX_WORKSPACE` + ### Key Eval Functions in `benchmarks/benchmark_lib.sh` | Function | Description | diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 535313252..403484998 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -506,13 +506,13 @@ _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then - if ! python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + if ! python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@${lm_eval_ref}"; then - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi else - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi } @@ -593,14 +593,23 @@ PY get_native_max_context_length() { local model_path="$1" + # Prefer MODEL_PATH (local model directory) when available, since the + # argument may be a served-model name that is neither a valid HF repo + # ID nor a local path (e.g. "deepseek-r1-fp4" on the B300 cluster). + if [ -n "${MODEL_PATH:-}" ] && [ -d "${MODEL_PATH}" ]; then + model_path="${MODEL_PATH}" + fi python3 -c " -from transformers import AutoConfig -config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) -for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: - if hasattr(config, attr): - print(getattr(config, attr)) - break -else: +try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) + for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break + else: + print(0) +except Exception: print(0) " } @@ -708,8 +717,32 @@ append_lm_eval_summary() { # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" local model_name="${MODEL_NAME:-$MODEL}" + local is_multinode_json="false" + if [ "${IS_MULTINODE:-false}" = "true" ]; then + is_multinode_json="true" + fi + + local prefill_tp="${PREFILL_TP:-${TP:-1}}" + local prefill_ep="${PREFILL_EP:-${EP_SIZE:-1}}" + local prefill_num_workers="${PREFILL_NUM_WORKERS:-1}" + local decode_tp="${DECODE_TP:-${TP:-1}}" + local decode_ep="${DECODE_EP:-${EP_SIZE:-1}}" + local decode_num_workers="${DECODE_NUM_WORKERS:-1}" + local dp_json="false" - if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + if [ "${DP_ATTENTION:-false}" = "true" ]; then dp_json="true"; fi + local prefill_dp_json="$dp_json" + if [ "${PREFILL_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + prefill_dp_json="true" + else + prefill_dp_json="false" + fi + local decode_dp_json="$dp_json" + if [ "${DECODE_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + decode_dp_json="true" + else + decode_dp_json="false" + fi # Derive framework/precision from env, fallback to parsing RESULT_FILENAME # RESULT_FILENAME format (from workflow): @@ -734,6 +767,7 @@ append_lm_eval_summary() { fi cat > "${meta_json}" </dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." + else + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 + + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME + # are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR" + fi + + popd + fi + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index 802106350..be22b8d33 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -105,6 +105,17 @@ export BENCH_NUM_PROMPTS_MULTIPLIER=10 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +# Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) +export RUN_EVAL="${RUN_EVAL:-false}" +export EVAL_ONLY="${EVAL_ONLY:-false}" +export EVAL_CONC="${EVAL_CONC:-}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" +export SPEC_DECODING="${SPEC_DECODING:-}" + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. # SLURM writes output files on the batch node, so /tmp won't work (node-local). # Defaults to a sibling directory of the submit working directory. diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..fb9bb7b22 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -36,7 +36,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 @@ -65,6 +65,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -98,7 +99,17 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) @@ -162,45 +173,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 68da9f2b7..c718dcad0 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -16,11 +16,11 @@ fi # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared B300 cluster. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2" + export MODEL_PATH="/data/models/dsr1-fp4" export SERVED_MODEL_NAME="deepseek-r1-fp4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528" + export MODEL_PATH="/data/models/dsr1-fp8" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else @@ -35,7 +35,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 @@ -64,6 +64,7 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" +export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -99,7 +100,17 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=x86_64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." + +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) @@ -163,45 +174,63 @@ echo "Found logs directory: $LOGS_DIR" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi -echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..d84c0ac13 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -50,6 +50,8 @@ NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -105,6 +107,14 @@ PY fi +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" if [ -d "$SRT_REPO_DIR" ]; then @@ -112,7 +122,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -164,6 +174,9 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name @@ -223,51 +236,71 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 fi -echo "Found logs directory: $LOGS_DIR" +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - -echo "All result files processed" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index d71fd5af7..91147d90d 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -31,6 +31,8 @@ NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#] srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -41,7 +43,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -95,8 +97,17 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" @@ -150,54 +161,74 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 fi -echo "Found logs directory: $LOGS_DIR" - -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" -else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done done - done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" fi -echo "All result files processed" +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..91cea74f3 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -41,7 +41,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -75,6 +75,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,7 +113,17 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" sed -i "/^name:.*/a sbatch_directives:\n exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE" @@ -177,45 +188,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..3e7032314 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -40,7 +40,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -74,6 +74,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,7 +113,17 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) @@ -176,45 +187,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 52e28e9b8..aa55d35e5 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -51,6 +51,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then mkdir -p "$BENCHMARK_LOGS_DIR" sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + # Ensure root-owned files are cleaned up even on early exit to prevent + # EACCES errors when the next GH Actions job checks out on this runner + trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" @@ -101,33 +105,50 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + cat > collect_latest_results.py <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + # Result JSON are contained within the result directory + for result_file in $(find $LOGS_DIR -type f); do + # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json + file_name=$(basename $result_file) + if [ -f $result_file ]; then + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done fi - echo "Found logs directory: $LOGS_DIR" - ls -la "$LOGS_DIR" - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE + # Extract eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + # Find eval_results in the slurm job logs directory + EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1) + if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs" fi - done + fi echo "All result files processed" # Use sync scancel to ensure nfs file handle is released in time @@ -146,6 +167,9 @@ PY echo "Logs copied to $ARTIFACT_DIR for artifact upload" fi + # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + else export HF_HUB_CACHE_MOUNT="/var/lib/hf-hub-cache/" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 1c2f6429b..18917447e 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -10,7 +10,8 @@ from summarize import ( load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, - SPEC_DECODING + SPEC_DECODING, PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS ) @@ -160,19 +161,67 @@ def se(x: Any) -> str: return '' +def as_int(x: Any, default: int = 0) -> int: + """Convert a metadata field to int with a fallback.""" + try: + return int(x) + except Exception: + return default + + +def as_bool(x: Any, default: bool = False) -> bool: + """Parse a metadata boolean stored as bool/string/int.""" + if isinstance(x, bool): + return x + if x is None: + return default + return str(x).lower() == 'true' + + def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: """Build a result row from metadata and extracted metrics.""" + is_multinode = as_bool(meta.get('is_multinode'), False) + prefill_tp = as_int(meta.get('prefill_tp', meta.get('tp', 1)), 1) + prefill_ep = as_int(meta.get('prefill_ep', meta.get('ep', 1)), 1) + prefill_num_workers = as_int(meta.get('prefill_num_workers', 1), 1) + decode_tp = as_int(meta.get('decode_tp', meta.get('tp', 1)), 1) + decode_ep = as_int(meta.get('decode_ep', meta.get('ep', 1)), 1) + decode_num_workers = as_int(meta.get('decode_num_workers', 1), 1) + prefill_dp_attention = meta.get('prefill_dp_attention') + decode_dp_attention = meta.get('decode_dp_attention') + dp_attention = meta.get('dp_attention', 'none') + + if prefill_dp_attention is None: + prefill_dp_attention = dp_attention + if decode_dp_attention is None: + decode_dp_attention = dp_attention + + if is_multinode: + if prefill_dp_attention == decode_dp_attention: + dp_attention = prefill_dp_attention + else: + dp_attention = f"prefill={str(prefill_dp_attention).lower()},decode={str(decode_dp_attention).lower()}" + row = { + 'is_multinode': is_multinode, 'model_prefix': meta.get('infmax_model_prefix', 'unknown'), 'model': m.get('model') or meta.get('model', 'unknown'), 'hw': meta.get('hw', 'unknown').upper(), 'framework': meta.get('framework', 'unknown').lower(), 'precision': meta.get('precision', 'unknown').lower(), 'spec_decoding': meta.get('spec_decoding', 'unknown'), - 'tp': int(meta.get('tp', 1)), - 'ep': int(meta.get('ep', 1)), - 'conc': int(meta.get('conc', 0)), - 'dp_attention': str(meta.get('dp_attention', "none")).lower(), + 'tp': as_int(meta.get('tp', prefill_tp), prefill_tp), + 'ep': as_int(meta.get('ep', prefill_ep), prefill_ep), + 'prefill_tp': prefill_tp, + 'prefill_ep': prefill_ep, + 'prefill_num_workers': prefill_num_workers, + 'decode_tp': decode_tp, + 'decode_ep': decode_ep, + 'decode_num_workers': decode_num_workers, + 'conc': as_int(meta.get('conc', 0), 0), + 'dp_attention': str(dp_attention).lower(), + 'prefill_dp_attention': str(prefill_dp_attention).lower(), + 'decode_dp_attention': str(decode_dp_attention).lower(), 'task': m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), @@ -226,49 +275,111 @@ def main(): row = build_row(meta, m) rows.append(row) + single_node_rows = [r for r in rows if not r['is_multinode']] + multinode_rows = [r for r in rows if r['is_multinode']] + # Sort for stable output (default: by model_prefix) sort_by = sys.argv[3] if len(sys.argv) > 3 else 'model_prefix' - if sort_by == 'hw': - rows.sort(key=lambda r: ( - r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + single_node_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['tp'], r['ep'], r['conc'], )) - else: - rows.sort(key=lambda r: ( - r['model_prefix'], r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'], + )) + ) + multinode_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], + )) + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], )) + ) + single_node_rows.sort(key=single_node_sort_key) + multinode_rows.sort(key=multinode_sort_key) if not rows: print('> No eval results found to summarize.') else: # Print table using tabulate MODEL_PREFIX = "Model Prefix" - headers = [ - MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION, - TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL - ] - - table_rows = [ - [ - r['model_prefix'], - r['hw'], - r['framework'].upper(), - r['precision'].upper(), - r['spec_decoding'], - r['tp'], - r['ep'], - r['conc'], - r['dp_attention'], - r['task'], - f"{pct(r['score'])}{se(r['score_se'])}", - f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", - f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", - r['n_eff'] or '', - r['model'] + + if single_node_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in single_node_rows + ] + print("### Single-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + if multinode_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, + CONC, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['conc'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in multinode_rows ] - for r in rows - ] + if single_node_rows: + print("\n") + print("### Multi-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) - print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate out_path = Path(f'agg_eval_{exp_name}.json') diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index e32d6d988..f729d5f24 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -6,21 +6,54 @@ Quick graded QnA which measures model performance. Examples of test suites: - **gpqa**: Graduate level, Google-Proof multiple choice questions ## When? -At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` +Evals run as **separate workflow jobs** from throughput benchmarks. The selection logic is in `mark_eval_entries()` of `utils/matrix_logic/generate_sweep_configs.py`. + +**Single-node**: At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. + +**Multi-node**: One entry per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) with the highest max concurrency, only for 8k1k. ## Why? -To verify how model outputs are affected by throughput optimizations. +To verify how model outputs are affected by throughput optimizations. - TP/Conc might affect model outputs - Check kernel implementations for correctness - If there was a tradeoff in accuracy for performance ## How? -- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +`run_eval` in `benchmarks/benchmark_lib.sh` runs EleutherAI/lm-evaluation-harness against the server's OpenAI-compatible endpoint. Concurrency is set via `EVAL_CONCURRENT_REQUESTS` env var (not a CLI flag). Results are collected by `utils/collect_eval_results.py` and published as a summary table. + +### Single-node +In eval-only mode (`EVAL_ONLY=true`), the benchmark script starts the server with expanded context length (via `compute_eval_context_length`), skips throughput, and runs lm-eval directly. Each framework handles the context expansion differently (`--context-length` for SGLang, `--max_seq_len` for TRT-LLM). + +### Multi-node +Multi-node evals support three hardware paths: + +**MI355X (AMD)** — `benchmarks/multi_node/amd_utils/server.sh` +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Concurrency derived from max of `BENCH_MAX_CONCURRENCY` (x-separated values) +- Eval artifacts copied to `/run_logs/slurm_job-*/eval_results/` +- `runners/launch_mi355x-amds.sh` skips benchmark result collection when `EVAL_ONLY=true` and uses `find` to locate eval results + +**GB200/GB300 (NVIDIA)** — via [srt-slurm fork](https://github.com/Oseltamivir/srt-slurm) (`sa-submission-q1-2026` branch) +- `do_sweep.py` skips the benchmark stage when `EVAL_ONLY=true`, runs `_run_post_eval()` directly +- In eval-only mode, uses the full `wait_for_model()` health check (same as benchmark stage) since the benchmark health check was skipped +- `lm-eval` benchmark runner (`benchmarks/lm_eval.py`) sources InferenceX's `benchmark_lib.sh` from the mounted workspace (`/infmax-workspace`) +- Eval artifacts written to `/logs/eval_results/` inside the container, collected by launch scripts +- `runners/launch_gb200-nv.sh` and `launch_gb300-nv.sh` always collect server logs (for debugging) but skip benchmark result collection when `EVAL_ONLY=true` +- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/DP_ATTN`, `DECODE_TP/EP/DP_ATTN`, `MODEL_NAME`, `EVAL_CONC` + +### Workflow structure +- `e2e-tests.yml`: `test-sweep-evals` (single-node) and `test-sweep-multi-node-evals` (multi-node) +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- Both use their respective benchmark templates with `eval-only: true`, `run-eval: true` +- `collect-evals` depends on both eval jobs; `collect-results` only runs when benchmark jobs ran +- `process_changelog.py` splits eval results into `evals` (single-node) and `multinode_evals` + +### Score validation +`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails. ## Misc Following files are task definitions from lmeval, more info on changes within the files - `utils/evals/gsm8k.yaml` - `utils/evals/gpqa_diamond.yaml` - - diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 2a336c960..9682c1423 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -33,26 +33,34 @@ def seq_len_to_str(isl: int, osl: int) -> str: return seq_len_itos.get((isl, osl), f"{isl}_{osl}") def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - """Eval selection policy (single-node only): - - Only consider 8k1k (isl=8192, osl=1024). - - For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn): + """Eval selection policy: + - Single-node: only consider 8k1k (isl=8192, osl=1024). + For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn): - Mark all entries at the highest CONC (all TPs) - Mark all entries at the median CONC (all TPs) + - Multi-node: for each unique (model, runner, framework, precision, + spec-decoding, prefill-dp-attn, decode-dp-attn), only 8k1k entries. + Mark the entry with the highest max concurrency. Sets eval-conc to the + median of the conc list to avoid OOM during eval. """ from collections import defaultdict - # Only run evals on 8k1k target_isl, target_osl = seq_len_stoi["8k1k"] - # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). - # Only include entries that have a top-level TP (i.e., single-node schema). - groups = defaultdict(list) + eval_indices = set() + mn_eval_conc = {} # index -> chosen eval concurrency for multinode entries + + def _max_conc(ie): + c = ie[1][Fields.CONC.value] + return max(c) if isinstance(c, list) else c + + # Single-node: group by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). + # Only 8k1k entries with a top-level TP (single-node schema). + sn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value not in entry: continue - if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: continue - key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], @@ -61,27 +69,53 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: entry[Fields.ISL.value], entry[Fields.OSL.value], entry[Fields.SPEC_DECODING.value], - entry[Fields.DP_ATTN.value] + entry[Fields.DP_ATTN.value], ) - groups[key].append((i, entry)) - - # For each group, select entries at highest CONC and median CONC (all TPs) - eval_indices = set() - for key, entries in groups.items(): - if not entries: - continue + sn_groups[key].append((i, entry)) + for entries in sn_groups.values(): conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries)) median_conc = conc_values[len(conc_values) // 2] target_concs = {conc_values[-1], median_conc} - for i, e in entries: if e[Fields.CONC.value] in target_concs: eval_indices.add(i) + # Multi-node: group by (model, runner, framework, precision, spec-decoding, prefill-dp, decode-dp). + # Only 8k1k entries with a prefill key (multi-node schema). + # Pick the entry with the highest max concurrency per group. + mn_groups = defaultdict(list) + for i, entry in enumerate(matrix_values): + if Fields.TP.value in entry: + continue + if Fields.PREFILL.value not in entry: + continue + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + entry[Fields.SPEC_DECODING.value], + entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value), + entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value), + ) + mn_groups[key].append((i, entry)) + + for entries in mn_groups.values(): + best_idx, best_entry = max(entries, key=_max_conc) + eval_indices.add(best_idx) + # Set eval-conc to median of the conc list to avoid OOM during eval + conc = best_entry[Fields.CONC.value] + sorted_conc = sorted(conc) if isinstance(conc, list) else [conc] + mn_eval_conc[best_idx] = sorted_conc[len(sorted_conc) // 2] + # Mark the selected entries for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices + if i in mn_eval_conc: + entry[Fields.EVAL_CONC.value] = mn_eval_conc[i] return matrix_values @@ -557,9 +591,18 @@ def generate_test_config_sweep(args, all_config_data): runner = val[Fields.RUNNER.value] disagg = val.get(Fields.DISAGG.value, False) + # Build seq-len filter if --seq-lens was provided + seq_lens_filter = None + if getattr(args, 'seq_lens', None): + seq_lens_filter = {seq_len_stoi[s] for s in args.seq_lens} + for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: isl = seq_len_config[Fields.ISL.value] osl = seq_len_config[Fields.OSL.value] + + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + seq_len_str = seq_len_to_str(isl, osl) for bmk in seq_len_config[Fields.SEARCH_SPACE.value]: @@ -905,6 +948,13 @@ def main(): required=False, help='Only include these concurrency values. Values must exist in the config conc-range/list.' ) + test_config_keys_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help='Only include these sequence length configurations (e.g., 1k1k 8k1k)' + ) test_config_keys_parser.add_argument( '-h', '--help', action='help', diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 697d97de6..62a92c5ed 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -53,6 +53,7 @@ class Fields(Enum): # Eval RUN_EVAL = 'run-eval' + EVAL_CONC = 'eval-conc' """ @@ -126,6 +127,7 @@ class MultiNodeMatrixEntry(BaseModel): exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool run_eval: bool = Field(alias=Fields.RUN_EVAL.value) + eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: @@ -361,6 +363,7 @@ class ChangelogMatrixEntry(BaseModel): multi_node: dict[str, list[MultiNodeMatrixEntry] ] = Field(default_factory=dict) evals: list[SingleNodeMatrixEntry] = Field(default_factory=list) + multinode_evals: list[MultiNodeMatrixEntry] = Field(default_factory=list) changelog_metadata: ChangelogMetadata diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 7da19d030..9d231ad3c 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -82,6 +82,7 @@ def main(): "single_node": defaultdict(list), "multi_node": defaultdict(list), "evals": [], + "multinode_evals": [], "changelog_metadata": { "base_ref": args.base_ref, "head_ref": args.head_ref, @@ -163,7 +164,8 @@ def main(): else: final_results["single_node"][seq_len_str].append(result) - final_results["evals"] = all_eval_results + final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None] + final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None] # Validate final results structure validated = ChangelogMatrixEntry.model_validate(final_results)