From 033aa6ecd15be0257fa60c367eafc6f6bb97f9c7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 16:12:44 -0800 Subject: [PATCH 01/22] init --- .../workflows/benchmark-multinode-tmpl.yml | 23 +++++++++ .github/workflows/e2e-tests.yml | 1 + .github/workflows/run-sweep.yml | 1 + benchmarks/multi_node/amd_utils/job.slurm | 14 +++++ benchmarks/multi_node/amd_utils/server.sh | 51 +++++++++++++++++++ benchmarks/multi_node/amd_utils/submit.sh | 8 +++ runners/launch_mi355x-amds.sh | 14 +++++ utils/matrix_logic/generate_sweep_configs.py | 48 +++++++++++++++-- 8 files changed, 155 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index af3652e6b..6f2a6397a 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -77,6 +77,10 @@ on: required: false type: string default: "[]" + run-eval: + type: boolean + required: false + default: false ref: description: "Git ref (branch/sha) to checkout" required: false @@ -96,6 +100,7 @@ env: CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} + RUN_EVAL: ${{ inputs.run-eval }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -146,6 +151,7 @@ jobs: - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ 
join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | @@ -188,6 +194,23 @@ jobs: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: | + meta_env.json + results*.json + sample*.jsonl + if-no-files-found: ignore + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + - name: Slurm cleanup (post-run) if: always() run: *slurm-cleanup diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index c108b3960..2658b8d94 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -93,6 +93,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: ${{ matrix.config.run-eval }} ref: ${{ inputs.ref }} test-sweep-single-node: diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 22a71afd7..5d2d1dc51 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -105,6 +105,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: ${{ matrix.config.run-eval }} sweep-multi-node-1k8k: needs: setup diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b0352f24..fd37b583d 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -285,6 +285,14 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export DRY_RUN="${DRY_RUN:-0}" export 
BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +# Eval-related env vars (threaded from submit.sh) +export RUN_EVAL="${RUN_EVAL:-false}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" @@ -389,6 +397,12 @@ exec sudo docker run --rm \ -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e RUN_EVAL=\$RUN_EVAL \ + -e FRAMEWORK=\$FRAMEWORK \ + -e PRECISION=\$PRECISION \ + -e MODEL_PREFIX=\$MODEL_PREFIX \ + -e RUNNER_TYPE=\$RUNNER_TYPE \ + -e RESULT_FILENAME=\$RESULT_FILENAME \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index dadea4728..3bd0e5573 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -478,6 +478,57 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." 
+ + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Determine eval concurrency (cap at 64 for eval stability) + IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" + EVAL_CONC="${_conc_arr[0]:-32}" + (( EVAL_CONC > 64 )) && EVAL_CONC=32 + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONC}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME + # are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + + popd + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a2c3622b9..ddf5bcda7 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -103,6 +103,14 @@ export BENCH_NUM_PROMPTS_MULTIPLIER=10 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +# Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) +export RUN_EVAL="${RUN_EVAL:-false}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. # SLURM writes output files on the batch node, so /tmp won't work (node-local). # Defaults to a sibling directory of the submit working directory. 
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 2b9902b0b..20da5b5d6 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -129,6 +129,20 @@ PY fi done + # Extract eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$(dirname "$LOGS_DIR")/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi + fi + echo "All result files processed" # Use sync scancel to ensure nfs file handle is released in time set +x diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 48bac118f..e19acb164 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -33,12 +33,14 @@ def seq_len_to_str(isl: int, osl: int) -> str: return seq_len_itos.get((isl, osl), f"{isl}_{osl}") def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - """Eval selection policy (single-node only): + """Eval selection policy: - Only consider 1k8k (isl=1024, osl=8192). - - For each unique (model, runner, framework, precision, isl, osl, spec-decoding): - - Mark highest TP with highest conc - - Mark lowest TP with highest conc - + - Single-node: for each unique (model, runner, framework, precision, isl, osl, + spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP + with highest conc. + - Multi-node: for each unique (model, runner, framework, precision, isl, osl, + spec-decoding), mark the entry with the highest max concurrency. + Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. 
""" @@ -46,6 +48,8 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # Only run evals on 1k8k target_isl, target_osl = seq_len_stoi["1k8k"] + + # --- Single-node eval selection --- # Group entries by (model, runner, framework, precision, isl, osl) # Only include entries that have a top-level TP (i.e., single-node schema). # This avoids relying on structural hints like prefill/decode which may be @@ -98,6 +102,40 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if e[Fields.CONC.value] == max_conc_lowest_tp: eval_indices.add(i) + # --- Multi-node eval selection --- + # For multi-node (disaggregated) entries, pick one representative per group + # with the highest max concurrency. + mn_groups = defaultdict(list) + for i, entry in enumerate(matrix_values): + if Fields.TP.value in entry: + continue # single-node, already handled + if Fields.PREFILL.value not in entry: + continue + + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue + + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + entry[Fields.ISL.value], + entry[Fields.OSL.value], + entry[Fields.SPEC_DECODING.value], + ) + mn_groups[key].append((i, entry)) + + for key, entries in mn_groups.items(): + if not entries: + continue + # Pick entry with highest max concurrency + def _max_conc(ie): + c = ie[1][Fields.CONC.value] + return max(c) if isinstance(c, list) else c + best = max(entries, key=_max_conc) + eval_indices.add(best[0]) + # Mark the selected entries for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices From c177baaaad143e552c52caa9c8eaa0f5e41d15a7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 16:33:48 -0800 Subject: [PATCH 02/22] add mat --- utils/matrix_logic/generate_sweep_configs.py | 41 +++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git 
a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e19acb164..ca075d6f1 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -38,8 +38,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - Single-node: for each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - - Multi-node: for each unique (model, runner, framework, precision, isl, osl, - spec-decoding), mark the entry with the highest max concurrency. + - Multi-node: for each unique (model, runner, framework, precision, + spec-decoding), prefer 1k8k entries if available, otherwise fall back to + any seq-len. Mark the entry with the highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. @@ -103,8 +104,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: eval_indices.add(i) # --- Multi-node eval selection --- - # For multi-node (disaggregated) entries, pick one representative per group - # with the highest max concurrency. + # For multi-node (disaggregated) entries, pick one representative per group. + # Prefer 1k8k if available (matching single-node policy), otherwise fall back + # to whatever seq-len exists so eval coverage is not skipped entirely. + # Within a group, pick the entry with the highest max concurrency. 
mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -112,16 +115,11 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if Fields.PREFILL.value not in entry: continue - if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: - continue - key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], - entry[Fields.ISL.value], - entry[Fields.OSL.value], entry[Fields.SPEC_DECODING.value], ) mn_groups[key].append((i, entry)) @@ -129,11 +127,18 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: for key, entries in mn_groups.items(): if not entries: continue + + # Prefer 1k8k entries; fall back to all entries if none exist + preferred = [(i, e) for i, e in entries + if e.get(Fields.ISL.value) == target_isl + and e.get(Fields.OSL.value) == target_osl] + candidates = preferred if preferred else entries + # Pick entry with highest max concurrency def _max_conc(ie): c = ie[1][Fields.CONC.value] return max(c) if isinstance(c, list) else c - best = max(entries, key=_max_conc) + best = max(candidates, key=_max_conc) eval_indices.add(best[0]) # Mark the selected entries @@ -619,9 +624,18 @@ def generate_test_config_sweep(args, all_config_data): runner = val[Fields.RUNNER.value] disagg = val.get(Fields.DISAGG.value, False) + # Build seq-len filter if --seq-lens was provided + seq_lens_filter = None + if getattr(args, 'seq_lens', None): + seq_lens_filter = {seq_len_stoi[s] for s in args.seq_lens} + for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: isl = seq_len_config[Fields.ISL.value] osl = seq_len_config[Fields.OSL.value] + + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + seq_len_str = seq_len_to_str(isl, osl) for bmk in seq_len_config[Fields.SEARCH_SPACE.value]: @@ -930,6 +944,13 @@ def main(): required=False, help='Only include these concurrency values. 
Values must exist in the config conc-range/list.' ) + test_config_keys_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help='Only include these sequence length configurations (e.g., 1k1k 8k1k)' + ) test_config_keys_parser.add_argument( '-h', '--help', action='help', From 6988322ddbf3996be574e39104a220fc2a316d32 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 19 Feb 2026 19:17:30 -0800 Subject: [PATCH 03/22] Increase Eval Conc --- benchmarks/multi_node/amd_utils/server.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 3bd0e5573..1a441819c 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -490,8 +490,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Determine eval concurrency (cap at 64 for eval stability) IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" - EVAL_CONC="${_conc_arr[0]:-32}" - (( EVAL_CONC > 64 )) && EVAL_CONC=32 + EVAL_CONC="${_conc_arr[0]:-64}" if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" From c0d008ba5c75b3d8d6f01642c9b708c99c6ad2a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 22:13:35 -0800 Subject: [PATCH 04/22] 8k1k evals instead of 1k1k --- benchmarks/multi_node/amd_utils/submit.sh | 1 + utils/matrix_logic/generate_sweep_configs.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index ddf5bcda7..5aa476c63 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -110,6 +110,7 @@ export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" export RUNNER_TYPE="${RUNNER_TYPE:-}" export 
RESULT_FILENAME="${RESULT_FILENAME:-}" +export SPEC_DECODING="${SPEC_DECODING:-}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. # SLURM writes output files on the batch node, so /tmp won't work (node-local). diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index ca075d6f1..54c687059 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -39,8 +39,8 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding), prefer 1k8k entries if available, otherwise fall back to - any seq-len. Mark the entry with the highest max concurrency. + spec-decoding), prefer 1k8k entries; fall back to 8k1k if unavailable + (never 1k1k). Mark the entry with the highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. @@ -105,9 +105,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 1k8k if available (matching single-node policy), otherwise fall back - # to whatever seq-len exists so eval coverage is not skipped entirely. + # Prefer 1k8k; fall back to 8k1k if unavailable (never 1k1k). # Within a group, pick the entry with the highest max concurrency. 
+ fallback_isl, fallback_osl = seq_len_stoi["8k1k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -128,17 +128,22 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 1k8k entries; fall back to all entries if none exist + # Prefer 1k8k entries; fall back to 8k1k preferred = [(i, e) for i, e in entries if e.get(Fields.ISL.value) == target_isl and e.get(Fields.OSL.value) == target_osl] - candidates = preferred if preferred else entries + if not preferred: + preferred = [(i, e) for i, e in entries + if e.get(Fields.ISL.value) == fallback_isl + and e.get(Fields.OSL.value) == fallback_osl] + if not preferred: + continue # Pick entry with highest max concurrency def _max_conc(ie): c = ie[1][Fields.CONC.value] return max(c) if isinstance(c, list) else c - best = max(candidates, key=_max_conc) + best = max(preferred, key=_max_conc) eval_indices.add(best[0]) # Mark the selected entries From d73bf3d76862e94c9e34195c37690c6c1704edef Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 20 Feb 2026 09:45:51 -0800 Subject: [PATCH 05/22] reduce conc --- benchmarks/multi_node/amd_utils/server.sh | 99 +++++++++++++---------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 1a441819c..285945b02 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -482,50 +482,65 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "${RUN_EVAL:-false}" == "true" ]]; then echo "Running lm-eval evaluation on Node 0..." 
- # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Determine eval concurrency (cap at 64 for eval stability) - IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" - EVAL_CONC="${_conc_arr[0]:-64}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" - - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONC}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME - # are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no 
match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR" + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Cap eval concurrency at 32 for stability + EVAL_CONC=256 + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONC}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME + # are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + + popd fi - - popd fi # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) From ab179c73046b36891eb6e8f8b35b1dcda834faf6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 25 Feb 2026 11:44:24 -0800 Subject: [PATCH 06/22] Eval table missing spec decode --- benchmarks/multi_node/amd_utils/job.slurm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index fd37b583d..87d4dcc9d 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -292,6 +292,7 @@ export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" export RUNNER_TYPE="${RUNNER_TYPE:-}" export RESULT_FILENAME="${RESULT_FILENAME:-}" +export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" @@ -403,6 +404,7 @@ exec sudo docker run --rm \ -e MODEL_PREFIX=\$MODEL_PREFIX \ -e RUNNER_TYPE=\$RUNNER_TYPE \ -e RESULT_FILENAME=\$RESULT_FILENAME \ + -e SPEC_DECODING=\$SPEC_DECODING \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' From 86629313bfe0beea5dde71c8a61d9f9c88bd1458 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 11 Mar 2026 13:11:08 -0700 Subject: [PATCH 07/22] fix: force-reinstall pinned lm-eval to override Docker image version The sglang 0.5.8 Docker image ships a newer lm-eval 0.4.9.2 commit that defaults fewshot_as_multiturn=True for chat-completion models. Since the version string matches the pinned commit, pip silently skips the install. Adding --force-reinstall ensures the pinned commit is always used regardless of what's pre-installed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/benchmark_lib.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f69d3c418..326b796dd 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -489,13 +489,13 @@ _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then - if ! python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + if ! python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@${lm_eval_ref}"; then - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi else - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi } From d44f10d048b6f6d3d282aea2cdeaaf5e2fc491f1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 11 Mar 2026 16:31:59 -0700 Subject: [PATCH 08/22] add fp8 disagg no-DPA eval config to isolate DPA as variable Adds dsr1-fp8-mi355x-sglang-disagg-nodpa-eval: same image/model/precision as the DPA config but with dp-attn=false and ep=1. Running evals on this will tell us if DPA is the cause of the 0% GSM8K score or if it's something else about the fp8 disagg setup. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 337047e57..e1f3123e2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -595,6 +595,38 @@ dsr1-fp8-mi355x-atom-mtp: search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } +# Eval-only: fp8 disagg WITHOUT DPA — isolates DPA as variable +dsr1-fp8-mi355x-sglang-disagg-nodpa-eval: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 From e5c63dcff47c5ed244866bdb0d2e7b97e9abf871 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 26 Mar 2026 12:00:00 -0700 Subject: [PATCH 09/22] nvda evals --- .github/configs/amd-master.yaml | 4 +- .../workflows/benchmark-multinode-tmpl.yml | 61 ++++++++++++++++--- .github/workflows/e2e-tests.yml | 49 ++++++++++++++- .github/workflows/run-sweep.yml | 52 ++++++++++++---- AGENTS.md | 50 +++++++++++---- benchmarks/multi_node/amd_utils/job.slurm | 2 + benchmarks/multi_node/amd_utils/server.sh | 8 ++- runners/launch_gb200-nv.sh | 21 ++++++- runners/launch_gb300-nv.sh | 21 ++++++- runners/launch_mi355x-amds.sh | 56 ++++++++++------- utils/matrix_logic/generate_sweep_configs.py | 21 ++++--- utils/matrix_logic/validation.py | 2 + 
utils/process_changelog.py | 9 +++ 13 files changed, 284 insertions(+), 72 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index dddee854e..223a2bd07 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1061,7 +1061,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1269,7 +1269,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b94ac86a1..5d3035a5f 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -81,6 +81,11 @@ on: type: boolean required: false default: false + eval-only: + description: "Run only evals (skip throughput benchmark)" + type: boolean + required: false + default: false ref: description: "Git ref (branch/sha) to checkout" required: false @@ -101,6 +106,7 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + EVAL_ONLY: ${{ inputs.eval-only }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -119,7 +125,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ 
inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}" + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Slurm cleanup (pre-run) @@ -142,6 +148,9 @@ jobs: fi fi + - name: Clean up root-owned files from previous runs + run: sudo rm -rf benchmark_logs benchmark_artifacts 2>/dev/null || true + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} @@ -162,16 +171,26 @@ jobs: export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh - # Check if at least one result file was created - if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then - echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV - echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + if [ "${{ inputs.eval-only }}" = "true" ]; then + echo "Eval-only mode: skipping benchmark result file check" + # Verify eval produced results + if ! ls results*.json 1>/dev/null 2>&1; then + echo "Eval-only run failed: no results*.json files found." 
>&2 + exit 1 + fi else - echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 - exit 1 + # Check if at least one result file was created + if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then + echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV + echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + else + echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 + exit 1 + fi fi - name: Process results + if: ${{ !inputs.eval-only }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | @@ -192,13 +211,14 @@ jobs: done - name: Upload results + if: ${{ !inputs.eval-only }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json - name: Upload eval results (if any) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} @@ -208,8 +228,31 @@ jobs: sample*.jsonl if-no-files-found: ignore + - name: Verify eval scores + if: ${{ inputs.eval-only }} + run: | + python3 << 'PYEOF' + import json, glob, sys + MIN_SCORE = 0.85 + failed = False + for f in glob.glob("results*.json"): + with open(f) as fh: + data = json.load(fh) + for task, metrics in data.get("results", {}).items(): + for name, val in metrics.items(): + if not name.startswith("exact_match,") or "stderr" in name: + continue + if isinstance(val, (int, float)) and val < MIN_SCORE: + print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr) + failed = True + elif isinstance(val, (int, float)): + print(f"PASS: {task} {name} = {val:.4f}") + if failed: + sys.exit(1) + PYEOF + - name: Cleanup eval outputs (post-upload) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }} run: | rm -f meta_env.json || 
true rm -f results*.json || true diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index c3afe42d7..620addebe 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -54,9 +55,11 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") - MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))") + MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))") + MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT test-sweep-multi-node: needs: get-jobs @@ -94,7 +97,47 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-multi-node-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.multi-node-eval-config != '[]' }} + uses: 
./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} test-sweep-single-node: @@ -136,7 +179,7 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node, test-sweep-single-node] + needs: [test-sweep-multi-node-evals, test-sweep-single-node] if: ${{ always() }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index afd04c808..b575e706b 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -105,7 +105,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn 
}} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false sweep-multi-node-1k8k: needs: setup @@ -184,6 +184,45 @@ jobs: secrets: inherit with: *single-node-inputs + sweep-multi-node-evals: + needs: setup + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true + collect-results: needs: [ @@ -202,16 +241,7 @@ jobs: result-prefix: "bmk" collect-evals: - needs: - [ - 
sweep-single-node-1k1k, - sweep-single-node-1k8k, - sweep-single-node-8k1k, - sweep-multi-node-1k1k, - sweep-multi-node-1k8k, - sweep-multi-node-8k1k, - setup, - ] + needs: [sweep-multi-node-evals, setup] if: ${{ always() && needs.setup.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/AGENTS.md b/AGENTS.md index 6bb4a86c8..787978cfc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,8 +37,9 @@ InferenceX is an open-source, automated benchmarking system that continuously tr │ ├── workflows/ # GitHub Actions CI/CD │ │ ├── run-sweep.yml # Main performance sweep │ │ ├── e2e-tests.yml # End-to-end testing -│ │ ├── benchmark-tmpl.yml # Benchmark job template -│ │ └── collect-evals.yml # Eval results collection +│ │ ├── benchmark-tmpl.yml # Single-node benchmark job template +│ │ ├── benchmark-multinode-tmpl.yml # Multi-node benchmark job template +│ │ └── collect-evals.yml # Eval results collection │ └── configs/ # Master configuration files │ ├── nvidia-master.yaml │ ├── amd-master.yaml @@ -300,14 +301,27 @@ Evals run optional accuracy checks after throughput benchmarks to ensure model o ### When Evals Run -Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two representative points per configuration group: +Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval. 
-- **Lowest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) -- **Highest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) +**Single-node** eval selection (from PR #911): +- All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +- Only on `8k1k` sequence length + +**Multi-node** eval selection: +- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding) +- Prefers `8k1k`; falls back to `1k8k` (never `1k1k`) This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. -**Note**: Evals only run on `1k8k` sequence length. +**Workflow separation**: Eval jobs are independent from benchmark jobs: +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- `e2e-tests.yml`: `test-sweep-evals` and `test-sweep-multi-node-evals` +- Both use their respective benchmark templates with `eval-only: true` +- `collect-evals` depends only on eval jobs, not benchmark jobs + +**Multi-node eval infrastructure**: +- AMD (MI355X): `server.sh` skips `bench.sh` when `EVAL_ONLY=true`, runs lm-eval directly +- NVIDIA (GB200/GB300): Uses srt-slurm `infmax-eval` benchmark type with expanded `eval_context_length` ### Eval Framework: lm-eval @@ -329,19 +343,28 @@ python utils/matrix_logic/generate_sweep_configs.py full-sweep \ ### Eval Integration in Benchmark Scripts -All benchmark scripts in `benchmarks/` follow this pattern: - +**Single-node** scripts in `benchmarks/single_node/` follow this pattern: ```bash -# 1. Start server +# 1. Start server (with --context-length expansion if EVAL_ONLY=true) # 2. wait_for_server_ready -# 3. run_benchmark_serving (throughput) -# 4. Conditionally run evals: +# 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) +# 4. 
Run evals: if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary # Writes meta_env.json and moves artifacts fi ``` +**Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`): +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Copies eval artifacts to `/run_logs/slurm_job-*/eval_results/` + +**Multi-node NVIDIA** (GB200/GB300 via srt-slurm): +- Uses `benchmark.type: "infmax-eval"` in srt-slurm config +- `benchmark.eval_context_length` expands server context for eval +- `infmax-eval` benchmark runner sources `benchmark_lib.sh` from `INFMAX_WORKSPACE` + ### Key Eval Functions in `benchmarks/benchmark_lib.sh` | Function | Description | @@ -391,10 +414,13 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| | `RUN_EVAL` | `false` | Enable eval after throughput | +| `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | | `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | | `NUM_FEWSHOT` | `2` | Number of few-shot examples | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by compute_eval_context_length) | +| `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 87d4dcc9d..eb993f64e 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -287,6 +287,7 @@ export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" +export 
EVAL_ONLY="${EVAL_ONLY:-false}" export FRAMEWORK="${FRAMEWORK:-}" export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" @@ -399,6 +400,7 @@ exec sudo docker run --rm \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ -e RUN_EVAL=\$RUN_EVAL \ + -e EVAL_ONLY=\$EVAL_ONLY \ -e FRAMEWORK=\$FRAMEWORK \ -e PRECISION=\$PRECISION \ -e MODEL_PREFIX=\$MODEL_PREFIX \ diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 2bb686eca..9271c4382 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -468,7 +468,9 @@ if [ "$NODE_RANK" -eq 0 ]; then ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -501,8 +503,8 @@ if [ "$NODE_RANK" -eq 0 ]; then # Source eval functions from benchmark_lib.sh source /workspace/benchmarks/benchmark_lib.sh - # Cap eval concurrency at 32 for stability - EVAL_CONC=256 + # Use max concurrency from benchmark config (conc values are x-separated) + EVAL_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..e1ecc76a0 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -50,6 +50,8 @@ NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -112,7 +114,7 @@ if [ -d "$SRT_REPO_DIR" ]; 
then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -164,6 +166,9 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name @@ -271,3 +276,17 @@ else fi echo "All result files processed" + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index d71fd5af7..079d5169e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -31,6 +31,8 @@ NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#] srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -41,7 +43,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -95,6 +97,9 @@ cat srtslurm.yaml echo "Running make setup..." 
make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name @@ -199,6 +204,20 @@ fi echo "All result files processed" +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi + # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 406072b2d..cfc4862af 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -51,6 +51,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then mkdir -p "$BENCHMARK_LOGS_DIR" sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + # Ensure root-owned files are cleaned up even on early exit to prevent + # EACCES errors when the next GH Actions job checks out on this runner + trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" @@ -101,45 +105,48 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + cat > collect_latest_results.py <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), 
int(sys.argv[3]), int(sys.argv[4]) for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls -la "$LOGS_DIR" - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 fi - done + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + # Result JSON are contained within the result directory + for result_file in $(find $LOGS_DIR -type f); do + # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json + file_name=$(basename $result_file) + if [ -f $result_file ]; then + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + fi # Extract eval results if eval was requested if [[ "${RUN_EVAL:-false}" == "true" ]]; then - EVAL_DIR="$(dirname "$LOGS_DIR")/eval_results" - if [ -d "$EVAL_DIR" ]; then + # Find eval_results in the slurm job logs directory + EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1) + if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" for eval_file in "$EVAL_DIR"/*; do [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" echo "Copied eval artifact: $(basename "$eval_file")" done else - echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs" fi fi @@ -160,6 +167,9 @@ PY echo "Logs copied to $ARTIFACT_DIR for artifact upload" fi + # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + else export HF_HUB_CACHE_MOUNT="/var/lib/hf-hub-cache/" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 7f6bb11ea..850fecd6a 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -40,8 +40,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding), prefer 1k8k entries; fall back to 8k1k if unavailable - (never 1k1k). Mark the entry with the highest max concurrency. + spec-decoding, prefill-dp-attn, decode-dp-attn), prefer 8k1k entries; + fall back to 1k8k if unavailable (never 1k1k). Mark the entry with the + highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. 
@@ -106,9 +107,11 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 1k8k; fall back to 8k1k if unavailable (never 1k1k). + # Prefer 8k1k; fall back to 1k8k if unavailable (never 1k1k). # Within a group, pick the entry with the highest max concurrency. - fallback_isl, fallback_osl = seq_len_stoi["8k1k"] + # Multi-node: prefer 8k1k, fallback to 1k8k + mn_target_isl, mn_target_osl = seq_len_stoi["8k1k"] + fallback_isl, fallback_osl = seq_len_stoi["1k8k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -116,12 +119,16 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if Fields.PREFILL.value not in entry: continue + prefill_dp = entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value) + decode_dp = entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value) key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], entry[Fields.SPEC_DECODING.value], + prefill_dp, + decode_dp, ) mn_groups[key].append((i, entry)) @@ -129,10 +136,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 1k8k entries; fall back to 8k1k + # Prefer 8k1k entries; fall back to 1k8k preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == target_isl - and e.get(Fields.OSL.value) == target_osl] + if e.get(Fields.ISL.value) == mn_target_isl + and e.get(Fields.OSL.value) == mn_target_osl] if not preferred: preferred = [(i, e) for i, e in entries if e.get(Fields.ISL.value) == fallback_isl diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ad7658176..2e8626abe 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -360,6 +360,8 @@ class ChangelogMatrixEntry(BaseModel): ] = 
Field(default_factory=dict) multi_node: dict[str, list[MultiNodeMatrixEntry] ] = Field(default_factory=dict) + evals: list[SingleNodeMatrixEntry] = Field(default_factory=list) + multinode_evals: list[MultiNodeMatrixEntry] = Field(default_factory=list) changelog_metadata: ChangelogMetadata diff --git a/utils/process_changelog.py b/utils/process_changelog.py index d17fc3729..6b4c7878c 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -81,6 +81,8 @@ def main(): final_results = { "single_node": defaultdict(list), "multi_node": defaultdict(list), + "evals": [], + "multinode_evals": [], "changelog_metadata": { "base_ref": args.base_ref, "head_ref": args.head_ref, @@ -131,6 +133,7 @@ def main(): all_results.extend(json.loads(result.stdout)) + all_eval_results = [] for result in all_results: seq_len_str = seq_len_to_str(result["isl"], result["osl"]) if "prefill" in result and result["prefill"] is not None: @@ -138,6 +141,12 @@ def main(): else: final_results["single_node"][seq_len_str].append(result) + if result.get("run-eval"): + all_eval_results.append(result) + + final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None] + final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None] + # Validate final results structure validated = ChangelogMatrixEntry.model_validate(final_results) print(validated.model_dump_json(by_alias=True)) From 7215f1f88929dde27ba1173ce0b519235c26be8d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 11:00:07 -0700 Subject: [PATCH 10/22] merge main --- .github/configs/amd-master.yaml | 306 +---- .github/configs/nvidia-master.yaml | 1145 ++--------------- .github/workflows/README.md | 6 +- .../workflows/benchmark-multinode-tmpl.yml | 24 +- .github/workflows/benchmark-tmpl.yml | 60 +- .github/workflows/claude.yml | 7 +- .github/workflows/e2e-tests.yml | 42 +- .github/workflows/profile.yml | 5 +- 
.github/workflows/run-sweep.yml | 66 +- AGENTS.md | 59 +- benchmarks/benchmark_lib.sh | 128 +- benchmarks/single_node/dsr1_fp4_b200.sh | 9 +- benchmarks/single_node/dsr1_fp4_b200_trt.sh | 8 +- .../single_node/dsr1_fp4_b200_trt_mtp.sh | 12 +- benchmarks/single_node/dsr1_fp4_mi355x.sh | 9 +- .../single_node/dsr1_fp4_mi355x_atom.sh | 7 +- .../single_node/dsr1_fp4_mi355x_atom_mtp.sh | 7 +- benchmarks/single_node/dsr1_fp8_b200.sh | 9 +- benchmarks/single_node/dsr1_fp8_b200_mtp.sh | 9 +- benchmarks/single_node/dsr1_fp8_b200_trt.sh | 16 +- .../single_node/dsr1_fp8_b200_trt_mtp.sh | 17 +- benchmarks/single_node/dsr1_fp8_h200.sh | 14 +- benchmarks/single_node/dsr1_fp8_h200_trt.sh | 8 +- .../single_node/dsr1_fp8_h200_trt_mtp.sh | 7 +- benchmarks/single_node/dsr1_fp8_mi300x.sh | 9 +- benchmarks/single_node/dsr1_fp8_mi325x.sh | 10 +- benchmarks/single_node/dsr1_fp8_mi355x.sh | 9 +- .../single_node/dsr1_fp8_mi355x_atom.sh | 7 +- .../single_node/dsr1_fp8_mi355x_atom_mtp.sh | 8 +- benchmarks/single_node/glm5_fp8_b200.sh | 9 +- benchmarks/single_node/glm5_fp8_h200.sh | 10 +- benchmarks/single_node/glm5_fp8_mi355x.sh | 9 +- benchmarks/single_node/glm5_nvfp4_b200.sh | 81 ++ benchmarks/single_node/gptoss_fp4_b200.sh | 9 +- benchmarks/single_node/gptoss_fp4_b200_trt.sh | 8 +- benchmarks/single_node/gptoss_fp4_h100.sh | 14 +- benchmarks/single_node/gptoss_fp4_h200.sh | 12 +- benchmarks/single_node/gptoss_fp4_h200_trt.sh | 14 +- benchmarks/single_node/gptoss_fp4_mi300x.sh | 6 +- benchmarks/single_node/gptoss_fp4_mi325x.sh | 6 +- benchmarks/single_node/gptoss_fp4_mi355x.sh | 6 +- .../single_node/gptoss_fp4_mi355x_atom.sh | 7 +- benchmarks/single_node/kimik2.5_fp4_b200.sh | 7 +- benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 8 +- benchmarks/single_node/kimik2.5_int4_b200.sh | 6 +- benchmarks/single_node/kimik2.5_int4_h200.sh | 8 +- .../single_node/kimik2.5_int4_mi300x.sh | 77 ++ .../single_node/kimik2.5_int4_mi325x.sh | 10 +- .../single_node/kimik2.5_int4_mi355x.sh | 7 +- 
.../single_node/minimaxm2.5_fp8_b200.sh | 7 +- .../single_node/minimaxm2.5_fp8_h100.sh | 9 +- .../single_node/minimaxm2.5_fp8_h200.sh | 9 +- .../single_node/minimaxm2.5_fp8_mi300x.sh | 7 +- .../single_node/minimaxm2.5_fp8_mi325x.sh | 16 +- .../single_node/minimaxm2.5_fp8_mi355x.sh | 8 +- benchmarks/single_node/qwen3.5_bf16_b200.sh | 6 +- benchmarks/single_node/qwen3.5_bf16_mi300x.sh | 10 +- benchmarks/single_node/qwen3.5_bf16_mi325x.sh | 10 +- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 9 +- benchmarks/single_node/qwen3.5_fp8_b200.sh | 6 +- .../single_node/qwen3.5_fp8_b200_mtp.sh | 6 +- benchmarks/single_node/qwen3.5_fp8_h200.sh | 6 +- benchmarks/single_node/qwen3.5_fp8_mi300x.sh | 10 +- benchmarks/single_node/qwen3.5_fp8_mi325x.sh | 10 +- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 9 +- perf-changelog.yaml | 137 +- runners/launch_b200-dgxc.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_h200-nb.sh | 2 +- runners/launch_mi300x-amds.sh | 4 +- utils/bench_serving/backend_request_func.py | 75 +- utils/evals/EVALS.md | 4 +- utils/evals/gsm8k.yaml | 2 +- utils/evals/thresholds.json | 4 + utils/evals/validate_scores.py | 90 ++ utils/matrix_logic/generate_sweep_configs.py | 81 +- .../test_generate_sweep_configs.py | 3 - utils/process_changelog.py | 90 +- 78 files changed, 1328 insertions(+), 1643 deletions(-) create mode 100755 benchmarks/single_node/glm5_nvfp4_b200.sh create mode 100755 benchmarks/single_node/kimik2.5_int4_mi300x.sh create mode 100644 utils/evals/thresholds.json create mode 100644 utils/evals/validate_scores.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2c34a93f4..6890126cf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -12,10 +12,6 @@ dsr1-fp4-mi355x-sglang: search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 
search-space: @@ -35,11 +31,6 @@ dsr1-fp4-mi355x-atom: search-space: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 128, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -61,11 +52,6 @@ dsr1-fp4-mi355x-atom-mtp: search-space: - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -85,10 +71,6 @@ dsr1-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -107,10 +89,6 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -129,10 +107,6 @@ dsr1-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -152,10 +126,6 @@ qwen3.5-bf16-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -174,10 +144,6 @@ qwen3.5-bf16-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -196,10 +162,6 @@ qwen3.5-bf16-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, 
conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -218,10 +180,6 @@ qwen3.5-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -240,10 +198,6 @@ qwen3.5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -262,10 +216,6 @@ qwen3.5-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -284,10 +234,6 @@ glm5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -306,17 +252,13 @@ kimik2.5-int4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -328,8 +270,22 @@ kimik2.5-int4-mi325x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +kimik2.5-int4-mi300x-vllm: + image: vllm/vllm-openai-rocm:v0.18.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi300x + precision: int4 + framework: vllm + multinode: false + seq-len-configs: - isl: 1024 - osl: 8192 + osl: 1024 search-space: - { tp: 8, 
conc-start: 4, conc-end: 64 } - isl: 8192 @@ -351,12 +307,6 @@ kimik2.5-fp4-mi355x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -378,12 +328,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -405,11 +349,6 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -417,7 +356,7 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -429,17 +368,12 @@ minimaxm2.5-fp8-mi325x-vllm: osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -457,13 +391,6 @@ gptoss-fp4-mi300x-vllm: - { tp: 
2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -488,13 +415,6 @@ gptoss-fp4-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -518,12 +438,6 @@ gptoss-fp4-mi355x-vllm: - { tp: 1, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -545,11 +459,6 @@ gptoss-fp4-mi355x-atom: search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -570,10 +479,6 @@ dsr1-fp8-mi355x-atom: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -593,10 +498,6 @@ dsr1-fp8-mi355x-atom-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ 
-943,129 +844,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 
1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 @@ -1485,49 +1263,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - - # FIXME(billishyahao): disable FP4 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f4570fd2c..157a9b54c 100644 --- a/.github/configs/nvidia-master.yaml +++ 
b/.github/configs/nvidia-master.yaml @@ -1663,11 +1663,6 @@ dsr1-fp4-b200-sglang: search-space: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1694,17 +1689,6 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - # low concurrency cases use TP only - # concurrency 64 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -1737,17 +1721,6 @@ dsr1-fp4-b200-trt-mtp: - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # TP=4 configurations - - { tp: 4, conc-start: 16, conc-end: 16, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 8, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1772,10 +1745,6 @@ dsr1-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 
1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1795,10 +1764,6 @@ qwen3.5-bf16-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1818,11 +1783,6 @@ qwen3.5-fp8-b200-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1842,14 +1802,28 @@ glm5-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + +glm5-nvfp4-b200-sglang: + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.9-cu130 @@ -1864,10 +1838,6 @@ qwen3.5-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1886,10 +1856,6 @@ kimik2.5-int4-b200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ 
-1908,10 +1874,6 @@ kimik2.5-int4-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1931,10 +1893,6 @@ kimik2.5-fp4-b200-vllm: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1954,10 +1912,6 @@ dsr1-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1978,11 +1932,6 @@ dsr1-fp8-b200-trt: - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256} - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -2007,13 +1956,6 @@ dsr1-fp8-b200-trt-mtp: # If CONC == 256, then TP8, EP8, DP_ATTN=true - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # mostly TP8 - # If CONC >= 128, then TP8, EP8, DP_ATTN=true - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -2033,10 +1975,6 @@ dsr1-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2055,10 +1993,6 @@ 
qwen3.5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2077,10 +2011,6 @@ glm5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2101,11 +2031,6 @@ dsr1-fp8-h200-trt: # If CONC > 64, then DP_ATTN=true search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true @@ -2129,12 +2054,6 @@ dsr1-fp8-h200-trt-mtp: # If CONC >= 128, then DP_ATTN=true, MTP=1 - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -3149,14 +3068,6 @@ gptoss-fp4-b200-trt: - { tp: 4, conc-start: 4, conc-end: 4 } - { tp: 8, conc-start: 4, conc-end: 4 } # Low ==> high TP from Left to Right of pareto - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 256, conc-end: 256} - - { tp: 2, conc-start: 128, conc-end: 256} - - { tp: 4, conc-start: 4, conc-end: 256} - - { tp: 8, conc-start: 4, conc-end: 4} - # Low ==> high TP from Left to Right of pareto - isl: 8192 osl: 1024 search-space: @@ -3181,13 +3092,6 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, 
conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3210,11 +3114,6 @@ minimaxm2.5-fp8-b200-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3222,7 +3121,7 @@ minimaxm2.5-fp8-b200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -3236,12 +3135,6 @@ gptoss-fp4-h100-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3250,7 +3143,7 @@ gptoss-fp4-h100-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } minimaxm2.5-fp8-h100-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -3263,11 +3156,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3345,67 +3233,6 @@ dsr1-fp8-h100-dynamo-sglang: tp: 16 ep: 16 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # # STP: Max throughput TEP (1 prefill, 2 decode) - # - conc-list: [1, 2, 4, 8, 16, 32] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # 
additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 2 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - isl: 8192 osl: 1024 search-space: @@ -3485,13 +3312,6 @@ gptoss-fp4-h200-trt: - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3501,7 +3321,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b 
model-prefix: gptoss runner: h200 @@ -3516,13 +3336,6 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3532,7 +3345,7 @@ gptoss-fp4-h200-vllm: - { tp: 8, conc-start: 4, conc-end: 32 } minimaxm2.5-fp8-h200-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -3544,10 +3357,6 @@ minimaxm2.5-fp8-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -3744,8 +3553,8 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" @@ -3756,105 +3565,89 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 7 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [ 128 ] + conc-list: [ 180 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 512 ] + conc-list: [ 1229 ] prefill: - num-worker: 1 + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 3072 ] + conc-list: [ 666 ] prefill: - num-worker: 1 + num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: - num-worker: 3 - tp: 16 - ep: 16 + num-worker: 1 + tp: 32 + ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 6144 ] + conc-list: [ 4301 ] prefill: - num-worker: 1 + num-worker: 11 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: - num-worker: 3 + num-worker: 1 tp: 16 ep: 16 dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 8192 ] + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 12, 44, 76 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false - conc-list: [ 
5 ] prefill: num-worker: 1 @@ -3862,216 +3655,22 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [ 60 ] + - conc-list: [ 333 ] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" - decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 135 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 563 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: 
- # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 8192 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 11 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 12, 44, 76 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -4339,156 +3938,6 @@ dsr1-fp8-gb200-dynamo-trt: tp: 8 ep: 8 dp-attn: false - # 1k8k MTP configs - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2152] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [564] - prefill: - 
num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [72] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - # 1k8k STP configs - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [36] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false # 8k1k MTP configs - isl: 8192 osl: 1024 @@ -5079,343 +4528,164 @@ dsr1-fp4-gb300-dynamo-trt: - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" - decode: - num-worker: 1 - tp: 32 - 
ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 12, 24, 48] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12, 48, 96, 192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8192] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4301] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [2253] + - spec-decoding: "mtp" + conc-list: [333] 
prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" - conc-list: [7] + conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [63] + conc-list: [8, 12, 24, 48] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + 
num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [563] + conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [2088] + conc-list: [1229] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [16384] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [5] prefill: 
num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [7] + dp-attn: false + - conc-list: [12, 48, 96, 192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [60] + - conc-list: [8192] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [245] - 
prefill: num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 7 tp: 8 ep: 8 - dp-attn: false - - conc-list: [1024] + dp-attn: true + - conc-list: [1229] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [4096] + - conc-list: [4301] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - - conc-list: [8192] + - conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -6184,187 +5454,6 @@ dsr1-fp8-gb300-dynamo-trt: tp: 8 ep: 8 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [16] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [141] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [544] - 
prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2048] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [36] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [282] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [1024] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4096] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - gptoss-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 model: openai/gpt-oss-120b diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 
37e64b8ed..de0a3dcab 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -40,7 +40,7 @@ usage: generate_sweep_configs.py full-sweep [--precision PRECISION [PRECISION ...]] [--framework FRAMEWORK [FRAMEWORK ...]] [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]] - [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]] + [--seq-lens {1k1k,8k1k} [{1k1k,8k1k} ...]] [--step-size STEP_SIZE] [--max-conc MAX_CONC] [--max-tp MAX_TP] @@ -62,9 +62,9 @@ full-sweep --config-files .github/configs/nvidia-master.yaml full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml ``` -**Test all single-node fp8 precision configs for 1k8k workloads:** +**Test all single-node fp8 precision configs for 8k1k workloads:** ``` -full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml +full-sweep --single-node --precision fp8 --seq-lens 8k1k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml ``` **Test all single-node TRT configs on H200 runners:** diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 5d3035a5f..d529b7ccc 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -107,6 +107,8 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -156,6 +158,7 @@ jobs: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch multi-node job script env: @@ -230,26 +233,7 @@ jobs: - name: Verify eval scores if: ${{ inputs.eval-only }} - run: | - python3 << 'PYEOF' - import json, glob, sys - MIN_SCORE = 
0.85 - failed = False - for f in glob.glob("results*.json"): - with open(f) as fh: - data = json.load(fh) - for task, metrics in data.get("results", {}).items(): - for name, val in metrics.items(): - if not name.startswith("exact_match,") or "stderr" in name: - continue - if isinstance(val, (int, float)) and val < MIN_SCORE: - print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr) - failed = True - elif isinstance(val, (int, float)): - print(f"PASS: {task} {name} = {val:.4f}") - if failed: - sys.exit(1) - PYEOF + run: python3 utils/evals/validate_scores.py - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }} diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 16b587657..797505eec 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -54,6 +54,11 @@ on: type: boolean required: true default: false + eval-only: + description: "Run only evals (skip throughput benchmark)" + type: boolean + required: false + default: false random-range-ratio: required: false type: string @@ -83,6 +88,9 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + EVAL_ONLY: ${{ inputs.eval-only }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache permissions: contents: read @@ -91,7 +99,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 300 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}" + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ 
inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Resource cleanup (pre-run) run: &resource-cleanup | @@ -123,13 +131,14 @@ jobs: sleep 5 done fi - fi + fi - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch job script env: @@ -145,28 +154,42 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV bash ./runners/launch_${RUNNER_NAME%%_*}.sh - FOUND_RESULT_FILE= - for i in {1..10}; do - if [ -f "$RESULT_FILENAME.json" ]; then - FOUND_RESULT_FILE=true - break + + if [ "${{ inputs.eval-only }}" = "true" ]; then + echo "Eval-only mode: skipping benchmark result file check" + # Verify eval produced results + if ! ls results*.json 1>/dev/null 2>&1; then + echo "Eval-only run failed: no results*.json files found." >&2 + exit 1 fi - echo "Waiting for result file... (attempt $i)" - sleep 1 - done + # Verify eval scores meet per-benchmark minimum thresholds + python3 utils/evals/validate_scores.py + else + FOUND_RESULT_FILE= + for i in {1..10}; do + if [ -f "$RESULT_FILENAME.json" ]; then + FOUND_RESULT_FILE=true + break + fi + echo "Waiting for result file... (attempt $i)" + sleep 1 + done - if [ -z "$FOUND_RESULT_FILE" ]; then - echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2 - exit 1 + if [ -z "$FOUND_RESULT_FILE" ]; then + echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." 
>&2 + exit 1 + fi fi - name: Process result + if: ${{ !inputs.eval-only }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py - name: Upload result + if: ${{ !inputs.eval-only }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} @@ -176,7 +199,7 @@ jobs: if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: - name: server_logs_${{ env.RESULT_FILENAME }} + name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} path: server.log if-no-files-found: ignore @@ -184,12 +207,12 @@ jobs: if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: - name: gpu_metrics_${{ env.RESULT_FILENAME }} + name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }} path: gpu_metrics.csv if-no-files-found: ignore - name: Upload eval results (if any) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} @@ -197,14 +220,15 @@ jobs: meta_env.json results*.json sample*.jsonl - if-no-files-found: ignore + if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }} - name: Cleanup eval outputs (post-upload) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} run: | rm -f meta_env.json || true # Remove any eval results JSONs that were moved into workspace rm -f results*.json || true + rm -f sample*.jsonl || true - name: Resource cleanup (post-run) if: always() diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index 1be4b1b98..b5b474471 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -97,7 +97,7 @@ jobs: The `generate-cli-command` 
input accepts arguments for `generate_sweep_configs.py`. Usage: `generate_sweep_configs.py` `[-h]` `{full-sweep,runner-model-sweep,test-config}` **Subcommand reference:** - - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-len`. This is the primary subcommand for running benchmarks. + - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-lens`. This is the primary subcommand for running benchmarks. - `test-config`: Use this subcommand ONLY when prompted to with 'test-config'. Uses the flags `--config-files` and `--config-keys`, does NOT accept any other arguments. Examples: @@ -119,7 +119,7 @@ jobs: **Specify concurrency and sequence length:** ``` - generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-len 1k1k" + generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-lens 1k1k" ``` **Test specific config keys (MUST USE `--conc`):** @@ -130,7 +130,7 @@ jobs: **IMPORTANT: Keep runs precise and efficient:** - Use `full-sweep` with filter flags to narrow down the benchmark scope - "full-sweep" does NOT mean running everything - When using `full-sweep`, you must use `--min-conc` and `--max-conc` together to specify a single concurrency value. Unless prompted otherwise, use `--min-conc 4 --max-conc 4` - - When using `full-sweep`, you can use `--seq-len` to specify a single sequence length (choices: 1k1k, 1k8k, 8k1k). Unless prompted otherwise, use `--seq-len 1k1k` + - When using `full-sweep`, you can use `--seq-lens` to specify sequence lengths (choices: 1k1k, 8k1k). 
Unless prompted otherwise, use `--seq-lens 1k1k` - Use `test-config` ONLY when given specific config keys to test - Use `--config-files`, `--config-keys`, and `--conc` flags ONLY - Always filter by specific models, frameworks, precision, conc, or config keys when possible @@ -291,4 +291,3 @@ jobs: # Then use $EP in the vllm serve command ``` This ensures the script respects the `ep` setting in the master config YAML's search-space. - diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index af6410e2d..6765113b2 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + eval-config: ${{ steps.get-jobs.outputs.eval-config }} multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} steps: - name: Checkout code (ref) @@ -54,11 +55,13 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") + SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))") MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))") + EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and 
x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "eval-config=$EVALS" >> $GITHUB_OUTPUT echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT test-sweep-multi-node: @@ -167,7 +170,38 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.eval-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} collect-results: @@ -179,8 +213,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node-evals, test-sweep-single-node] - if: ${{ always() }} + needs: [test-sweep-evals, test-sweep-multi-node-evals] + if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index d72f54b8f..64e4ea531 100644 --- 
a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -35,6 +35,8 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' RANDOM_RANGE_RATIO: '0.8' PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache jobs: get-jobs: @@ -87,7 +89,7 @@ jobs: - name: Fail if no matching entries if: ${{ steps.filter.outputs.count == '0' }} run: | - echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2 + echo "No entries produced for config-key=${{ inputs.config-key }}, conc=${{ inputs.conc }}." >&2 exit 1 profile: @@ -153,6 +155,7 @@ jobs: with: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch + Profile (single-node sglang/vllm) id: run diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index a73eb5bbc..a87f7ee13 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -107,18 +107,6 @@ jobs: decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} run-eval: false - sweep-multi-node-1k8k: - needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: multi-node 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }} - secrets: inherit - with: *multi-node-inputs - sweep-multi-node-8k1k: needs: setup if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} @@ -160,29 +148,47 @@ jobs: disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - sweep-single-node-1k8k: + sweep-single-node-8k1k: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null' }} + if: ${{ 
toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 1k8k / + name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs - sweep-single-node-8k1k: + sweep-evals: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 8k1k / + name: eval / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }} secrets: inherit - with: *single-node-inputs + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true sweep-multi-node-evals: needs: setup @@ -195,6 +201,7 @@ jobs: config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }} secrets: inherit with: + exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} 
osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} @@ -204,17 +211,14 @@ jobs: model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - prefill-num-worker: ${{ matrix.config.prefill.num-worker }} prefill-tp: ${{ matrix.config.prefill.tp }} prefill-ep: ${{ matrix.config.prefill.ep }} prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} - decode-num-worker: ${{ matrix.config.decode.num-worker }} decode-tp: ${{ matrix.config.decode.tp }} decode-ep: ${{ matrix.config.decode.ep }} @@ -227,10 +231,8 @@ jobs: needs: [ sweep-single-node-1k1k, - sweep-single-node-1k8k, sweep-single-node-8k1k, sweep-multi-node-1k1k, - sweep-multi-node-1k8k, sweep-multi-node-8k1k, setup, ] @@ -241,8 +243,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [sweep-multi-node-evals, setup] - if: ${{ always() && needs.setup.result != 'skipped' }} + needs: [sweep-evals, sweep-multi-node-evals, setup] + if: ${{ always() && needs.setup.result != 'skipped' && (needs.sweep-evals.result != 'skipped' || needs.sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit @@ -252,10 +254,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Extract and save changelog metadata - env: - CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }} run: | - echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json + cat <<'CONFIGEOF' > _full_config.json + ${{ needs.setup.outputs.search-space-config }} + CONFIGEOF + jq '.changelog_metadata' _full_config.json > changelog_metadata.json + rm -f _full_config.json - name: Upload changelog artifact uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/AGENTS.md b/AGENTS.md index 787978cfc..e64a903cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -76,27 +76,27 @@ python -m pytest matrix_logic/ -v ```bash # Full sweep with all configs python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml + --config-files .github/configs/nvidia-master.yaml # Filter by model prefix (dsr1 or gptoss) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --model dsr1 + --config-files .github/configs/nvidia-master.yaml \ + --model-prefix dsr1 # Filter by framework (sglang, trt, vllm, atom, dynamo-trt, dynamo-sglang) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework sglang # Filter by precision (fp4, fp8) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --precision fp8 # Filter by runner type (b200, h100, h200, gb200, mi300x, mi325x, mi355x) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --runner b200 + --config-files .github/configs/nvidia-master.yaml \ + --runner-type b200 ``` ### Processing Results @@ -141,7 +141,6 @@ When working with benchmark configurations, use these valid values: **Sequence Lengths (ISL/OSL)**: - `1k1k` - 1024 input / 1024 output -- `1k8k` - 1024 input / 8192 output - `8k1k` - 8192 input / 1024 output ## Code Conventions @@ -267,7 +266,7 @@ dsr1-fp8-h200-dynamo-sglang: **7. 
Validate configuration:** ```bash python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework dynamo-sglang ``` @@ -297,19 +296,19 @@ When upgrading Docker images in benchmark scripts and master configs .yaml: ## Evals (Accuracy Validation) -Evals run optional accuracy checks after throughput benchmarks to ensure model outputs aren't degraded by inference optimizations. +Evals run optional accuracy checks to ensure model outputs aren't degraded by inference optimizations. They can run alongside benchmarks or independently in eval-only mode. ### When Evals Run Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval. -**Single-node** eval selection (from PR #911): +**Single-node** eval selection: - All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) - Only on `8k1k` sequence length **Multi-node** eval selection: -- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding) -- Prefers `8k1k`; falls back to `1k8k` (never `1k1k`) +- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) +- Only `8k1k` sequence length This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. 
@@ -330,21 +329,27 @@ The default eval framework is [lm-evaluation-harness](https://github.com/Eleuthe ### Running Evals via CLI ```bash -# Generate configs with evals marked (in addition to all configs) +# Generate configs (evals marked by default on 8k1k subset) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --run-evals + --config-files .github/configs/nvidia-master.yaml + +# Generate throughput-only configs (skip evals) +python utils/matrix_logic/generate_sweep_configs.py full-sweep \ + --config-files .github/configs/nvidia-master.yaml \ + --no-evals # Generate ONLY the eval subset (excludes non-eval configs) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --evals-only ``` ### Eval Integration in Benchmark Scripts -**Single-node** scripts in `benchmarks/single_node/` follow this pattern: +All benchmark scripts in `benchmarks/` follow one of two flows: + ```bash +# Combined mode (benchmark + eval): # 1. Start server (with --context-length expansion if EVAL_ONLY=true) # 2. wait_for_server_ready # 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) @@ -353,6 +358,13 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary # Writes meta_env.json and moves artifacts fi + +# Eval-only mode (EVAL_ONLY=true): +# 1. Compute expanded context via compute_eval_context_length +# 2. Start server with expanded context (--context-length or --max-model-len) +# 3. wait_for_server_ready +# 4. run_benchmark_serving returns immediately (skipped) +# 5. 
run_eval + append_lm_eval_summary ``` **Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`): @@ -374,6 +386,8 @@ fi | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace | | `_install_lm_eval_deps` | Installs lm-eval dependencies | | `_patch_lm_eval` | Patches lm-eval for reasoning tokens and TRT compatibility | +| `compute_eval_context_length` | Computes eval context length (5x benchmark context, capped at model native max) | +| `get_native_max_context_length` | Extracts model's native max context length from HF config | ### Eval Results Collection @@ -413,19 +427,18 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| -| `RUN_EVAL` | `false` | Enable eval after throughput | +| `RUN_EVAL` | `false` | Enable eval after throughput benchmark | | `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | -| `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | -| `NUM_FEWSHOT` | `2` | Number of few-shot examples | +| `EVAL_TASKS_DIR` | `utils/evals/gsm8k.yaml` | Path to lm-eval task YAML | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | -| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by compute_eval_context_length) | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task 1. Create a task YAML in `utils/evals/` (follow lm-eval task format) -2. Set `EVAL_TASK=` when running benchmarks +2. Set `EVAL_TASKS_DIR=utils/evals/<task>.yaml` when running benchmarks 3.
Update `utils/collect_eval_results.py` if new metrics need extraction ### lm-eval Patches diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 326b796dd..b3264cef0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,13 @@ # Shared benchmarking utilities for InferenceMAX +# Keep Python bytecode out of the mounted workspace. Benchmark jobs often run as +# root inside containers, and root-owned cache directories break future checkout +# cleanup on self-hosted runners. +export PYTHONDONTWRITEBYTECODE=1 +export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" +mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -174,6 +181,12 @@ wait_for_server_ready() { # --trust-remote-code: Optional flag to trust remote code from HuggingFace # --server-pid: Optional server process ID to monitor during benchmark run_benchmark_serving() { + # In eval-only mode, skip the throughput benchmark entirely. + if [ "${EVAL_ONLY}" = "true" ]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + return 0 + fi + set +x local model="" local port="" @@ -486,6 +499,10 @@ move_profile_trace_for_relay() { # ------------------------------ _install_lm_eval_deps() { + # torchvision causes circular imports in ATOM; TRT-LLM/SGLang need it at module level. 
+ if [[ "${IMAGE:-}" == *atom* ]]; then + python3 -m pip uninstall -y torchvision 2>/dev/null || true + fi python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then @@ -574,26 +591,74 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } +get_native_max_context_length() { + local model_path="$1" + python3 -c " +from transformers import AutoConfig +config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) +for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break +else: + print(0) +" +} + +# Compute the context length for eval-only mode. +# Uses 5x the benchmark context capped at the model's native max. +# Sets EVAL_MAX_MODEL_LEN (needed by run_lm_eval). +# Echoes the computed value for scripts to capture. +# +# Usage: local ctx=$(compute_eval_context_length "$MODEL" "${current_ctx}") +compute_eval_context_length() { + local model="$1" + local benchmark_ctx="${2:-0}" + local native_max + native_max=$(get_native_max_context_length "$model") + native_max="${native_max:-0}" + + if [ "$benchmark_ctx" -eq 0 ] 2>/dev/null; then + benchmark_ctx="${native_max:-0}" + fi + local eval_ctx=$(( benchmark_ctx * 5 )) + if [ "$native_max" -gt 0 ] 2>/dev/null && [ "$eval_ctx" -gt "$native_max" ]; then + eval_ctx="$native_max" + fi + # If eval_ctx is still 0 (both benchmark_ctx and native_max were 0), fall back + if [ "$eval_ctx" -le 0 ] 2>/dev/null; then + echo "WARN: compute_eval_context_length could not determine context length for $model" >&2 + eval_ctx="${MAX_MODEL_LEN:-16384}" + fi + EVAL_MAX_MODEL_LEN="$eval_ctx" + echo "$eval_ctx" +} + +# Convenience wrapper: compute eval context from ISL/OSL and export EVAL_MAX_MODEL_LEN. +# Call directly (not in a subshell) so the export persists.
+# Scripts then wire $EVAL_MAX_MODEL_LEN into whichever server variable they need. +setup_eval_context() { + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 200))") + export EVAL_MAX_MODEL_LEN +} + run_lm_eval() { local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-2}" + local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=16384 + local eval_context_len="${EVAL_MAX_MODEL_LEN:-16384}" local temperature=0 local top_p=1 - local concurrent_requests=32 + local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}" while [[ $# -gt 0 ]]; do case $1 in --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; + --task) tasks_dir="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --gen-max-tokens) eval_context_len="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -606,16 +671,23 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + # Cap output tokens: must fit within context window (leave room for input), + # and avoid excessive KV cache reservation per request on TRT. + local max_output_tokens=$(( eval_context_len > 4096 ? 
eval_context_len - 4096 : eval_context_len / 2 )) + if [ "$max_output_tokens" -gt 16384 ]; then + max_output_tokens=16384 + fi + echo "Eval budget: eval_context_len=${eval_context_len}, max_output_tokens=${max_output_tokens}" + # Export for append_lm_eval_summary to pick up export EVAL_RESULT_DIR="$results_dir" set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "utils/evals/${task}.yaml" \ - --num_fewshot "${num_fewshot}" \ + --tasks "${tasks_dir}" \ --output_path "${results_dir}" \ --log_samples \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \ - --gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}" + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \ + --gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}" local eval_exit=$? set +x return $eval_exit @@ -623,8 +695,15 @@ run_lm_eval() { append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" + if [ -z "${results_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR is empty; skipping artifact collection" >&2 + return 1 + fi local out_dir="${results_dir}" - mkdir -p "$out_dir" || true + if [ ! 
-d "${out_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR='${out_dir}' does not exist; skipping artifact collection" >&2 + return 1 + fi # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" @@ -672,13 +751,13 @@ META # Move eval artifacts into PWD (no new directories in workspace) if [ -f "${meta_json}" ]; then - mv -f "${meta_json}" ./ || true + mv -f "${meta_json}" ./ || echo "WARN: failed to move ${meta_json}" >&2 fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") if [ "$base" != "meta_env.json" ]; then - mv -f "$jf" ./ || true + mv -f "$jf" ./ || echo "WARN: failed to move ${jf}" >&2 fi done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi @@ -706,8 +785,23 @@ run_eval() { esac done + # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script + if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then + compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null + fi + + local eval_rc=0 case "$framework" in - lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - *) echo "Unknown framework '${framework}'"; return 1 ;; + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? 
;; + *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac + + if [ "$eval_rc" -ne 0 ]; then + echo "ERROR: run_eval failed with exit code $eval_rc" >&2 + if [ "${EVAL_ONLY}" = "true" ]; then + echo "Eval-only mode: failing after artifact collection" >&2 + return "$eval_rc" + fi + fi + return $eval_rc } diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d98fb8e2b..d88941628 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,6 +31,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -40,7 +45,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/dsr1_fp4_b200_trt.sh index 036c2998e..7a9706d30 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt.sh @@ -77,6 +77,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -120,7 +126,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh index 2a0320e53..59e5a3930 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh @@ -76,10 +76,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then elif [[ $CONC == 128 && $DP_ATTENTION == "false" ]]; then PIECEWISE_CUDA_GRAPHS="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC == 64 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi fi if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -101,6 +97,12 @@ fi # end of set of configs using piecewise_cuda_graphs # Start 
GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + set -x # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -134,7 +136,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/dsr1_fp4_mi355x.sh index 58c1118eb..578a6c810 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x.sh @@ -30,6 +30,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 \ --attention-backend aiter \ ---kv-cache-dtype fp8_e4m3 > $SERVER_LOG 2>&1 & +--kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh index af1ab6aa4..1d557684e 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh 
b/benchmarks/single_node/dsr1_fp8_b200.sh index 7b4be6b2b..e6d8a0e9c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -63,6 +63,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -72,7 +77,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -95,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh index b5e499ecc..781869bcc 100755 --- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh @@ -56,6 +56,11 @@ SPECULATIVE_EAGLE_TOPK=1 SGLANG_ENABLE_SPEC_V2=1 +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -85,7 +90,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --speculative-num-steps $SPECULATIVE_NUM_STEPS \ --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \ --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -109,7 +114,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/dsr1_fp8_b200_trt.sh index 8df439973..139aae669 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt.sh @@ -37,14 +37,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then PIECEWISE_CUDA_GRAPHS="true" DELAY_BATCHING="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 256 ]]; then - CUDA_GRAPH_MAX_BATCH_SIZE=$(( $CONC / 8 )) - MOE_BACKEND="DEEPGEMM" - KV_CACHE_FREE_MEM_FRACTION=0.7 - elif [[ $CONC -ge 128 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 64 ]]; then PIECEWISE_CUDA_GRAPHS="true" @@ -100,6 +92,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -146,7 +144,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh index c60388848..79f84f8a1 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh @@ -45,10 +45,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -le 4 ]]; then PIECEWISE_CUDA_GRAPHS="false" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -le 8 ]]; then - PIECEWISE_CUDA_GRAPHS="false" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -le 16 ]]; then PIECEWISE_CUDA_GRAPHS="false" @@ -89,7 +85,15 @@ attention_dp_config: EOF fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # prep PW CUDA config per the documentation if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -104,10 +108,9 @@ if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then cat << EOF >> $EXTRA_CONFIG_FILE torch_compile_config: capture_num_tokens: [${CAPTURE_TOKENS_LIST%, }] - enable_piecewise_cuda_graph: true + enable_piecewise_cuda_graph: true EOF fi - # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -144,7 +147,7 @@ 
run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index fde2cfede..c820d180b 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -15,7 +15,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -pip3 install --user sentencepiece +pip3 install --user --break-system-packages sentencepiece hf download "$MODEL" SERVER_LOG=/workspace/server.log @@ -26,6 +26,12 @@ start_gpu_monitor export TORCH_CUDA_ARCH_LIST="9.0" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ @@ -35,7 +41,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -44,7 +50,7 @@ else --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! 
@@ -66,7 +72,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/dsr1_fp8_h200_trt.sh index 5d98aa75e..383b86065 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt.sh @@ -64,6 +64,12 @@ MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -94,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh index 0ecd48f02..9d0010903 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh @@ -80,6 +80,11 @@ fi MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -116,7 +121,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval 
--framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh b/benchmarks/single_node/dsr1_fp8_mi300x.sh index 41731427e..a5f161960 100644 --- a/benchmarks/single_node/dsr1_fp8_mi300x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi300x.sh @@ -36,6 +36,11 @@ export SGLANG_AITER_MLA_PERSIST=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -50,7 +55,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ ---disable-radix-cache > $SERVER_LOG 2>&1 & +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 6870fe060..ae1e930f0 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -29,6 +29,12 @@ export SGLANG_AITER_MLA_PERSIST=1 # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -41,7 +47,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -> 
$SERVER_LOG 2>&1 & +$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index 1d00957e4..d629437cf 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -27,6 +27,11 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -42,7 +47,7 @@ python3 -m sglang.launch_server \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ - --cuda-graph-max-bs "$CONC" > $SERVER_LOG 2>&1 & + --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh index dfb8fafdc..920efb6ff 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi @@ -80,4 +85,3 @@ stop_gpu_monitor set +x set -x -rm -rf ./utils/bench_serving\ 
diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index 5d09645c8..4ca4a215d 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -30,6 +30,11 @@ PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -49,7 +54,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --disable-radix-cache \ --stream-interval 30 \ ---model-loader-extra-config '{"enable_multithread_load": true}' > $SERVER_LOG 2>&1 & +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 9194bb870..7a985645f 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -22,6 +22,12 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ - > "$SERVER_LOG" 2>&1 & + $EVAL_CONTEXT_ARGS 
> "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -60,7 +66,7 @@ run_benchmark_serving \ # Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name if [ "${RUN_EVAL}" = "true" ]; then export MODEL_NAME=glm-5-fp8 - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index ee11463ce..3d82fd856 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -30,6 +30,11 @@ export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_nvfp4_b200.sh b/benchmarks/single_node/glm5_nvfp4_b200.sh new file mode 100755 index 000000000..182f363ad --- /dev/null +++ b/benchmarks/single_node/glm5_nvfp4_b200.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +# following https://huggingface.co/nvidia/GLM-5-NVFP4#usage recipe +# except using latest nightly at the time of writing +# since the recommended nightly image in that recipe doesn't exist. + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP \ +--data-parallel-size 1 --expert-parallel-size 1 \ +--tool-call-parser glm47 \ +--reasoning-parser glm45 \ +--quantization modelopt_fp4 \ +--cuda-graph-max-bs $CONC --max-running-requests $CONC \ +--mem-fraction-static 0.80 \ +--chunked-prefill-size 131072 \ +--stream-interval 30 \ +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 46fccca6a..f6a6f72e9 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -26,7 +26,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi cat > config.yaml << EOF @@ -77,7 +82,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh index 42fa96a94..c9ba2752c 100644 --- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh @@ -78,6 +78,12 @@ set -x MAX_NUM_TOKENS=20000 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + 
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -109,7 +115,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 314ec43c9..8d0e773a2 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -17,11 +17,18 @@ fi hf download "$MODEL" +MAX_MODEL_LEN=10240 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + cat > config.yaml << EOF no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 -max-model-len: 10240 +max-model-len: $MAX_MODEL_LEN EOF export PYTHONNOUSERSITE=1 @@ -37,8 +44,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--max-num-seqs=$CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -61,7 +67,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 251294a62..2a9359b96 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -29,7 +29,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi # Create config.yaml @@ -50,8 +55,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & + --max-num-seqs $CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -72,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/gptoss_fp4_h200_trt.sh index a96b311d8..41dede14b 100644 --- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_h200_trt.sh @@ -8,6 +8,7 @@ check_env_vars \ CONC \ ISL \ OSL \ + MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ DP_ATTENTION \ @@ -48,10 +49,19 @@ print_iter_log: true stream_interval: 20 EOF +MAX_NUM_TOKENS=20000 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ ---max_num_tokens 20000 \ +--max_num_tokens $MAX_NUM_TOKENS \ +--max_seq_len=$MAX_MODEL_LEN \ --backend pytorch \ --extra_llm_api_options gptoss-config.yml \ --ep_size=$EP_SIZE \ @@ -82,7 +92,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh index f71aeb090..56a7823cf 100644 --- a/benchmarks/single_node/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh @@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, 
clocks every second) start_gpu_monitor @@ -73,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh index f71aeb090..56a7823cf 100644 --- a/benchmarks/single_node/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh @@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -73,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index f23949739..37cb358ba 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -43,6 +43,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -74,7 +78,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval 
--port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh index cf71cbb3b..76bc87c0c 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -70,7 +75,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh index 422a74950..4818f246e 100644 --- a/benchmarks/single_node/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh @@ -26,6 +26,10 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,6 +42,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --reasoning-parser kimi_k2 \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +67,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index a8bd01442..c680529e2 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,6 +31,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. # Disable that features to avoid crashes. @@ -70,6 +75,7 @@ $EP \ --block-size=1 \ --no-enable-prefix-caching \ --trust-remote-code \ +--no-enable-prefix-caching \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -92,7 +98,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/kimik2.5_int4_b200.sh index 6468cc05c..df4c63f6b 100755 --- a/benchmarks/single_node/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/kimik2.5_int4_b200.sh @@ -26,6 +26,10 @@ export VLLM_USE_FLASHINFER_MOE_INT4=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -64,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/kimik2.5_int4_h200.sh index 37281f61e..766fe74a0 100755 --- a/benchmarks/single_node/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/kimik2.5_int4_h200.sh @@ -25,6 +25,11 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + # following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html recipe # Start GPU monitoring (power, temperature, clocks every second) @@ -40,6 +45,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ --trust-remote-code \ +--no-enable-prefix-caching \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -64,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_mi300x.sh b/benchmarks/single_node/kimik2.5_int4_mi300x.sh new file mode 100755 index 000000000..a05baddeb --- /dev/null +++ b/benchmarks/single_node/kimik2.5_int4_mi300x.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# following AMD andy luo's recipe +# https://x.com/linluo77/status/2017024513595301985 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +export VLLM_ROCM_USE_AITER=1 +vllm serve $MODEL --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.95 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size=64 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ +--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh index e6b7629ea..a05baddeb 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh +++ b/benchmarks/single_node/kimik2.5_int4_mi325x.sh @@ -28,18 +28,24 @@ PORT=${PORT:-8888} # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor set -x +export VLLM_ROCM_USE_AITER=1 vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ ---disable-log-requests \ --trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/kimik2.5_int4_mi355x.sh index 935c6cd2e..5e40da700 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_int4_mi355x.sh @@ -26,6 +26,10 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -37,6 +41,7 @@ vllm serve $MODEL --port $PORT \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ --trust-remote-code \ +--no-enable-prefix-caching \ --max-num-seqs 256 \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & @@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh index 2e5aa4b24..5ea1b8657 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh @@ -33,6 +33,10 @@ else EP=" " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -43,6 +47,7 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh index 90f5bd772..0f024ea9f 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh @@ -26,6 +26,11 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -42,7 +47,7 @@ $EP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --max-num-seqs 256 \ ---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code \ --compilation-config '{"cudagraph_mode":"PIECEWISE"}' > $SERVER_LOG 2>&1 & @@ -66,7 +71,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 4b613d88e..84e73b65c 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -22,6 +22,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -ge 1 ]; then EP=" --enable-expert-parallel" else @@ -37,7 +42,7 @@ vllm serve $MODEL --port $PORT \ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ 
---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh index 4dfaf6b80..d03f57c9b 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh @@ -28,6 +28,10 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,6 +42,7 @@ vllm serve $MODEL --port $PORT \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ --disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh index e5d404036..aad72ad2f 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -30,16 +31,27 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "$EP_SIZE" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ ---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index cebbf72a0..adfb959cf 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -29,6 +29,11 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -45,6 +50,7 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -67,7 +73,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/qwen3.5_bf16_b200.sh index 38785a104..86ce6b66f 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b200.sh @@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC MAX_RUNNING_REQUESTS=128 CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -79,7 +83,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh index ea10647d6..8aca9860a 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -52,7 +58,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh index ea10647d6..8aca9860a 100644 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -52,7 +58,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index f77390707..701695def 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -30,7 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -51,7 +56,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh index 39b020ecc..36e5d579d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b200.sh @@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC MAX_RUNNING_REQUESTS=128 CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi if [[ $TP -eq 8 ]]; then EXTRA_ARGS="--enable-flashinfer-allreduce-fusion" @@ -87,7 +91,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh index 1270c76a6..87933b166 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh @@ -48,6 +48,10 @@ SPECULATIVE_EAGLE_TOPK=1 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -88,7 +92,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git 
a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/qwen3.5_fp8_h200.sh index 2ae26b771..636a8ee92 100644 --- a/benchmarks/single_node/qwen3.5_fp8_h200.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200.sh @@ -23,6 +23,10 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN" +fi echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" @@ -76,7 +80,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh index 0640a20ab..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -32,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -53,7 +59,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh index 0640a20ab..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -32,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -53,7 +59,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index f77390707..701695def 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -30,7 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -51,7 +56,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b85245458..967edc19c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,17 @@ +- config-keys: + - kimik2.5-int4-mi300x-vllm + description: + - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + description: + - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - config-keys: - dsr1-fp8-b200-dynamo-trt - dsr1-fp8-h200-dynamo-trt @@ -992,7 +1006,7 @@ - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - + - config-keys: - glm5-fp8-b200-sglang description: @@ -1067,6 +1081,13 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 + +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - config-keys: - kimik2.5-fp4-mi355x-vllm @@ -1085,3 +1106,117 @@ - "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093" - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 
+ +- config-keys: + - kimik2.5-int4-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER MLA, export VLLM_ROCM_USE_AITER=1, https://github.com/vllm-project/vllm/issues/35641" + - "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093" + - "Add --max-num-seqs 256, remove --disable-log-requests" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 + +- config-keys: + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 + +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 + +- config-keys: + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 + +- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp + - glm5-fp8-b200-sglang + - glm5-fp8-h200-sglang + - gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm 
+ - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang + description: + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970 + +- config-keys: + - glm5-nvfp4-b200-sglang + description: + - "Add GLM-5 NVFP4 single-node B200 SGLang benchmark (TP8, conc 4-64)" + - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization" + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 022fd7cb2..f8c614936 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -33,7 +33,7 @@ docker run --rm --init --network host --name $server_name \ -e 
NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 223264914..5100419b9 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 170a1bdc3..9d157a858 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,6 +1,6 @@ #!/usr/bin/bash -export HF_HUB_CACHE_MOUNT="/home/gharunner/gharunners/hf-hub-cache/" +export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 
MODEL_CODE="${EXP_NAME%%_*}" diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 4faf07338..4ebe62c41 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash -export HF_HUB_CACHE_MOUNT="/nvme_home/gharunner/gharunners/hf-hub-cache/" +export HF_HUB_CACHE_MOUNT="/home/gharunner/gharunners/hf-hub-cache/" export PORT=8888 PARTITION="compute" -SQUASH_FILE="/nvme_home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 32331a398..af030720e 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -439,6 +439,76 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path +def _fix_tokenizer_for_sglang(tokenizer, model_path): + """Fix transformers v5 tokenizer to match sglang server-side behavior. + + Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded. + Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer + and decoder from scratch using class-specific components, discarding the + originals from tokenizer.json. For models like DeepSeek-R1 that declare + LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer + architecture, v5 incorrectly replaces the original Sequence pre_tokenizer + with Metaspace, and the original ByteLevel decoder with Sequence. 
+ See: https://github.com/sgl-project/sglang/blob/9238bd08a2895fa3b7ec79ea567e5c27ac951343/python/sglang/srt/utils/hf_transformers_utils.py#L836 + + The sglang server applies fixes for this in hf_transformers_utils.py + (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the + benchmark client loads the tokenizer directly via AutoTokenizer without + these fixes. This mismatch causes the client to encode text differently + from the server -- e.g. a 7000-token prompt on the client becomes ~35000 + tokens on the server, leading to ~5x TTFT inflation and false performance + regressions in benchmarks. + + This function replicates the same fixes so the benchmark client tokenizes + identically to the sglang server. It is a no-op on transformers v4. + """ + import json + from pathlib import Path + + backend = getattr(tokenizer, "_tokenizer", None) + if backend is not None: + try: + from tokenizers import Tokenizer as RawTokenizer + tok_file = Path(model_path) / "tokenizer.json" + if tok_file.is_file(): + raw = RawTokenizer.from_file(str(tok_file)) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + except Exception: + pass + + try: + config_file = Path(model_path) / "tokenizer_config.json" + if config_file.is_file(): + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = 
defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() + except Exception: + pass + + return tokenizer + + def get_tokenizer( pretrained_model_name_or_path: str, tokenizer_mode: str = "auto", @@ -464,11 +534,12 @@ def get_tokenizer( return MistralTokenizer.from_pretrained( str(pretrained_model_name_or_path)) else: - return AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs, ) + return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path) ASYNC_REQUEST_FUNCS = { @@ -481,4 +552,4 @@ def get_tokenizer( "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, -} \ No newline at end of file +} diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index c3dddfcc6..e32d6d988 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -6,7 +6,7 @@ Quick graded QnA which measures model performance. Examples of test suites: - **gpqa**: Graduate level, Google-Proof multiple choice questions ## When? -At highest concurrency for highest TP and lowest TP, per GPU per model only for 1k8k. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py` +At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` ## Why? To verify how model outputs are affected by throughput optimizations. 
@@ -15,7 +15,7 @@ To verify how model outputs are affected by throughput optimizations. - If there was a tradeoff in accuracy for performance ## How? -- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`. ## Misc Following files are task definitions from lmeval, more info on changes within the files diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml index fb0f0a829..e748119cd 100644 --- a/utils/evals/gsm8k.yaml +++ b/utils/evals/gsm8k.yaml @@ -9,7 +9,7 @@ output_type: generate_until training_split: train fewshot_split: train test_split: test -doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_text: "Question: {{question}}\nEnd your response with the answer on the last line, formatted as: #### [number]\nAnswer:" doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" metric_list: - metric: exact_match diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json new file mode 100644 index 000000000..8ea0b71c0 --- /dev/null +++ b/utils/evals/thresholds.json @@ -0,0 +1,4 @@ +{ + "gsm8k": 0.85, + "gpqa_diamond_cot_n_shot": 0.30 +} diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py new file mode 100644 index 000000000..85433ec4b --- /dev/null +++ b/utils/evals/validate_scores.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Validate eval 
scores against minimum thresholds. + +Reads lm-eval results JSON files and checks that scored metrics meet the +required minimum. Thresholds are configured per-task in a JSON config file +(default: utils/evals/thresholds.json). + +Usage: + python3 utils/evals/validate_scores.py + python3 utils/evals/validate_scores.py --thresholds my_thresholds.json + python3 utils/evals/validate_scores.py --min-score 0.90 # flat threshold, no config +""" +import argparse +import glob +import json +import sys +from pathlib import Path + + +def load_thresholds(path: str) -> dict[str, float]: + """Load thresholds config. Returns {task_name: min_score}.""" + with open(path) as f: + return json.load(f) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate eval scores") + parser.add_argument( + "--min-score", type=float, default=0.85, + help="Fallback minimum score when no threshold config matches (default: 0.85)", + ) + parser.add_argument( + "--thresholds", default=None, + help="Path to thresholds JSON config (default: utils/evals/thresholds.json)", + ) + parser.add_argument( + "--metric-prefix", default="exact_match,", + help="Only check metrics whose name starts with this prefix (default: 'exact_match,')", + ) + parser.add_argument( + "--results-glob", default="results*.json", + help="Glob pattern for result files (default: 'results*.json')", + ) + args = parser.parse_args() + + # Load thresholds config + thresholds = {} + thresholds_path = args.thresholds + if thresholds_path is None: + default_path = Path(__file__).parent / "thresholds.json" + if default_path.exists(): + thresholds_path = str(default_path) + if thresholds_path: + try: + thresholds = load_thresholds(thresholds_path) + print(f"Loaded thresholds from {thresholds_path}") + except (json.JSONDecodeError, OSError) as e: + print(f"WARN: could not load thresholds from {thresholds_path}: {e}", file=sys.stderr) + + failed = False + checked = 0 + + for f in sorted(glob.glob(args.results_glob)): + 
with open(f) as fh: + data = json.load(fh) + for task, metrics in data.get("results", {}).items(): + min_score = thresholds.get(task, args.min_score) + for name, val in metrics.items(): + if not name.startswith(args.metric_prefix) or "stderr" in name: + continue + if not isinstance(val, (int, float)): + continue + checked += 1 + if val < min_score: + print( + f"FAIL: {task} {name} = {val:.4f} (< {min_score})", + file=sys.stderr, + ) + failed = True + else: + print(f"PASS: {task} {name} = {val:.4f} (>= {min_score})") + + if checked == 0: + print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr) + + return 1 if (failed or checked == 0) else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 850fecd6a..5d61dffa5 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -17,7 +17,6 @@ seq_len_stoi = { "1k1k": (1024, 1024), - "1k8k": (1024, 8192), "8k1k": (8192, 1024) } @@ -35,31 +34,22 @@ def seq_len_to_str(isl: int, osl: int) -> str: def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: """Eval selection policy: - - Only consider 1k8k (isl=1024, osl=8192). - - Single-node: for each unique (model, runner, framework, precision, isl, osl, - spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP - with highest conc. + - Single-node: only consider 8k1k (isl=8192, osl=1024). + For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn): + - Mark all entries at the highest CONC (all TPs) + - Mark all entries at the median CONC (all TPs) - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding, prefill-dp-attn, decode-dp-attn), prefer 8k1k entries; - fall back to 1k8k if unavailable (never 1k1k). Mark the entry with the - highest max concurrency. 
- - Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated - independently. + spec-decoding, prefill-dp-attn, decode-dp-attn), only 8k1k entries. + Mark the entry with the highest max concurrency. """ from collections import defaultdict - # Only run evals on 1k8k - target_isl, target_osl = seq_len_stoi["1k8k"] - - # --- Single-node eval selection --- - # Group entries by (model, runner, framework, precision, isl, osl) + # Only run evals on 8k1k + target_isl, target_osl = seq_len_stoi["8k1k"] + # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). # Only include entries that have a top-level TP (i.e., single-node schema). - # This avoids relying on structural hints like prefill/decode which may be - # reused by future single-node disaggregated modes. groups = defaultdict(list) for i, entry in enumerate(matrix_values): - # Skip entries without a top-level TP field if Fields.TP.value not in entry: continue @@ -78,40 +68,24 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: ) groups[key].append((i, entry)) - # For each group, find highest TP/highest conc and lowest TP/highest conc + # For each group, select entries at highest CONC and median CONC (all TPs) eval_indices = set() for key, entries in groups.items(): if not entries: continue - # Find min and max TP values - min_tp = min(e[Fields.TP.value] for _, e in entries) - max_tp = max(e[Fields.TP.value] for _, e in entries) - - # Find highest conc for highest TP - highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp] - if highest_tp_entries: - max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries) - for i, e in highest_tp_entries: - if e[Fields.CONC.value] == max_conc_highest_tp: - eval_indices.add(i) - - # Find highest conc for lowest TP (only if different from max_tp) - if min_tp != max_tp: - lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp] - if 
lowest_tp_entries: - max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries) - for i, e in lowest_tp_entries: - if e[Fields.CONC.value] == max_conc_lowest_tp: - eval_indices.add(i) + conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries)) + median_conc = conc_values[len(conc_values) // 2] + target_concs = {conc_values[-1], median_conc} + + for i, e in entries: + if e[Fields.CONC.value] in target_concs: + eval_indices.add(i) # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 8k1k; fall back to 1k8k if unavailable (never 1k1k). + # Only 8k1k entries are eligible (never 1k1k). # Within a group, pick the entry with the highest max concurrency. - # Multi-node: prefer 8k1k, fallback to 1k8k - mn_target_isl, mn_target_osl = seq_len_stoi["8k1k"] - fallback_isl, fallback_osl = seq_len_stoi["1k8k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -136,14 +110,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 8k1k entries; fall back to 1k8k + # Only 8k1k entries are eligible for eval preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == mn_target_isl - and e.get(Fields.OSL.value) == mn_target_osl] - if not preferred: - preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == fallback_isl - and e.get(Fields.OSL.value) == fallback_osl] + if e.get(Fields.ISL.value) == target_isl + and e.get(Fields.OSL.value) == target_osl] if not preferred: continue @@ -806,9 +776,9 @@ def main(): ) eval_group = parent_parser.add_mutually_exclusive_group() eval_group.add_argument( - '--run-evals', + '--no-evals', action='store_true', - help='When specified, run evals on a subset of configs (in addition to all configs).' + help='When specified, skip evals (throughput benchmarks only).' 
) eval_group.add_argument( '--evals-only', @@ -1020,10 +990,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Handle eval options (mutually exclusive) - if args.run_evals or args.evals_only: + # Handle eval options (mutually exclusive: --no-evals or --evals-only) + if not args.no_evals: matrix_values = mark_eval_entries(matrix_values) - # IF --evals-only is specified, filter to only eval entries if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 84ecddd3d..1fecdd487 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -158,13 +158,11 @@ class TestSeqLenMappings: def test_seq_len_stoi_values(self): """Verify seq_len_stoi has expected mappings.""" assert seq_len_stoi["1k1k"] == (1024, 1024) - assert seq_len_stoi["1k8k"] == (1024, 8192) assert seq_len_stoi["8k1k"] == (8192, 1024) def test_seq_len_itos_reverse_mapping(self): """Verify seq_len_itos is reverse of stoi.""" assert seq_len_itos[(1024, 1024)] == "1k1k" - assert seq_len_itos[(1024, 8192)] == "1k8k" assert seq_len_itos[(8192, 1024)] == "8k1k" @@ -174,7 +172,6 @@ class TestSeqLenToStr: def test_known_sequence_lengths(self): """Known sequence lengths should return short name.""" assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" assert seq_len_to_str(8192, 1024) == "8k1k" def test_unknown_sequence_lengths(self): diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 6b4c7878c..9d231ad3c 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -90,60 +90,80 @@ def main(): }, } - all_results = [] - # Deduplicate repeated configs, if for some reason a config key appears multiple times - # in one commit, we don't want to run that config two times (there will just be twice as 
many - # data points for that config, which is not useful) - all_configs_to_run = set() + all_benchmark_results = [] + all_eval_results = [] + # Deduplicate repeated configs separately for benchmarks and evals. + # An evals-only entry should not prevent a later regular entry from + # generating benchmarks for the same config, and vice versa. + benchmark_configs_seen = set() + eval_configs_seen = set() for entry_data in changelog_data: entry = ChangelogEntry.model_validate(entry_data) - configs_to_run = get_config_keys_from_master( + all_configs = get_config_keys_from_master( entry.config_keys, load_config_files(MASTER_CONFIGS) ) - # Skip configs already processed - configs_to_run = [c for c in configs_to_run if c not in all_configs_to_run] - if not configs_to_run: - continue - all_configs_to_run.update(configs_to_run) - - # Use --evals-only if specified in changelog entry, otherwise --run-evals - eval_flag = "--evals-only" if entry.evals_only else "--run-evals" - - try: - result = subprocess.run( - [ + if not entry.evals_only: + # Generate benchmark entries (no evals) + benchmark_configs = [c for c in all_configs if c not in benchmark_configs_seen] + if benchmark_configs: + benchmark_configs_seen.update(benchmark_configs) + base_cmd = [ "python3", GENERATE_SWEEPS_PY_SCRIPT, "test-config", "--config-keys", - *configs_to_run, + *benchmark_configs, "--config-files", *MASTER_CONFIGS, - eval_flag - ], - capture_output=True, - text=True, - check=True, - ) - except subprocess.CalledProcessError as e: - print(e.stderr) - raise - - all_results.extend(json.loads(result.stdout)) + "--no-evals", + ] + try: + result = subprocess.run( + base_cmd, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + print(e.stderr) + raise + all_benchmark_results.extend(json.loads(result.stdout)) + + # Generate eval entries separately + eval_configs = [c for c in all_configs if c not in eval_configs_seen] + if eval_configs: + 
eval_configs_seen.update(eval_configs) + base_cmd = [ + "python3", + GENERATE_SWEEPS_PY_SCRIPT, + "test-config", + "--config-keys", + *eval_configs, + "--config-files", + *MASTER_CONFIGS, + "--evals-only", + ] + try: + eval_result = subprocess.run( + base_cmd, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + print(e.stderr) + raise + all_eval_results.extend(json.loads(eval_result.stdout)) - all_eval_results = [] - for result in all_results: + for result in all_benchmark_results: seq_len_str = seq_len_to_str(result["isl"], result["osl"]) if "prefill" in result and result["prefill"] is not None: final_results["multi_node"][seq_len_str].append(result) else: final_results["single_node"][seq_len_str].append(result) - if result.get("run-eval"): - all_eval_results.append(result) - final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None] final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None] From 8d26331c14abf9290ba9d54689fb70c3e1cec42c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 21:56:56 -0700 Subject: [PATCH 11/22] update multinode to singlenode --- .../workflows/benchmark-multinode-tmpl.yml | 1 + runners/launch_gb200-nv.sh | 90 ++++++++++--------- runners/launch_gb300-nv.sh | 90 ++++++++++--------- utils/matrix_logic/generate_sweep_configs.py | 64 +++++-------- 4 files changed, 117 insertions(+), 128 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index d529b7ccc..79799df2b 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -240,6 +240,7 @@ jobs: run: | rm -f meta_env.json || true rm -f results*.json || true + rm -f sample*.jsonl || true - name: Upload logs if: always() diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 
e1ecc76a0..075bf6500 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -228,57 +228,61 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" -else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 + fi - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + echo "Found logs directory: $LOGS_DIR" - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + 
concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done done - done -fi + fi -echo "All result files processed" + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi # Collect eval results if eval was requested -if [[ "${RUN_EVAL:-false}" == "true" ]]; then +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 079d5169e..f3a360b65 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -155,57 +155,61 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" -else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 + fi - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + echo "Found logs directory: $LOGS_DIR" - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done done - done -fi + fi -echo "All result files processed" + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi # Collect eval results if eval was requested -if [[ "${RUN_EVAL:-false}" == "true" 
]]; then +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 5d61dffa5..4487d07c1 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -44,18 +44,21 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: """ from collections import defaultdict - # Only run evals on 8k1k target_isl, target_osl = seq_len_stoi["8k1k"] - # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). - # Only include entries that have a top-level TP (i.e., single-node schema). - groups = defaultdict(list) + eval_indices = set() + + def _max_conc(ie): + c = ie[1][Fields.CONC.value] + return max(c) if isinstance(c, list) else c + + # Single-node: group by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). + # Only 8k1k entries with a top-level TP (single-node schema). 
+ sn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value not in entry: continue - if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: continue - key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], @@ -64,65 +67,42 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: entry[Fields.ISL.value], entry[Fields.OSL.value], entry[Fields.SPEC_DECODING.value], - entry[Fields.DP_ATTN.value] + entry[Fields.DP_ATTN.value], ) - groups[key].append((i, entry)) - - # For each group, select entries at highest CONC and median CONC (all TPs) - eval_indices = set() - for key, entries in groups.items(): - if not entries: - continue + sn_groups[key].append((i, entry)) + for entries in sn_groups.values(): conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries)) median_conc = conc_values[len(conc_values) // 2] target_concs = {conc_values[-1], median_conc} - for i, e in entries: if e[Fields.CONC.value] in target_concs: eval_indices.add(i) - # --- Multi-node eval selection --- - # For multi-node (disaggregated) entries, pick one representative per group. - # Only 8k1k entries are eligible (never 1k1k). - # Within a group, pick the entry with the highest max concurrency. + # Multi-node: group by (model, runner, framework, precision, spec-decoding, prefill-dp, decode-dp). + # Only 8k1k entries with a prefill key (multi-node schema). + # Pick the entry with the highest max concurrency per group. 
mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: - continue # single-node, already handled + continue if Fields.PREFILL.value not in entry: continue - - prefill_dp = entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value) - decode_dp = entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value) + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], entry[Fields.SPEC_DECODING.value], - prefill_dp, - decode_dp, + entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value), + entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value), ) mn_groups[key].append((i, entry)) - for key, entries in mn_groups.items(): - if not entries: - continue - - # Only 8k1k entries are eligible for eval - preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == target_isl - and e.get(Fields.OSL.value) == target_osl] - if not preferred: - continue - - # Pick entry with highest max concurrency - def _max_conc(ie): - c = ie[1][Fields.CONC.value] - return max(c) if isinstance(c, list) else c - best = max(preferred, key=_max_conc) - eval_indices.add(best[0]) + for entries in mn_groups.values(): + eval_indices.add(max(entries, key=_max_conc)[0]) # Mark the selected entries for i, entry in enumerate(matrix_values): From 0b271879e08a0284c90dc5f88d7cbafff53cb40d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 22:03:39 -0700 Subject: [PATCH 12/22] hanging rm rf --- .github/workflows/benchmark-multinode-tmpl.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 79799df2b..29f0bff98 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -150,9 +150,6 @@ jobs: fi fi - - 
name: Clean up root-owned files from previous runs - run: sudo rm -rf benchmark_logs benchmark_artifacts 2>/dev/null || true - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} From 056a4156098b4c8ff11b47e68ff72ccb1484db1d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 06:54:57 -0700 Subject: [PATCH 13/22] debug --- runners/launch_gb200-nv.sh | 14 ++++++++------ runners/launch_gb300-nv.sh | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 075bf6500..22bf58665 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -228,17 +228,19 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" exit 1 fi - echo "Found logs directory: $LOGS_DIR" - - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index f3a360b65..2f56c3633 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -155,17 +155,19 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
+else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" exit 1 fi - echo "Found logs directory: $LOGS_DIR" - - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) From 61f7d9babb5916df9dd8a5a62d80864a3eb0c353 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 07:20:20 -0700 Subject: [PATCH 14/22] update conc req --- benchmarks/multi_node/amd_utils/server.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9271c4382..2adbcd8df 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -504,13 +504,13 @@ if [ "$NODE_RANK" -eq 0 ]; then source /workspace/benchmarks/benchmark_lib.sh # Use max concurrency from benchmark config (conc values are x-separated) - EVAL_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" else # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + run_eval --framework lm-eval --port 30000 # Set metadata env vars for append_lm_eval_summary export TP="${PREFILL_TP_SIZE}" From ffdd49b89f0235b84069c4a296670ecf159ca777 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 21:08:34 -0700 Subject: [PATCH 15/22] documentation --- 
.github/workflows/e2e-tests.yml | 2 +- utils/evals/EVALS.md | 43 +++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 6765113b2..eb1a97713 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -206,7 +206,7 @@ jobs: collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() }} + if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped') }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index e32d6d988..f729d5f24 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -6,21 +6,54 @@ Quick graded QnA which measures model performance. Examples of test suites: - **gpqa**: Graduate level, Google-Proof multiple choice questions ## When? -At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` +Evals run as **separate workflow jobs** from throughput benchmarks. The selection logic is in `mark_eval_entries()` of `utils/matrix_logic/generate_sweep_configs.py`. + +**Single-node**: At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. + +**Multi-node**: One entry per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) with the highest max concurrency, only for 8k1k. ## Why? -To verify how model outputs are affected by throughput optimizations. 
+To verify how model outputs are affected by throughput optimizations. - TP/Conc might affect model outputs - Check kernel implementations for correctness - If there was a tradeoff in accuracy for performance ## How? -- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +`run_eval` in `benchmarks/benchmark_lib.sh` runs EleutherAI/lm-evaluation-harness against the server's OpenAI-compatible endpoint. Concurrency is set via `EVAL_CONCURRENT_REQUESTS` env var (not a CLI flag). Results are collected by `utils/collect_eval_results.py` and published as a summary table. + +### Single-node +In eval-only mode (`EVAL_ONLY=true`), the benchmark script starts the server with expanded context length (via `compute_eval_context_length`), skips throughput, and runs lm-eval directly. Each framework handles the context expansion differently (`--context-length` for SGLang, `--max_seq_len` for TRT-LLM). 
+ +### Multi-node +Multi-node evals support three hardware paths: + +**MI355X (AMD)** — `benchmarks/multi_node/amd_utils/server.sh` +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Concurrency derived from max of `BENCH_MAX_CONCURRENCY` (x-separated values) +- Eval artifacts copied to `/run_logs/slurm_job-*/eval_results/` +- `runners/launch_mi355x-amds.sh` skips benchmark result collection when `EVAL_ONLY=true` and uses `find` to locate eval results + +**GB200/GB300 (NVIDIA)** — via [srt-slurm fork](https://github.com/Oseltamivir/srt-slurm) (`sa-submission-q1-2026` branch) +- `do_sweep.py` skips the benchmark stage when `EVAL_ONLY=true`, runs `_run_post_eval()` directly +- In eval-only mode, uses the full `wait_for_model()` health check (same as benchmark stage) since the benchmark health check was skipped +- `lm-eval` benchmark runner (`benchmarks/lm_eval.py`) sources InferenceX's `benchmark_lib.sh` from the mounted workspace (`/infmax-workspace`) +- Eval artifacts written to `/logs/eval_results/` inside the container, collected by launch scripts +- `runners/launch_gb200-nv.sh` and `launch_gb300-nv.sh` always collect server logs (for debugging) but skip benchmark result collection when `EVAL_ONLY=true` +- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/DP_ATTN`, `DECODE_TP/EP/DP_ATTN`, `MODEL_NAME`, `EVAL_CONC` + +### Workflow structure +- `e2e-tests.yml`: `test-sweep-evals` (single-node) and `test-sweep-multi-node-evals` (multi-node) +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- Both use their respective benchmark templates with `eval-only: true`, `run-eval: true` +- `collect-evals` depends on both eval jobs; `collect-results` only runs when benchmark jobs ran +- `process_changelog.py` splits eval results into `evals` (single-node) and 
`multinode_evals` + +### Score validation +`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails. ## Misc Following files are task definitions from lmeval, more info on changes within the files - `utils/evals/gsm8k.yaml` - `utils/evals/gpqa_diamond.yaml` - - From 7639f3da40cfe4caea3dc89cfdaf01bad0e5c51f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 22:18:50 -0700 Subject: [PATCH 16/22] median instead of max --- .github/workflows/benchmark-multinode-tmpl.yml | 6 ++++++ .github/workflows/e2e-tests.yml | 1 + .github/workflows/run-sweep.yml | 1 + benchmarks/multi_node/amd_utils/server.sh | 8 ++++++-- utils/matrix_logic/generate_sweep_configs.py | 13 +++++++++++-- utils/matrix_logic/validation.py | 2 ++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 29f0bff98..ea086beb7 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,11 @@ on: type: boolean required: false default: false + eval-conc: + description: "Concurrency to use for eval requests (overrides default max-of-conc-list)" + type: string + required: false + default: "" ref: description: "Git ref (branch/sha) to checkout" required: false @@ -107,6 +112,7 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + EVAL_CONC: ${{ inputs.eval-conc }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index eb1a97713..487a4a0c3 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -141,6 +141,7 @@ jobs: decode-additional-settings: ${{ 
toJson(matrix.config.decode.additional-settings) }} run-eval: true eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} ref: ${{ inputs.ref }} test-sweep-single-node: diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index a87f7ee13..e3eaf1c3b 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -226,6 +226,7 @@ jobs: decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} run-eval: true eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} collect-results: needs: diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 2adbcd8df..2d001fd53 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -503,8 +503,12 @@ if [ "$NODE_RANK" -eq 0 ]; then # Source eval functions from benchmark_lib.sh source /workspace/benchmarks/benchmark_lib.sh - # Use max concurrency from benchmark config (conc values are x-separated) - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 4487d07c1..9682c1423 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -40,12 +40,14 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - Mark all entries at the median CONC (all TPs) - Multi-node: for each unique (model, runner, framework, precision, spec-decoding, prefill-dp-attn, 
decode-dp-attn), only 8k1k entries. - Mark the entry with the highest max concurrency. + Mark the entry with the highest max concurrency. Sets eval-conc to the + median of the conc list to avoid OOM during eval. """ from collections import defaultdict target_isl, target_osl = seq_len_stoi["8k1k"] eval_indices = set() + mn_eval_conc = {} # index -> chosen eval concurrency for multinode entries def _max_conc(ie): c = ie[1][Fields.CONC.value] @@ -102,11 +104,18 @@ def _max_conc(ie): mn_groups[key].append((i, entry)) for entries in mn_groups.values(): - eval_indices.add(max(entries, key=_max_conc)[0]) + best_idx, best_entry = max(entries, key=_max_conc) + eval_indices.add(best_idx) + # Set eval-conc to median of the conc list to avoid OOM during eval + conc = best_entry[Fields.CONC.value] + sorted_conc = sorted(conc) if isinstance(conc, list) else [conc] + mn_eval_conc[best_idx] = sorted_conc[len(sorted_conc) // 2] # Mark the selected entries for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices + if i in mn_eval_conc: + entry[Fields.EVAL_CONC.value] = mn_eval_conc[i] return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 2e8626abe..62a92c5ed 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -53,6 +53,7 @@ class Fields(Enum): # Eval RUN_EVAL = 'run-eval' + EVAL_CONC = 'eval-conc' """ @@ -126,6 +127,7 @@ class MultiNodeMatrixEntry(BaseModel): exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool run_eval: bool = Field(alias=Fields.RUN_EVAL.value) + eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: From 4ffd505ee2447fa8f87f50e4a50b45c0e1ab8427 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 06:47:39 -0700 Subject: [PATCH 17/22] config file guard --- runners/launch_b200-dgxc-slurm.sh | 7 +++++++ runners/launch_b300-nv.sh | 7 
+++++++ runners/launch_gb200-nv.sh | 8 ++++++++ runners/launch_gb300-nv.sh | 6 ++++++ runners/launch_h100-dgxc-slurm.sh | 7 +++++++ runners/launch_h200-dgxc-slurm.sh | 7 +++++++ 6 files changed, 42 insertions(+) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..282f597f4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -99,6 +99,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 68da9f2b7..b9d7612ef 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -100,6 +100,13 @@ echo "Running make setup..." make setup ARCH=x86_64 echo "Submitting job with srtctl..." + +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 22bf58665..d84c0ac13 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -107,6 +107,14 @@ PY fi +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" if [ -d "$SRT_REPO_DIR" ]; then diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 2f56c3633..91147d90d 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -102,6 +102,12 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..0cc03ae27 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -113,6 +113,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" sed -i "/^name:.*/a sbatch_directives:\n exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE" diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..f23f1f138 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -113,6 +113,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) From 0d0e1e857cd152fdae665011df5159f5a6b45ddb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 07:21:13 -0700 Subject: [PATCH 18/22] h100/h200/b200/b300 evals --- runners/launch_b200-dgxc-slurm.sh | 88 +++++++++++++++++++------------ runners/launch_b300-nv.sh | 88 +++++++++++++++++++------------ runners/launch_h100-dgxc-slurm.sh | 88 +++++++++++++++++++------------ runners/launch_h200-dgxc-slurm.sh | 88 +++++++++++++++++++------------ 4 files changed, 220 insertions(+), 132 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 282f597f4..0bd734282 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -65,6 +65,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -98,6 +99,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -169,45 +173,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract 
configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." 
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index b9d7612ef..0b40a2a00 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -64,6 +64,7 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" +export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -99,6 +100,9 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=x86_64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -170,45 +174,63 @@ echo "Found logs directory: $LOGS_DIR" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed 
-n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi - echo "Copied 
result file to: $WORKSPACE_RESULT_FILE" - fi +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi -echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 0cc03ae27..29a7e1340 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -75,6 +75,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,6 +113,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -184,45 +188,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename 
+ # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." 
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index f23f1f138..dffef2d28 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -74,6 +74,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,6 +113,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -183,45 +187,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 
's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo 
"Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." From bf615b962ba460aea7554fafd6e1627f8880873e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 13:23:02 -0700 Subject: [PATCH 19/22] Update repo --- runners/launch_b200-dgxc-slurm.sh | 2 +- runners/launch_b300-nv.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0bd734282..fb9bb7b22 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -36,7 +36,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 0b40a2a00..e4100de94 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -35,7 +35,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" 
cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 29a7e1340..91cea74f3 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -41,7 +41,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index dffef2d28..3e7032314 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -40,7 +40,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 From 28a75a2ee91dde844b23bfe02b4d2bc504da34c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 16:26:47 -0700 Subject: [PATCH 20/22] models_name --- runners/launch_b300-nv.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e4100de94..c718dcad0 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -16,11 +16,11 @@ fi # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared B300 cluster. 
if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2" + export MODEL_PATH="/data/models/dsr1-fp4" export SERVED_MODEL_NAME="deepseek-r1-fp4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528" + export MODEL_PATH="/data/models/dsr1-fp8" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else From 98a45e9877caf3a675467a3cdcde11fe8e32b579 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 17:24:26 -0700 Subject: [PATCH 21/22] model config --- benchmarks/benchmark_lib.sh | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b3264cef0..d21f827ae 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -593,14 +593,23 @@ PY get_native_max_context_length() { local model_path="$1" + # Prefer MODEL_PATH (local model directory) when available, since the + # argument may be a served-model name that is neither a valid HF repo + # ID nor a local path (e.g. "deepseek-r1-fp4" on the B300 cluster). 
+ if [ -n "${MODEL_PATH:-}" ] && [ -d "${MODEL_PATH}" ]; then + model_path="${MODEL_PATH}" + fi python3 -c " -from transformers import AutoConfig -config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) -for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: - if hasattr(config, attr): - print(getattr(config, attr)) - break -else: +try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) + for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break + else: + print(0) +except Exception: print(0) " } From de5497449df975f2646c26e98ae391f3d63e321f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 2 Apr 2026 13:25:10 -0700 Subject: [PATCH 22/22] summary table --- .../workflows/benchmark-multinode-tmpl.yml | 9 +- .github/workflows/benchmark-tmpl.yml | 7 + benchmarks/benchmark_lib.sh | 35 +++- benchmarks/multi_node/amd_utils/job.slurm | 2 + benchmarks/multi_node/amd_utils/server.sh | 14 +- benchmarks/multi_node/amd_utils/submit.sh | 2 + utils/collect_eval_results.py | 183 ++++++++++++++---- 7 files changed, 212 insertions(+), 40 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index ea086beb7..4da79d5cd 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -163,6 +163,13 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} @@ -239,7 +246,7 @@ jobs: run: python3 utils/evals/validate_scores.py - name: Cleanup eval outputs (post-upload) - if: ${{ 
env.RUN_EVAL == 'true' || inputs.eval-only }} + if: ${{ always() && (inputs.run-eval || inputs.eval-only) }} run: | rm -f meta_env.json || true rm -f results*.json || true diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 797505eec..25bec61ee 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -140,6 +140,13 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d21f827ae..403484998 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -717,8 +717,32 @@ append_lm_eval_summary() { # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" local model_name="${MODEL_NAME:-$MODEL}" + local is_multinode_json="false" + if [ "${IS_MULTINODE:-false}" = "true" ]; then + is_multinode_json="true" + fi + + local prefill_tp="${PREFILL_TP:-${TP:-1}}" + local prefill_ep="${PREFILL_EP:-${EP_SIZE:-1}}" + local prefill_num_workers="${PREFILL_NUM_WORKERS:-1}" + local decode_tp="${DECODE_TP:-${TP:-1}}" + local decode_ep="${DECODE_EP:-${EP_SIZE:-1}}" + local decode_num_workers="${DECODE_NUM_WORKERS:-1}" + local dp_json="false" - if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + if [ "${DP_ATTENTION:-false}" = "true" ]; then dp_json="true"; fi + local prefill_dp_json="$dp_json" + if [ "${PREFILL_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + prefill_dp_json="true" + else + prefill_dp_json="false" + fi + local decode_dp_json="$dp_json" + if [ "${DECODE_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + decode_dp_json="true" + else + decode_dp_json="false" + fi # Derive 
framework/precision from env, fallback to parsing RESULT_FILENAME # RESULT_FILENAME format (from workflow): @@ -743,6 +767,7 @@ append_lm_eval_summary() { fi cat > "${meta_json}" < str: return '' +def as_int(x: Any, default: int = 0) -> int: + """Convert a metadata field to int with a fallback.""" + try: + return int(x) + except Exception: + return default + + +def as_bool(x: Any, default: bool = False) -> bool: + """Parse a metadata boolean stored as bool/string/int.""" + if isinstance(x, bool): + return x + if x is None: + return default + return str(x).lower() == 'true' + + def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: """Build a result row from metadata and extracted metrics.""" + is_multinode = as_bool(meta.get('is_multinode'), False) + prefill_tp = as_int(meta.get('prefill_tp', meta.get('tp', 1)), 1) + prefill_ep = as_int(meta.get('prefill_ep', meta.get('ep', 1)), 1) + prefill_num_workers = as_int(meta.get('prefill_num_workers', 1), 1) + decode_tp = as_int(meta.get('decode_tp', meta.get('tp', 1)), 1) + decode_ep = as_int(meta.get('decode_ep', meta.get('ep', 1)), 1) + decode_num_workers = as_int(meta.get('decode_num_workers', 1), 1) + prefill_dp_attention = meta.get('prefill_dp_attention') + decode_dp_attention = meta.get('decode_dp_attention') + dp_attention = meta.get('dp_attention', 'none') + + if prefill_dp_attention is None: + prefill_dp_attention = dp_attention + if decode_dp_attention is None: + decode_dp_attention = dp_attention + + if is_multinode: + if prefill_dp_attention == decode_dp_attention: + dp_attention = prefill_dp_attention + else: + dp_attention = f"prefill={str(prefill_dp_attention).lower()},decode={str(decode_dp_attention).lower()}" + row = { + 'is_multinode': is_multinode, 'model_prefix': meta.get('infmax_model_prefix', 'unknown'), 'model': m.get('model') or meta.get('model', 'unknown'), 'hw': meta.get('hw', 'unknown').upper(), 'framework': meta.get('framework', 'unknown').lower(), 'precision': 
meta.get('precision', 'unknown').lower(), 'spec_decoding': meta.get('spec_decoding', 'unknown'), - 'tp': int(meta.get('tp', 1)), - 'ep': int(meta.get('ep', 1)), - 'conc': int(meta.get('conc', 0)), - 'dp_attention': str(meta.get('dp_attention', "none")).lower(), + 'tp': as_int(meta.get('tp', prefill_tp), prefill_tp), + 'ep': as_int(meta.get('ep', prefill_ep), prefill_ep), + 'prefill_tp': prefill_tp, + 'prefill_ep': prefill_ep, + 'prefill_num_workers': prefill_num_workers, + 'decode_tp': decode_tp, + 'decode_ep': decode_ep, + 'decode_num_workers': decode_num_workers, + 'conc': as_int(meta.get('conc', 0), 0), + 'dp_attention': str(dp_attention).lower(), + 'prefill_dp_attention': str(prefill_dp_attention).lower(), + 'decode_dp_attention': str(decode_dp_attention).lower(), 'task': m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), @@ -226,49 +275,111 @@ def main(): row = build_row(meta, m) rows.append(row) + single_node_rows = [r for r in rows if not r['is_multinode']] + multinode_rows = [r for r in rows if r['is_multinode']] + # Sort for stable output (default: by model_prefix) sort_by = sys.argv[3] if len(sys.argv) > 3 else 'model_prefix' - if sort_by == 'hw': - rows.sort(key=lambda r: ( - r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + single_node_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['tp'], r['ep'], r['conc'], )) - else: - rows.sort(key=lambda r: ( - r['model_prefix'], r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'], + )) + ) + multinode_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + 
r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], + )) + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], )) + ) + single_node_rows.sort(key=single_node_sort_key) + multinode_rows.sort(key=multinode_sort_key) if not rows: print('> No eval results found to summarize.') else: # Print table using tabulate MODEL_PREFIX = "Model Prefix" - headers = [ - MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION, - TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL - ] - - table_rows = [ - [ - r['model_prefix'], - r['hw'], - r['framework'].upper(), - r['precision'].upper(), - r['spec_decoding'], - r['tp'], - r['ep'], - r['conc'], - r['dp_attention'], - r['task'], - f"{pct(r['score'])}{se(r['score_se'])}", - f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", - f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", - r['n_eff'] or '', - r['model'] + + if single_node_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in single_node_rows + ] + print("### Single-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + if multinode_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, + DECODE_TP, 
DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, + CONC, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['conc'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in multinode_rows ] - for r in rows - ] + if single_node_rows: + print("\n") + print("### Multi-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) - print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate out_path = Path(f'agg_eval_{exp_name}.json')