From 033aa6ecd15be0257fa60c367eafc6f6bb97f9c7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 16:12:44 -0800 Subject: [PATCH 01/22] init --- .../workflows/benchmark-multinode-tmpl.yml | 23 +++++++++ .github/workflows/e2e-tests.yml | 1 + .github/workflows/run-sweep.yml | 1 + benchmarks/multi_node/amd_utils/job.slurm | 14 +++++ benchmarks/multi_node/amd_utils/server.sh | 51 +++++++++++++++++++ benchmarks/multi_node/amd_utils/submit.sh | 8 +++ runners/launch_mi355x-amds.sh | 14 +++++ utils/matrix_logic/generate_sweep_configs.py | 48 +++++++++++++++-- 8 files changed, 155 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index af3652e6b..6f2a6397a 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -77,6 +77,10 @@ on: required: false type: string default: "[]" + run-eval: + type: boolean + required: false + default: false ref: description: "Git ref (branch/sha) to checkout" required: false @@ -96,6 +100,7 @@ env: CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} + RUN_EVAL: ${{ inputs.run-eval }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -146,6 +151,7 @@ jobs: - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ 
join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | @@ -188,6 +194,23 @@ jobs: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: | + meta_env.json + results*.json + sample*.jsonl + if-no-files-found: ignore + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + - name: Slurm cleanup (post-run) if: always() run: *slurm-cleanup diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index c108b3960..2658b8d94 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -93,6 +93,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: ${{ matrix.config.run-eval }} ref: ${{ inputs.ref }} test-sweep-single-node: diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 22a71afd7..5d2d1dc51 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -105,6 +105,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: ${{ matrix.config.run-eval }} sweep-multi-node-1k8k: needs: setup diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b0352f24..fd37b583d 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -285,6 +285,14 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export DRY_RUN="${DRY_RUN:-0}" export 
BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +# Eval-related env vars (threaded from submit.sh) +export RUN_EVAL="${RUN_EVAL:-false}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" @@ -389,6 +397,12 @@ exec sudo docker run --rm \ -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e RUN_EVAL=\$RUN_EVAL \ + -e FRAMEWORK=\$FRAMEWORK \ + -e PRECISION=\$PRECISION \ + -e MODEL_PREFIX=\$MODEL_PREFIX \ + -e RUNNER_TYPE=\$RUNNER_TYPE \ + -e RESULT_FILENAME=\$RESULT_FILENAME \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index dadea4728..3bd0e5573 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -478,6 +478,57 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." 
+ + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Determine eval concurrency (cap at 64 for eval stability) + IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" + EVAL_CONC="${_conc_arr[0]:-32}" + (( EVAL_CONC > 64 )) && EVAL_CONC=32 + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONC}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME + # are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + + popd + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a2c3622b9..ddf5bcda7 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -103,6 +103,14 @@ export BENCH_NUM_PROMPTS_MULTIPLIER=10 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +# Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) +export RUN_EVAL="${RUN_EVAL:-false}" +export FRAMEWORK="${FRAMEWORK:-}" +export PRECISION="${PRECISION:-}" +export MODEL_PREFIX="${MODEL_PREFIX:-}" +export RUNNER_TYPE="${RUNNER_TYPE:-}" +export RESULT_FILENAME="${RESULT_FILENAME:-}" + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. # SLURM writes output files on the batch node, so /tmp won't work (node-local). # Defaults to a sibling directory of the submit working directory. 
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 2b9902b0b..20da5b5d6 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -129,6 +129,20 @@ PY fi done + # Extract eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$(dirname "$LOGS_DIR")/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi + fi + echo "All result files processed" # Use sync scancel to ensure nfs file handle is released in time set +x diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 48bac118f..e19acb164 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -33,12 +33,14 @@ def seq_len_to_str(isl: int, osl: int) -> str: return seq_len_itos.get((isl, osl), f"{isl}_{osl}") def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - """Eval selection policy (single-node only): + """Eval selection policy: - Only consider 1k8k (isl=1024, osl=8192). - - For each unique (model, runner, framework, precision, isl, osl, spec-decoding): - - Mark highest TP with highest conc - - Mark lowest TP with highest conc - + - Single-node: for each unique (model, runner, framework, precision, isl, osl, + spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP + with highest conc. + - Multi-node: for each unique (model, runner, framework, precision, isl, osl, + spec-decoding), mark the entry with the highest max concurrency. + Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. 
""" @@ -46,6 +48,8 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # Only run evals on 1k8k target_isl, target_osl = seq_len_stoi["1k8k"] + + # --- Single-node eval selection --- # Group entries by (model, runner, framework, precision, isl, osl) # Only include entries that have a top-level TP (i.e., single-node schema). # This avoids relying on structural hints like prefill/decode which may be @@ -98,6 +102,40 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if e[Fields.CONC.value] == max_conc_lowest_tp: eval_indices.add(i) + # --- Multi-node eval selection --- + # For multi-node (disaggregated) entries, pick one representative per group + # with the highest max concurrency. + mn_groups = defaultdict(list) + for i, entry in enumerate(matrix_values): + if Fields.TP.value in entry: + continue # single-node, already handled + if Fields.PREFILL.value not in entry: + continue + + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue + + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + entry[Fields.ISL.value], + entry[Fields.OSL.value], + entry[Fields.SPEC_DECODING.value], + ) + mn_groups[key].append((i, entry)) + + for key, entries in mn_groups.items(): + if not entries: + continue + # Pick entry with highest max concurrency + def _max_conc(ie): + c = ie[1][Fields.CONC.value] + return max(c) if isinstance(c, list) else c + best = max(entries, key=_max_conc) + eval_indices.add(best[0]) + # Mark the selected entries for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices From c177baaaad143e552c52caa9c8eaa0f5e41d15a7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 16:33:48 -0800 Subject: [PATCH 02/22] add mat --- utils/matrix_logic/generate_sweep_configs.py | 41 +++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git 
a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e19acb164..ca075d6f1 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -38,8 +38,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - Single-node: for each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - - Multi-node: for each unique (model, runner, framework, precision, isl, osl, - spec-decoding), mark the entry with the highest max concurrency. + - Multi-node: for each unique (model, runner, framework, precision, + spec-decoding), prefer 1k8k entries if available, otherwise fall back to + any seq-len. Mark the entry with the highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. @@ -103,8 +104,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: eval_indices.add(i) # --- Multi-node eval selection --- - # For multi-node (disaggregated) entries, pick one representative per group - # with the highest max concurrency. + # For multi-node (disaggregated) entries, pick one representative per group. + # Prefer 1k8k if available (matching single-node policy), otherwise fall back + # to whatever seq-len exists so eval coverage is not skipped entirely. + # Within a group, pick the entry with the highest max concurrency. 
mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -112,16 +115,11 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if Fields.PREFILL.value not in entry: continue - if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: - continue - key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], - entry[Fields.ISL.value], - entry[Fields.OSL.value], entry[Fields.SPEC_DECODING.value], ) mn_groups[key].append((i, entry)) @@ -129,11 +127,18 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: for key, entries in mn_groups.items(): if not entries: continue + + # Prefer 1k8k entries; fall back to all entries if none exist + preferred = [(i, e) for i, e in entries + if e.get(Fields.ISL.value) == target_isl + and e.get(Fields.OSL.value) == target_osl] + candidates = preferred if preferred else entries + # Pick entry with highest max concurrency def _max_conc(ie): c = ie[1][Fields.CONC.value] return max(c) if isinstance(c, list) else c - best = max(entries, key=_max_conc) + best = max(candidates, key=_max_conc) eval_indices.add(best[0]) # Mark the selected entries @@ -619,9 +624,18 @@ def generate_test_config_sweep(args, all_config_data): runner = val[Fields.RUNNER.value] disagg = val.get(Fields.DISAGG.value, False) + # Build seq-len filter if --seq-lens was provided + seq_lens_filter = None + if getattr(args, 'seq_lens', None): + seq_lens_filter = {seq_len_stoi[s] for s in args.seq_lens} + for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: isl = seq_len_config[Fields.ISL.value] osl = seq_len_config[Fields.OSL.value] + + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + seq_len_str = seq_len_to_str(isl, osl) for bmk in seq_len_config[Fields.SEARCH_SPACE.value]: @@ -930,6 +944,13 @@ def main(): required=False, help='Only include these concurrency values. 
Values must exist in the config conc-range/list.' ) + test_config_keys_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help='Only include these sequence length configurations (e.g., 1k1k 8k1k)' + ) test_config_keys_parser.add_argument( '-h', '--help', action='help', From 6988322ddbf3996be574e39104a220fc2a316d32 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Thu, 19 Feb 2026 19:17:30 -0800 Subject: [PATCH 03/22] Increase Eval Conc --- benchmarks/multi_node/amd_utils/server.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 3bd0e5573..1a441819c 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -490,8 +490,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Determine eval concurrency (cap at 64 for eval stability) IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" - EVAL_CONC="${_conc_arr[0]:-32}" - (( EVAL_CONC > 64 )) && EVAL_CONC=32 + EVAL_CONC="${_conc_arr[0]:-64}" if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" From c0d008ba5c75b3d8d6f01642c9b708c99c6ad2a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 19 Feb 2026 22:13:35 -0800 Subject: [PATCH 04/22] 8k1k evals instead of 1k1k --- benchmarks/multi_node/amd_utils/submit.sh | 1 + utils/matrix_logic/generate_sweep_configs.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index ddf5bcda7..5aa476c63 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -110,6 +110,7 @@ export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" export RUNNER_TYPE="${RUNNER_TYPE:-}" export 
RESULT_FILENAME="${RESULT_FILENAME:-}" +export SPEC_DECODING="${SPEC_DECODING:-}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. # SLURM writes output files on the batch node, so /tmp won't work (node-local). diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index ca075d6f1..54c687059 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -39,8 +39,8 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding), prefer 1k8k entries if available, otherwise fall back to - any seq-len. Mark the entry with the highest max concurrency. + spec-decoding), prefer 1k8k entries; fall back to 8k1k if unavailable + (never 1k1k). Mark the entry with the highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. @@ -105,9 +105,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 1k8k if available (matching single-node policy), otherwise fall back - # to whatever seq-len exists so eval coverage is not skipped entirely. + # Prefer 1k8k; fall back to 8k1k if unavailable (never 1k1k). # Within a group, pick the entry with the highest max concurrency. 
+ fallback_isl, fallback_osl = seq_len_stoi["8k1k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -128,17 +128,22 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 1k8k entries; fall back to all entries if none exist + # Prefer 1k8k entries; fall back to 8k1k preferred = [(i, e) for i, e in entries if e.get(Fields.ISL.value) == target_isl and e.get(Fields.OSL.value) == target_osl] - candidates = preferred if preferred else entries + if not preferred: + preferred = [(i, e) for i, e in entries + if e.get(Fields.ISL.value) == fallback_isl + and e.get(Fields.OSL.value) == fallback_osl] + if not preferred: + continue # Pick entry with highest max concurrency def _max_conc(ie): c = ie[1][Fields.CONC.value] return max(c) if isinstance(c, list) else c - best = max(candidates, key=_max_conc) + best = max(preferred, key=_max_conc) eval_indices.add(best[0]) # Mark the selected entries From d73bf3d76862e94c9e34195c37690c6c1704edef Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 20 Feb 2026 09:45:51 -0800 Subject: [PATCH 05/22] reduce conc --- benchmarks/multi_node/amd_utils/server.sh | 99 +++++++++++++---------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 1a441819c..285945b02 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -482,50 +482,65 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "${RUN_EVAL:-false}" == "true" ]]; then echo "Running lm-eval evaluation on Node 0..." 
- # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Determine eval concurrency (cap at 64 for eval stability) - IFS='x' read -r -a _conc_arr <<< "${BENCH_MAX_CONCURRENCY}" - EVAL_CONC="${_conc_arr[0]:-64}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" - - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONC}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME - # are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no 
match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR" + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Cap eval concurrency at 32 for stability + EVAL_CONC=256 + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONC}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME + # are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + + popd fi - - popd fi # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) From ab179c73046b36891eb6e8f8b35b1dcda834faf6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 25 Feb 2026 11:44:24 -0800 Subject: [PATCH 06/22] Eval table missing spec decode --- benchmarks/multi_node/amd_utils/job.slurm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index fd37b583d..87d4dcc9d 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -292,6 +292,7 @@ export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" export RUNNER_TYPE="${RUNNER_TYPE:-}" export RESULT_FILENAME="${RESULT_FILENAME:-}" +export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" @@ -403,6 +404,7 @@ exec sudo docker run --rm \ -e MODEL_PREFIX=\$MODEL_PREFIX \ -e RUNNER_TYPE=\$RUNNER_TYPE \ -e RESULT_FILENAME=\$RESULT_FILENAME \ + -e SPEC_DECODING=\$SPEC_DECODING \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' From 86629313bfe0beea5dde71c8a61d9f9c88bd1458 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 11 Mar 2026 13:11:08 -0700 Subject: [PATCH 07/22] fix: force-reinstall pinned lm-eval to override Docker image version The sglang 0.5.8 Docker image ships a newer lm-eval 0.4.9.2 commit that defaults fewshot_as_multiturn=True for chat-completion models. Since the version string matches the pinned commit, pip silently skips the install. Adding --force-reinstall ensures the pinned commit is always used regardless of what's pre-installed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/benchmark_lib.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f69d3c418..326b796dd 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -489,13 +489,13 @@ _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then - if ! python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + if ! python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@${lm_eval_ref}"; then - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi else - python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \ + python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \ "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true fi } From d44f10d048b6f6d3d282aea2cdeaaf5e2fc491f1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 11 Mar 2026 16:31:59 -0700 Subject: [PATCH 08/22] add fp8 disagg no-DPA eval config to isolate DPA as variable Adds dsr1-fp8-mi355x-sglang-disagg-nodpa-eval: same image/model/precision as the DPA config but with dp-attn=false and ep=1. Running evals on this will tell us if DPA is the cause of the 0% GSM8K score or if it's something else about the fp8 disagg setup. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 337047e57..e1f3123e2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -595,6 +595,38 @@ dsr1-fp8-mi355x-atom-mtp: search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } +# Eval-only: fp8 disagg WITHOUT DPA — isolates DPA as variable +dsr1-fp8-mi355x-sglang-disagg-nodpa-eval: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 From e5c63dcff47c5ed244866bdb0d2e7b97e9abf871 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 26 Mar 2026 12:00:00 -0700 Subject: [PATCH 09/22] nvda evals --- .github/configs/amd-master.yaml | 4 +- .../workflows/benchmark-multinode-tmpl.yml | 61 ++++++++++++++++--- .github/workflows/e2e-tests.yml | 49 ++++++++++++++- .github/workflows/run-sweep.yml | 52 ++++++++++++---- AGENTS.md | 50 +++++++++++---- benchmarks/multi_node/amd_utils/job.slurm | 2 + benchmarks/multi_node/amd_utils/server.sh | 8 ++- runners/launch_gb200-nv.sh | 21 ++++++- runners/launch_gb300-nv.sh | 21 ++++++- runners/launch_mi355x-amds.sh | 56 ++++++++++------- utils/matrix_logic/generate_sweep_configs.py | 21 ++++--- utils/matrix_logic/validation.py | 2 + 
utils/process_changelog.py | 9 +++ 13 files changed, 284 insertions(+), 72 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index dddee854e..223a2bd07 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1061,7 +1061,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1269,7 +1269,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b94ac86a1..5d3035a5f 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -81,6 +81,11 @@ on: type: boolean required: false default: false + eval-only: + description: "Run only evals (skip throughput benchmark)" + type: boolean + required: false + default: false ref: description: "Git ref (branch/sha) to checkout" required: false @@ -101,6 +106,7 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + EVAL_ONLY: ${{ inputs.eval-only }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -119,7 +125,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ 
inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}" + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Slurm cleanup (pre-run) @@ -142,6 +148,9 @@ jobs: fi fi + - name: Clean up root-owned files from previous runs + run: sudo rm -rf benchmark_logs benchmark_artifacts 2>/dev/null || true + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} @@ -162,16 +171,26 @@ jobs: export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh - # Check if at least one result file was created - if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then - echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV - echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + if [ "${{ inputs.eval-only }}" = "true" ]; then + echo "Eval-only mode: skipping benchmark result file check" + # Verify eval produced results + if ! ls results*.json 1>/dev/null 2>&1; then + echo "Eval-only run failed: no results*.json files found." 
>&2 + exit 1 + fi else - echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 - exit 1 + # Check if at least one result file was created + if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then + echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV + echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)" + else + echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2 + exit 1 + fi fi - name: Process results + if: ${{ !inputs.eval-only }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | @@ -192,13 +211,14 @@ jobs: done - name: Upload results + if: ${{ !inputs.eval-only }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}_*.json - name: Upload eval results (if any) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} @@ -208,8 +228,31 @@ jobs: sample*.jsonl if-no-files-found: ignore + - name: Verify eval scores + if: ${{ inputs.eval-only }} + run: | + python3 << 'PYEOF' + import json, glob, sys + MIN_SCORE = 0.85 + failed = False + for f in glob.glob("results*.json"): + with open(f) as fh: + data = json.load(fh) + for task, metrics in data.get("results", {}).items(): + for name, val in metrics.items(): + if not name.startswith("exact_match,") or "stderr" in name: + continue + if isinstance(val, (int, float)) and val < MIN_SCORE: + print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr) + failed = True + elif isinstance(val, (int, float)): + print(f"PASS: {task} {name} = {val:.4f}") + if failed: + sys.exit(1) + PYEOF + - name: Cleanup eval outputs (post-upload) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }} run: | rm -f meta_env.json || 
true rm -f results*.json || true diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index c3afe42d7..620addebe 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -54,9 +55,11 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") - MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))") + MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))") + MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT test-sweep-multi-node: needs: get-jobs @@ -94,7 +97,47 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-multi-node-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.multi-node-eval-config != '[]' }} + uses: 
./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} test-sweep-single-node: @@ -136,7 +179,7 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node, test-sweep-single-node] + needs: [test-sweep-multi-node-evals, test-sweep-single-node] if: ${{ always() }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index afd04c808..b575e706b 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -105,7 +105,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn 
}} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false sweep-multi-node-1k8k: needs: setup @@ -184,6 +184,45 @@ jobs: secrets: inherit with: *single-node-inputs + sweep-multi-node-evals: + needs: setup + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + run-eval: true + eval-only: true + collect-results: needs: [ @@ -202,16 +241,7 @@ jobs: result-prefix: "bmk" collect-evals: - needs: - [ - 
sweep-single-node-1k1k, - sweep-single-node-1k8k, - sweep-single-node-8k1k, - sweep-multi-node-1k1k, - sweep-multi-node-1k8k, - sweep-multi-node-8k1k, - setup, - ] + needs: [sweep-multi-node-evals, setup] if: ${{ always() && needs.setup.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/AGENTS.md b/AGENTS.md index 6bb4a86c8..787978cfc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,8 +37,9 @@ InferenceX is an open-source, automated benchmarking system that continuously tr │ ├── workflows/ # GitHub Actions CI/CD │ │ ├── run-sweep.yml # Main performance sweep │ │ ├── e2e-tests.yml # End-to-end testing -│ │ ├── benchmark-tmpl.yml # Benchmark job template -│ │ └── collect-evals.yml # Eval results collection +│ │ ├── benchmark-tmpl.yml # Single-node benchmark job template +│ │ ├── benchmark-multinode-tmpl.yml # Multi-node benchmark job template +│ │ └── collect-evals.yml # Eval results collection │ └── configs/ # Master configuration files │ ├── nvidia-master.yaml │ ├── amd-master.yaml @@ -300,14 +301,27 @@ Evals run optional accuracy checks after throughput benchmarks to ensure model o ### When Evals Run -Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two representative points per configuration group: +Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval. 
-- **Lowest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) -- **Highest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) +**Single-node** eval selection (from PR #911): +- All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +- Only on `8k1k` sequence length + +**Multi-node** eval selection: +- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding) +- Prefers `8k1k`; falls back to `1k8k` (never `1k1k`) This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. -**Note**: Evals only run on `1k8k` sequence length. +**Workflow separation**: Eval jobs are independent from benchmark jobs: +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- `e2e-tests.yml`: `test-sweep-evals` and `test-sweep-multi-node-evals` +- Both use their respective benchmark templates with `eval-only: true` +- `collect-evals` depends only on eval jobs, not benchmark jobs + +**Multi-node eval infrastructure**: +- AMD (MI355X): `server.sh` skips `bench.sh` when `EVAL_ONLY=true`, runs lm-eval directly +- NVIDIA (GB200/GB300): Uses srt-slurm `infmax-eval` benchmark type with expanded `eval_context_length` ### Eval Framework: lm-eval @@ -329,19 +343,28 @@ python utils/matrix_logic/generate_sweep_configs.py full-sweep \ ### Eval Integration in Benchmark Scripts -All benchmark scripts in `benchmarks/` follow this pattern: - +**Single-node** scripts in `benchmarks/single_node/` follow this pattern: ```bash -# 1. Start server +# 1. Start server (with --context-length expansion if EVAL_ONLY=true) # 2. wait_for_server_ready -# 3. run_benchmark_serving (throughput) -# 4. Conditionally run evals: +# 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) +# 4. 
Run evals: if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary # Writes meta_env.json and moves artifacts fi ``` +**Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`): +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Copies eval artifacts to `/run_logs/slurm_job-*/eval_results/` + +**Multi-node NVIDIA** (GB200/GB300 via srt-slurm): +- Uses `benchmark.type: "infmax-eval"` in srt-slurm config +- `benchmark.eval_context_length` expands server context for eval +- `infmax-eval` benchmark runner sources `benchmark_lib.sh` from `INFMAX_WORKSPACE` + ### Key Eval Functions in `benchmarks/benchmark_lib.sh` | Function | Description | @@ -391,10 +414,13 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| | `RUN_EVAL` | `false` | Enable eval after throughput | +| `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | | `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | | `NUM_FEWSHOT` | `2` | Number of few-shot examples | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by compute_eval_context_length) | +| `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 87d4dcc9d..eb993f64e 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -287,6 +287,7 @@ export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" +export 
EVAL_ONLY="${EVAL_ONLY:-false}" export FRAMEWORK="${FRAMEWORK:-}" export PRECISION="${PRECISION:-}" export MODEL_PREFIX="${MODEL_PREFIX:-}" @@ -399,6 +400,7 @@ exec sudo docker run --rm \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ -e RUN_EVAL=\$RUN_EVAL \ + -e EVAL_ONLY=\$EVAL_ONLY \ -e FRAMEWORK=\$FRAMEWORK \ -e PRECISION=\$PRECISION \ -e MODEL_PREFIX=\$MODEL_PREFIX \ diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 2bb686eca..9271c4382 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -468,7 +468,9 @@ if [ "$NODE_RANK" -eq 0 ]; then ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -501,8 +503,8 @@ if [ "$NODE_RANK" -eq 0 ]; then # Source eval functions from benchmark_lib.sh source /workspace/benchmarks/benchmark_lib.sh - # Cap eval concurrency at 32 for stability - EVAL_CONC=256 + # Use max concurrency from benchmark config (conc values are x-separated) + EVAL_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..e1ecc76a0 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -50,6 +50,8 @@ NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -112,7 +114,7 @@ if [ -d "$SRT_REPO_DIR" ]; 
then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -164,6 +166,9 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name @@ -271,3 +276,17 @@ else fi echo "All result files processed" + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index d71fd5af7..079d5169e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -31,6 +31,8 @@ NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#] srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" +export EVAL_ONLY="${EVAL_ONLY:-false}" + export ISL="$ISL" export OSL="$OSL" @@ -41,7 +43,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 @@ -95,6 +97,9 @@ cat srtslurm.yaml echo "Running make setup..." 
make setup ARCH=aarch64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name @@ -199,6 +204,20 @@ fi echo "All result files processed" +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi + # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 406072b2d..cfc4862af 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -51,6 +51,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then mkdir -p "$BENCHMARK_LOGS_DIR" sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + # Ensure root-owned files are cleaned up even on early exit to prevent + # EACCES errors when the next GH Actions job checks out on this runner + trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" @@ -101,45 +105,48 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + cat > collect_latest_results.py <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), 
int(sys.argv[3]), int(sys.argv[4]) for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls -la "$LOGS_DIR" - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 fi - done + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + # Result JSON are contained within the result directory + for result_file in $(find $LOGS_DIR -type f); do + # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json + file_name=$(basename $result_file) + if [ -f $result_file ]; then + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + fi # Extract eval results if eval was requested if [[ "${RUN_EVAL:-false}" == "true" ]]; then - EVAL_DIR="$(dirname "$LOGS_DIR")/eval_results" - if [ -d "$EVAL_DIR" ]; then + # Find eval_results in the slurm job logs directory + EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1) + if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" for eval_file in "$EVAL_DIR"/*; do [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" echo "Copied eval artifact: $(basename "$eval_file")" done else - echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs" fi fi @@ -160,6 +167,9 @@ PY echo "Logs copied to $ARTIFACT_DIR for artifact upload" fi + # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + else export HF_HUB_CACHE_MOUNT="/var/lib/hf-hub-cache/" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 7f6bb11ea..850fecd6a 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -40,8 +40,9 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP with highest conc. - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding), prefer 1k8k entries; fall back to 8k1k if unavailable - (never 1k1k). Mark the entry with the highest max concurrency. + spec-decoding, prefill-dp-attn, decode-dp-attn), prefer 8k1k entries; + fall back to 1k8k if unavailable (never 1k1k). Mark the entry with the + highest max concurrency. Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated independently. 
@@ -106,9 +107,11 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 1k8k; fall back to 8k1k if unavailable (never 1k1k). + # Prefer 8k1k; fall back to 1k8k if unavailable (never 1k1k). # Within a group, pick the entry with the highest max concurrency. - fallback_isl, fallback_osl = seq_len_stoi["8k1k"] + # Multi-node: prefer 8k1k, fallback to 1k8k + mn_target_isl, mn_target_osl = seq_len_stoi["8k1k"] + fallback_isl, fallback_osl = seq_len_stoi["1k8k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -116,12 +119,16 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if Fields.PREFILL.value not in entry: continue + prefill_dp = entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value) + decode_dp = entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value) key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], entry[Fields.SPEC_DECODING.value], + prefill_dp, + decode_dp, ) mn_groups[key].append((i, entry)) @@ -129,10 +136,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 1k8k entries; fall back to 8k1k + # Prefer 8k1k entries; fall back to 1k8k preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == target_isl - and e.get(Fields.OSL.value) == target_osl] + if e.get(Fields.ISL.value) == mn_target_isl + and e.get(Fields.OSL.value) == mn_target_osl] if not preferred: preferred = [(i, e) for i, e in entries if e.get(Fields.ISL.value) == fallback_isl diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ad7658176..2e8626abe 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -360,6 +360,8 @@ class ChangelogMatrixEntry(BaseModel): ] = 
Field(default_factory=dict) multi_node: dict[str, list[MultiNodeMatrixEntry] ] = Field(default_factory=dict) + evals: list[SingleNodeMatrixEntry] = Field(default_factory=list) + multinode_evals: list[MultiNodeMatrixEntry] = Field(default_factory=list) changelog_metadata: ChangelogMetadata diff --git a/utils/process_changelog.py b/utils/process_changelog.py index d17fc3729..6b4c7878c 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -81,6 +81,8 @@ def main(): final_results = { "single_node": defaultdict(list), "multi_node": defaultdict(list), + "evals": [], + "multinode_evals": [], "changelog_metadata": { "base_ref": args.base_ref, "head_ref": args.head_ref, @@ -131,6 +133,7 @@ def main(): all_results.extend(json.loads(result.stdout)) + all_eval_results = [] for result in all_results: seq_len_str = seq_len_to_str(result["isl"], result["osl"]) if "prefill" in result and result["prefill"] is not None: @@ -138,6 +141,12 @@ def main(): else: final_results["single_node"][seq_len_str].append(result) + if result.get("run-eval"): + all_eval_results.append(result) + + final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None] + final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None] + # Validate final results structure validated = ChangelogMatrixEntry.model_validate(final_results) print(validated.model_dump_json(by_alias=True)) From 7215f1f88929dde27ba1173ce0b519235c26be8d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 11:00:07 -0700 Subject: [PATCH 10/22] merge main --- .github/configs/amd-master.yaml | 306 +---- .github/configs/nvidia-master.yaml | 1145 ++--------------- .github/workflows/README.md | 6 +- .../workflows/benchmark-multinode-tmpl.yml | 24 +- .github/workflows/benchmark-tmpl.yml | 60 +- .github/workflows/claude.yml | 7 +- .github/workflows/e2e-tests.yml | 42 +- .github/workflows/profile.yml | 5 +- 
.github/workflows/run-sweep.yml | 66 +- AGENTS.md | 59 +- benchmarks/benchmark_lib.sh | 128 +- benchmarks/single_node/dsr1_fp4_b200.sh | 9 +- benchmarks/single_node/dsr1_fp4_b200_trt.sh | 8 +- .../single_node/dsr1_fp4_b200_trt_mtp.sh | 12 +- benchmarks/single_node/dsr1_fp4_mi355x.sh | 9 +- .../single_node/dsr1_fp4_mi355x_atom.sh | 7 +- .../single_node/dsr1_fp4_mi355x_atom_mtp.sh | 7 +- benchmarks/single_node/dsr1_fp8_b200.sh | 9 +- benchmarks/single_node/dsr1_fp8_b200_mtp.sh | 9 +- benchmarks/single_node/dsr1_fp8_b200_trt.sh | 16 +- .../single_node/dsr1_fp8_b200_trt_mtp.sh | 17 +- benchmarks/single_node/dsr1_fp8_h200.sh | 14 +- benchmarks/single_node/dsr1_fp8_h200_trt.sh | 8 +- .../single_node/dsr1_fp8_h200_trt_mtp.sh | 7 +- benchmarks/single_node/dsr1_fp8_mi300x.sh | 9 +- benchmarks/single_node/dsr1_fp8_mi325x.sh | 10 +- benchmarks/single_node/dsr1_fp8_mi355x.sh | 9 +- .../single_node/dsr1_fp8_mi355x_atom.sh | 7 +- .../single_node/dsr1_fp8_mi355x_atom_mtp.sh | 8 +- benchmarks/single_node/glm5_fp8_b200.sh | 9 +- benchmarks/single_node/glm5_fp8_h200.sh | 10 +- benchmarks/single_node/glm5_fp8_mi355x.sh | 9 +- benchmarks/single_node/glm5_nvfp4_b200.sh | 81 ++ benchmarks/single_node/gptoss_fp4_b200.sh | 9 +- benchmarks/single_node/gptoss_fp4_b200_trt.sh | 8 +- benchmarks/single_node/gptoss_fp4_h100.sh | 14 +- benchmarks/single_node/gptoss_fp4_h200.sh | 12 +- benchmarks/single_node/gptoss_fp4_h200_trt.sh | 14 +- benchmarks/single_node/gptoss_fp4_mi300x.sh | 6 +- benchmarks/single_node/gptoss_fp4_mi325x.sh | 6 +- benchmarks/single_node/gptoss_fp4_mi355x.sh | 6 +- .../single_node/gptoss_fp4_mi355x_atom.sh | 7 +- benchmarks/single_node/kimik2.5_fp4_b200.sh | 7 +- benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 8 +- benchmarks/single_node/kimik2.5_int4_b200.sh | 6 +- benchmarks/single_node/kimik2.5_int4_h200.sh | 8 +- .../single_node/kimik2.5_int4_mi300x.sh | 77 ++ .../single_node/kimik2.5_int4_mi325x.sh | 10 +- .../single_node/kimik2.5_int4_mi355x.sh | 7 +- 
.../single_node/minimaxm2.5_fp8_b200.sh | 7 +- .../single_node/minimaxm2.5_fp8_h100.sh | 9 +- .../single_node/minimaxm2.5_fp8_h200.sh | 9 +- .../single_node/minimaxm2.5_fp8_mi300x.sh | 7 +- .../single_node/minimaxm2.5_fp8_mi325x.sh | 16 +- .../single_node/minimaxm2.5_fp8_mi355x.sh | 8 +- benchmarks/single_node/qwen3.5_bf16_b200.sh | 6 +- benchmarks/single_node/qwen3.5_bf16_mi300x.sh | 10 +- benchmarks/single_node/qwen3.5_bf16_mi325x.sh | 10 +- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 9 +- benchmarks/single_node/qwen3.5_fp8_b200.sh | 6 +- .../single_node/qwen3.5_fp8_b200_mtp.sh | 6 +- benchmarks/single_node/qwen3.5_fp8_h200.sh | 6 +- benchmarks/single_node/qwen3.5_fp8_mi300x.sh | 10 +- benchmarks/single_node/qwen3.5_fp8_mi325x.sh | 10 +- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 9 +- perf-changelog.yaml | 137 +- runners/launch_b200-dgxc.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_h200-nb.sh | 2 +- runners/launch_mi300x-amds.sh | 4 +- utils/bench_serving/backend_request_func.py | 75 +- utils/evals/EVALS.md | 4 +- utils/evals/gsm8k.yaml | 2 +- utils/evals/thresholds.json | 4 + utils/evals/validate_scores.py | 90 ++ utils/matrix_logic/generate_sweep_configs.py | 81 +- .../test_generate_sweep_configs.py | 3 - utils/process_changelog.py | 90 +- 78 files changed, 1328 insertions(+), 1643 deletions(-) create mode 100755 benchmarks/single_node/glm5_nvfp4_b200.sh create mode 100755 benchmarks/single_node/kimik2.5_int4_mi300x.sh create mode 100644 utils/evals/thresholds.json create mode 100644 utils/evals/validate_scores.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2c34a93f4..6890126cf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -12,10 +12,6 @@ dsr1-fp4-mi355x-sglang: search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 
search-space: @@ -35,11 +31,6 @@ dsr1-fp4-mi355x-atom: search-space: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 128, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -61,11 +52,6 @@ dsr1-fp4-mi355x-atom-mtp: search-space: - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -85,10 +71,6 @@ dsr1-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -107,10 +89,6 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -129,10 +107,6 @@ dsr1-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -152,10 +126,6 @@ qwen3.5-bf16-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -174,10 +144,6 @@ qwen3.5-bf16-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -196,10 +162,6 @@ qwen3.5-bf16-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, 
conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -218,10 +180,6 @@ qwen3.5-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -240,10 +198,6 @@ qwen3.5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -262,10 +216,6 @@ qwen3.5-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -284,10 +234,6 @@ glm5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -306,17 +252,13 @@ kimik2.5-int4-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -328,8 +270,22 @@ kimik2.5-int4-mi325x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +kimik2.5-int4-mi300x-vllm: + image: vllm/vllm-openai-rocm:v0.18.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi300x + precision: int4 + framework: vllm + multinode: false + seq-len-configs: - isl: 1024 - osl: 8192 + osl: 1024 search-space: - { tp: 8, 
conc-start: 4, conc-end: 64 } - isl: 8192 @@ -351,12 +307,6 @@ kimik2.5-fp4-mi355x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -378,12 +328,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -405,11 +349,6 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -417,7 +356,7 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -429,17 +368,12 @@ minimaxm2.5-fp8-mi325x-vllm: osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -457,13 +391,6 @@ gptoss-fp4-mi300x-vllm: - { tp: 
2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -488,13 +415,6 @@ gptoss-fp4-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -518,12 +438,6 @@ gptoss-fp4-mi355x-vllm: - { tp: 1, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -545,11 +459,6 @@ gptoss-fp4-mi355x-atom: search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -570,10 +479,6 @@ dsr1-fp8-mi355x-atom: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -593,10 +498,6 @@ dsr1-fp8-mi355x-atom-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ 
-943,129 +844,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 
1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 @@ -1485,49 +1263,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - - # FIXME(billishyahao): disable FP4 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f4570fd2c..157a9b54c 100644 --- a/.github/configs/nvidia-master.yaml +++ 
b/.github/configs/nvidia-master.yaml @@ -1663,11 +1663,6 @@ dsr1-fp4-b200-sglang: search-space: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1694,17 +1689,6 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - # low concurrency cases use TP only - # concurrency 64 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -1737,17 +1721,6 @@ dsr1-fp4-b200-trt-mtp: - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # TP=4 configurations - - { tp: 4, conc-start: 16, conc-end: 16, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 8, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1772,10 +1745,6 @@ dsr1-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 
1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1795,10 +1764,6 @@ qwen3.5-bf16-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1818,11 +1783,6 @@ qwen3.5-fp8-b200-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1842,14 +1802,28 @@ glm5-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + +glm5-nvfp4-b200-sglang: + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.9-cu130 @@ -1864,10 +1838,6 @@ qwen3.5-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1886,10 +1856,6 @@ kimik2.5-int4-b200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ 
-1908,10 +1874,6 @@ kimik2.5-int4-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1931,10 +1893,6 @@ kimik2.5-fp4-b200-vllm: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1954,10 +1912,6 @@ dsr1-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1978,11 +1932,6 @@ dsr1-fp8-b200-trt: - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256} - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -2007,13 +1956,6 @@ dsr1-fp8-b200-trt-mtp: # If CONC == 256, then TP8, EP8, DP_ATTN=true - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # mostly TP8 - # If CONC >= 128, then TP8, EP8, DP_ATTN=true - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -2033,10 +1975,6 @@ dsr1-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2055,10 +1993,6 @@ 
qwen3.5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2077,10 +2011,6 @@ glm5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2101,11 +2031,6 @@ dsr1-fp8-h200-trt: # If CONC > 64, then DP_ATTN=true search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true @@ -2129,12 +2054,6 @@ dsr1-fp8-h200-trt-mtp: # If CONC >= 128, then DP_ATTN=true, MTP=1 - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -3149,14 +3068,6 @@ gptoss-fp4-b200-trt: - { tp: 4, conc-start: 4, conc-end: 4 } - { tp: 8, conc-start: 4, conc-end: 4 } # Low ==> high TP from Left to Right of pareto - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 256, conc-end: 256} - - { tp: 2, conc-start: 128, conc-end: 256} - - { tp: 4, conc-start: 4, conc-end: 256} - - { tp: 8, conc-start: 4, conc-end: 4} - # Low ==> high TP from Left to Right of pareto - isl: 8192 osl: 1024 search-space: @@ -3181,13 +3092,6 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, 
conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3210,11 +3114,6 @@ minimaxm2.5-fp8-b200-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3222,7 +3121,7 @@ minimaxm2.5-fp8-b200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -3236,12 +3135,6 @@ gptoss-fp4-h100-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3250,7 +3143,7 @@ gptoss-fp4-h100-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } minimaxm2.5-fp8-h100-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -3263,11 +3156,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3345,67 +3233,6 @@ dsr1-fp8-h100-dynamo-sglang: tp: 16 ep: 16 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # # STP: Max throughput TEP (1 prefill, 2 decode) - # - conc-list: [1, 2, 4, 8, 16, 32] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # 
additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 2 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - isl: 8192 osl: 1024 search-space: @@ -3485,13 +3312,6 @@ gptoss-fp4-h200-trt: - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3501,7 +3321,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b 
model-prefix: gptoss runner: h200 @@ -3516,13 +3336,6 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3532,7 +3345,7 @@ gptoss-fp4-h200-vllm: - { tp: 8, conc-start: 4, conc-end: 32 } minimaxm2.5-fp8-h200-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -3544,10 +3357,6 @@ minimaxm2.5-fp8-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -3744,8 +3553,8 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" @@ -3756,105 +3565,89 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 7 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [ 128 ] + conc-list: [ 180 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 512 ] + conc-list: [ 1229 ] prefill: - num-worker: 1 + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 3072 ] + conc-list: [ 666 ] prefill: - num-worker: 1 + num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: - num-worker: 3 - tp: 16 - ep: 16 + num-worker: 1 + tp: 32 + ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 6144 ] + conc-list: [ 4301 ] prefill: - num-worker: 1 + num-worker: 11 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: - num-worker: 3 + num-worker: 1 tp: 16 ep: 16 dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 8192 ] + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 12, 44, 76 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false - conc-list: [ 
5 ] prefill: num-worker: 1 @@ -3862,216 +3655,22 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [ 60 ] + - conc-list: [ 333 ] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" - decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 135 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 563 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: 
- # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 8192 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 11 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 12, 44, 76 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -4339,156 +3938,6 @@ dsr1-fp8-gb200-dynamo-trt: tp: 8 ep: 8 dp-attn: false - # 1k8k MTP configs - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2152] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [564] - prefill: - 
num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [72] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - # 1k8k STP configs - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [36] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false # 8k1k MTP configs - isl: 8192 osl: 1024 @@ -5079,343 +4528,164 @@ dsr1-fp4-gb300-dynamo-trt: - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" - decode: - num-worker: 1 - tp: 32 - 
ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 12, 24, 48] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12, 48, 96, 192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8192] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4301] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [2253] + - spec-decoding: "mtp" + conc-list: [333] 
prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" - conc-list: [7] + conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [63] + conc-list: [8, 12, 24, 48] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + 
num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [563] + conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [2088] + conc-list: [1229] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [16384] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [5] prefill: 
num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [7] + dp-attn: false + - conc-list: [12, 48, 96, 192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [60] + - conc-list: [8192] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [245] - 
prefill: num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 7 tp: 8 ep: 8 - dp-attn: false - - conc-list: [1024] + dp-attn: true + - conc-list: [1229] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [4096] + - conc-list: [4301] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - - conc-list: [8192] + - conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -6184,187 +5454,6 @@ dsr1-fp8-gb300-dynamo-trt: tp: 8 ep: 8 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [16] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [141] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [544] - 
prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2048] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [36] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [282] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [1024] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4096] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - gptoss-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 model: openai/gpt-oss-120b diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 
37e64b8ed..de0a3dcab 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -40,7 +40,7 @@ usage: generate_sweep_configs.py full-sweep [--precision PRECISION [PRECISION ...]] [--framework FRAMEWORK [FRAMEWORK ...]] [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]] - [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]] + [--seq-lens {1k1k,8k1k} [{1k1k,8k1k} ...]] [--step-size STEP_SIZE] [--max-conc MAX_CONC] [--max-tp MAX_TP] @@ -62,9 +62,9 @@ full-sweep --config-files .github/configs/nvidia-master.yaml full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml ``` -**Test all single-node fp8 precision configs for 1k8k workloads:** +**Test all single-node fp8 precision configs for 8k1k workloads:** ``` -full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml +full-sweep --single-node --precision fp8 --seq-lens 8k1k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml ``` **Test all single-node TRT configs on H200 runners:** diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 5d3035a5f..d529b7ccc 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -107,6 +107,8 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -156,6 +158,7 @@ jobs: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch multi-node job script env: @@ -230,26 +233,7 @@ jobs: - name: Verify eval scores if: ${{ inputs.eval-only }} - run: | - python3 << 'PYEOF' - import json, glob, sys - MIN_SCORE = 
0.85 - failed = False - for f in glob.glob("results*.json"): - with open(f) as fh: - data = json.load(fh) - for task, metrics in data.get("results", {}).items(): - for name, val in metrics.items(): - if not name.startswith("exact_match,") or "stderr" in name: - continue - if isinstance(val, (int, float)) and val < MIN_SCORE: - print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr) - failed = True - elif isinstance(val, (int, float)): - print(f"PASS: {task} {name} = {val:.4f}") - if failed: - sys.exit(1) - PYEOF + run: python3 utils/evals/validate_scores.py - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }} diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 16b587657..797505eec 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -54,6 +54,11 @@ on: type: boolean required: true default: false + eval-only: + description: "Run only evals (skip throughput benchmark)" + type: boolean + required: false + default: false random-range-ratio: required: false type: string @@ -83,6 +88,9 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + EVAL_ONLY: ${{ inputs.eval-only }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache permissions: contents: read @@ -91,7 +99,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 300 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}" + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ 
inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Resource cleanup (pre-run) run: &resource-cleanup | @@ -123,13 +131,14 @@ jobs: sleep 5 done fi - fi + fi - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch job script env: @@ -145,28 +154,42 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV bash ./runners/launch_${RUNNER_NAME%%_*}.sh - FOUND_RESULT_FILE= - for i in {1..10}; do - if [ -f "$RESULT_FILENAME.json" ]; then - FOUND_RESULT_FILE=true - break + + if [ "${{ inputs.eval-only }}" = "true" ]; then + echo "Eval-only mode: skipping benchmark result file check" + # Verify eval produced results + if ! ls results*.json 1>/dev/null 2>&1; then + echo "Eval-only run failed: no results*.json files found." >&2 + exit 1 fi - echo "Waiting for result file... (attempt $i)" - sleep 1 - done + # Verify eval scores meet per-benchmark minimum thresholds + python3 utils/evals/validate_scores.py + else + FOUND_RESULT_FILE= + for i in {1..10}; do + if [ -f "$RESULT_FILENAME.json" ]; then + FOUND_RESULT_FILE=true + break + fi + echo "Waiting for result file... (attempt $i)" + sleep 1 + done - if [ -z "$FOUND_RESULT_FILE" ]; then - echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2 - exit 1 + if [ -z "$FOUND_RESULT_FILE" ]; then + echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." 
>&2 + exit 1 + fi fi - name: Process result + if: ${{ !inputs.eval-only }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py - name: Upload result + if: ${{ !inputs.eval-only }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} @@ -176,7 +199,7 @@ jobs: if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: - name: server_logs_${{ env.RESULT_FILENAME }} + name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} path: server.log if-no-files-found: ignore @@ -184,12 +207,12 @@ jobs: if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: - name: gpu_metrics_${{ env.RESULT_FILENAME }} + name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }} path: gpu_metrics.csv if-no-files-found: ignore - name: Upload eval results (if any) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} @@ -197,14 +220,15 @@ jobs: meta_env.json results*.json sample*.jsonl - if-no-files-found: ignore + if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }} - name: Cleanup eval outputs (post-upload) - if: ${{ env.RUN_EVAL == 'true' }} + if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} run: | rm -f meta_env.json || true # Remove any eval results JSONs that were moved into workspace rm -f results*.json || true + rm -f sample*.jsonl || true - name: Resource cleanup (post-run) if: always() diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index 1be4b1b98..b5b474471 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -97,7 +97,7 @@ jobs: The `generate-cli-command` 
input accepts arguments for `generate_sweep_configs.py`. Usage: `generate_sweep_configs.py` `[-h]` `{full-sweep,runner-model-sweep,test-config}` **Subcommand reference:** - - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-len`. This is the primary subcommand for running benchmarks. + - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-lens`. This is the primary subcommand for running benchmarks. - `test-config`: Use this subcommand ONLY when prompted to with 'test-config'. Uses the flags `--config-files` and `--config-keys`, does NOT accept any other arguments. Examples: @@ -119,7 +119,7 @@ jobs: **Specify concurrency and sequence length:** ``` - generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-len 1k1k" + generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-lens 1k1k" ``` **Test specific config keys (MUST USE `--conc`):** @@ -130,7 +130,7 @@ jobs: **IMPORTANT: Keep runs precise and efficient:** - Use `full-sweep` with filter flags to narrow down the benchmark scope - "full-sweep" does NOT mean running everything - When using `full-sweep`, you must use `--min-conc` and `--max-conc` together to specify a single concurrency value. Unless prompted otherwise, use `--min-conc 4 --max-conc 4` - - When using `full-sweep`, you can use `--seq-len` to specify a single sequence length (choices: 1k1k, 1k8k, 8k1k). Unless prompted otherwise, use `--seq-len 1k1k` + - When using `full-sweep`, you can use `--seq-lens` to specify sequence lengths (choices: 1k1k, 8k1k). 
Unless prompted otherwise, use `--seq-lens 1k1k` - Use `test-config` ONLY when given specific config keys to test - Use `--config-files`, `--config-keys`, and `--conc` flags ONLY - Always filter by specific models, frameworks, precision, conc, or config keys when possible @@ -291,4 +291,3 @@ jobs: # Then use $EP in the vllm serve command ``` This ensures the script respects the `ep` setting in the master config YAML's search-space. - diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index af6410e2d..6765113b2 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + eval-config: ${{ steps.get-jobs.outputs.eval-config }} multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} steps: - name: Checkout code (ref) @@ -54,11 +55,13 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") + SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))") MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))") + EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and 
x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "eval-config=$EVALS" >> $GITHUB_OUTPUT echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT test-sweep-multi-node: @@ -167,7 +170,38 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.eval-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: eval / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} collect-results: @@ -179,8 +213,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node-evals, test-sweep-single-node] - if: ${{ always() }} + needs: [test-sweep-evals, test-sweep-multi-node-evals] + if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index d72f54b8f..64e4ea531 100644 --- 
a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -35,6 +35,8 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' RANDOM_RANGE_RATIO: '0.8' PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache jobs: get-jobs: @@ -87,7 +89,7 @@ jobs: - name: Fail if no matching entries if: ${{ steps.filter.outputs.count == '0' }} run: | - echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2 + echo "No entries produced for config-key=${{ inputs.config-key }}, conc=${{ inputs.conc }}." >&2 exit 1 profile: @@ -153,6 +155,7 @@ jobs: with: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch + Profile (single-node sglang/vllm) id: run diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index a73eb5bbc..a87f7ee13 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -107,18 +107,6 @@ jobs: decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} run-eval: false - sweep-multi-node-1k8k: - needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: multi-node 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }} - secrets: inherit - with: *multi-node-inputs - sweep-multi-node-8k1k: needs: setup if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} @@ -160,29 +148,47 @@ jobs: disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - sweep-single-node-1k8k: + sweep-single-node-8k1k: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null' }} + if: ${{ 
toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 1k8k / + name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs - sweep-single-node-8k1k: + sweep-evals: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 8k1k / + name: eval / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }} secrets: inherit - with: *single-node-inputs + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true sweep-multi-node-evals: needs: setup @@ -195,6 +201,7 @@ jobs: config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }} secrets: inherit with: + exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} 
osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} @@ -204,17 +211,14 @@ jobs: model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - prefill-num-worker: ${{ matrix.config.prefill.num-worker }} prefill-tp: ${{ matrix.config.prefill.tp }} prefill-ep: ${{ matrix.config.prefill.ep }} prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} - decode-num-worker: ${{ matrix.config.decode.num-worker }} decode-tp: ${{ matrix.config.decode.tp }} decode-ep: ${{ matrix.config.decode.ep }} @@ -227,10 +231,8 @@ jobs: needs: [ sweep-single-node-1k1k, - sweep-single-node-1k8k, sweep-single-node-8k1k, sweep-multi-node-1k1k, - sweep-multi-node-1k8k, sweep-multi-node-8k1k, setup, ] @@ -241,8 +243,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [sweep-multi-node-evals, setup] - if: ${{ always() && needs.setup.result != 'skipped' }} + needs: [sweep-evals, sweep-multi-node-evals, setup] + if: ${{ always() && needs.setup.result != 'skipped' && (needs.sweep-evals.result != 'skipped' || needs.sweep-multi-node-evals.result != 'skipped') }} uses: ./.github/workflows/collect-evals.yml secrets: inherit @@ -252,10 +254,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Extract and save changelog metadata - env: - CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }} run: | - echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json + cat <<'CONFIGEOF' > _full_config.json + ${{ needs.setup.outputs.search-space-config }} + CONFIGEOF + jq '.changelog_metadata' _full_config.json > changelog_metadata.json + rm -f _full_config.json - name: Upload changelog artifact uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/AGENTS.md b/AGENTS.md index 787978cfc..e64a903cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -76,27 +76,27 @@ python -m pytest matrix_logic/ -v ```bash # Full sweep with all configs python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml + --config-files .github/configs/nvidia-master.yaml # Filter by model prefix (dsr1 or gptoss) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --model dsr1 + --config-files .github/configs/nvidia-master.yaml \ + --model-prefix dsr1 # Filter by framework (sglang, trt, vllm, atom, dynamo-trt, dynamo-sglang) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework sglang # Filter by precision (fp4, fp8) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --precision fp8 # Filter by runner type (b200, h100, h200, gb200, mi300x, mi325x, mi355x) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --runner b200 + --config-files .github/configs/nvidia-master.yaml \ + --runner-type b200 ``` ### Processing Results @@ -141,7 +141,6 @@ When working with benchmark configurations, use these valid values: **Sequence Lengths (ISL/OSL)**: - `1k1k` - 1024 input / 1024 output -- `1k8k` - 1024 input / 8192 output - `8k1k` - 8192 input / 1024 output ## Code Conventions @@ -267,7 +266,7 @@ dsr1-fp8-h200-dynamo-sglang: **7. 
Validate configuration:** ```bash python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework dynamo-sglang ``` @@ -297,19 +296,19 @@ When upgrading Docker images in benchmark scripts and master configs .yaml: ## Evals (Accuracy Validation) -Evals run optional accuracy checks after throughput benchmarks to ensure model outputs aren't degraded by inference optimizations. +Evals run optional accuracy checks to ensure model outputs aren't degraded by inference optimizations. They can run alongside benchmarks or independently in eval-only mode. ### When Evals Run Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval. -**Single-node** eval selection (from PR #911): +**Single-node** eval selection: - All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) - Only on `8k1k` sequence length **Multi-node** eval selection: -- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding) -- Prefers `8k1k`; falls back to `1k8k` (never `1k1k`) +- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) +- Only `8k1k` sequence length This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. 
@@ -330,21 +329,27 @@ The default eval framework is [lm-evaluation-harness](https://github.com/Eleuthe ### Running Evals via CLI ```bash -# Generate configs with evals marked (in addition to all configs) +# Generate configs (evals marked by default on 8k1k subset) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --run-evals + --config-files .github/configs/nvidia-master.yaml + +# Generate throughput-only configs (skip evals) +python utils/matrix_logic/generate_sweep_configs.py full-sweep \ + --config-files .github/configs/nvidia-master.yaml \ + --no-evals # Generate ONLY the eval subset (excludes non-eval configs) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --evals-only ``` ### Eval Integration in Benchmark Scripts -**Single-node** scripts in `benchmarks/single_node/` follow this pattern: +All benchmark scripts in `benchmarks/` follow one of two flows: + ```bash +# Combined mode (benchmark + eval): # 1. Start server (with --context-length expansion if EVAL_ONLY=true) # 2. wait_for_server_ready # 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) @@ -353,6 +358,13 @@ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary # Writes meta_env.json and moves artifacts fi + +# Eval-only mode (EVAL_ONLY=true): +# 1. Compute expanded context via compute_eval_context_length +# 2. Start server with expanded context (--context-length or --max-model-len) +# 3. wait_for_server_ready +# 4. run_benchmark_serving returns immediately (skipped) +# 5. 
run_eval + append_lm_eval_summary ``` **Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`): @@ -374,6 +386,8 @@ fi | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace | | `_install_lm_eval_deps` | Installs lm-eval dependencies | | `_patch_lm_eval` | Patches lm-eval for reasoning tokens and TRT compatibility | +| `compute_eval_context_length` | Computes eval context length (5x benchmark context, capped at model native max) | +| `get_native_max_context_length` | Extracts model's native max context length from HF config | ### Eval Results Collection @@ -413,19 +427,18 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| -| `RUN_EVAL` | `false` | Enable eval after throughput | +| `RUN_EVAL` | `false` | Enable eval after throughput benchmark | | `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | -| `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | -| `NUM_FEWSHOT` | `2` | Number of few-shot examples | +| `EVAL_TASKS_DIR` | `utils/evals/gsm8k.yaml` | Path to lm-eval task YAML | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | -| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by compute_eval_context_length) | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task 1. Create a task YAML in `utils/evals/` (follow lm-eval task format) -2. Set `EVAL_TASK=` when running benchmarks +2. Set `EVAL_TASKS_DIR=utils/evals/<task>.yaml` when running benchmarks 3.
Update `utils/collect_eval_results.py` if new metrics need extraction ### lm-eval Patches diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 326b796dd..b3264cef0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,13 @@ # Shared benchmarking utilities for InferenceMAX +# Keep Python bytecode out of the mounted workspace. Benchmark jobs often run as +# root inside containers, and root-owned cache directories break future checkout +# cleanup on self-hosted runners. +export PYTHONDONTWRITEBYTECODE=1 +export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" +mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -174,6 +181,12 @@ wait_for_server_ready() { # --trust-remote-code: Optional flag to trust remote code from HuggingFace # --server-pid: Optional server process ID to monitor during benchmark run_benchmark_serving() { + # In eval-only mode, skip the throughput benchmark entirely. + if [ "${EVAL_ONLY}" = "true" ]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + return 0 + fi + set +x local model="" local port="" @@ -486,6 +499,10 @@ move_profile_trace_for_relay() { # ------------------------------ _install_lm_eval_deps() { + # torchvision causes circular imports in ATOM; TRT-LLM/SGLang need it at module level. 
+ if [[ "${IMAGE:-}" == *atom* ]]; then + python3 -m pip uninstall -y torchvision 2>/dev/null || true + fi python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then @@ -574,26 +591,74 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } +get_native_max_context_length() { + local model_path="$1" + python3 -c " +from transformers import AutoConfig +config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) +for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break +else: + print(0) +" +} + +# Compute the context length for eval-only mode. +# Uses 5x the benchmark context capped at the model's native max. +# Sets EVAL_MAX_MODEL_LEN (needed by run_lm_eval). +# Echoes the computed value for scripts to capture. +# +# Usage: local ctx=$(compute_eval_context_length "$MODEL" "${current_ctx}") +compute_eval_context_length() { + local model="$1" + local benchmark_ctx="${2:-0}" + local native_max + native_max=$(get_native_max_context_length "$model") + native_max="${native_max:-0}" + + if [ "$benchmark_ctx" -eq 0 ] 2>/dev/null; then + benchmark_ctx="${native_max:-0}" + fi + local eval_ctx=$(( benchmark_ctx * 5 )) + if [ "$native_max" -gt 0 ] 2>/dev/null && [ "$eval_ctx" -gt "$native_max" ]; then + eval_ctx="$native_max" + fi + # If eval_ctx is still 0 (both benchmark_ctx and native_max were 0), fall back + if [ "$eval_ctx" -le 0 ] 2>/dev/null; then + echo "WARN: compute_eval_context_length could not determine context length for $model" >&2 + eval_ctx="${MAX_MODEL_LEN:-16384}" + fi + EVAL_MAX_MODEL_LEN="$eval_ctx" + echo "$eval_ctx" +} + +# Convenience wrapper: compute eval context from ISL/OSL and export EVAL_MAX_MODEL_LEN. +# Call directly (not in a subshell) so the export persists.
+# Scripts then wire $EVAL_MAX_MODEL_LEN into whichever server variable they need. +setup_eval_context() { + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 200))") + export EVAL_MAX_MODEL_LEN +} + run_lm_eval() { local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-2}" + local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=16384 + local eval_context_len="${EVAL_MAX_MODEL_LEN:-16384}" local temperature=0 local top_p=1 - local concurrent_requests=32 + local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}" while [[ $# -gt 0 ]]; do case $1 in --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; + --task) tasks_dir="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --gen-max-tokens) eval_context_len="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -606,16 +671,23 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + # Cap output tokens: must fit within context window (leave room for input), + # and avoid excessive KV cache reservation per request on TRT. + local max_output_tokens=$(( eval_context_len > 4096 ? 
eval_context_len - 4096 : eval_context_len / 2 )) + if [ "$max_output_tokens" -gt 16384 ]; then + max_output_tokens=16384 + fi + echo "Eval budget: eval_context_len=${eval_context_len}, max_output_tokens=${max_output_tokens}" + # Export for append_lm_eval_summary to pick up export EVAL_RESULT_DIR="$results_dir" set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "utils/evals/${task}.yaml" \ - --num_fewshot "${num_fewshot}" \ + --tasks "${tasks_dir}" \ --output_path "${results_dir}" \ --log_samples \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \ - --gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}" + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \ + --gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}" local eval_exit=$? set +x return $eval_exit @@ -623,8 +695,15 @@ run_lm_eval() { append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" + if [ -z "${results_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR is empty; skipping artifact collection" >&2 + return 1 + fi local out_dir="${results_dir}" - mkdir -p "$out_dir" || true + if [ ! 
-d "${out_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR='${out_dir}' does not exist; skipping artifact collection" >&2 + return 1 + fi # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" @@ -672,13 +751,13 @@ META # Move eval artifacts into PWD (no new directories in workspace) if [ -f "${meta_json}" ]; then - mv -f "${meta_json}" ./ || true + mv -f "${meta_json}" ./ || echo "WARN: failed to move ${meta_json}" >&2 fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") if [ "$base" != "meta_env.json" ]; then - mv -f "$jf" ./ || true + mv -f "$jf" ./ || echo "WARN: failed to move ${jf}" >&2 fi done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi @@ -706,8 +785,23 @@ run_eval() { esac done + # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script + if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then + compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null + fi + + local eval_rc=0 case "$framework" in - lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - *) echo "Unknown framework '${framework}'"; return 1 ;; + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? 
;; + *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac + + if [ "$eval_rc" -ne 0 ]; then + echo "ERROR: run_eval failed with exit code $eval_rc" >&2 + if [ "${EVAL_ONLY}" = "true" ]; then + echo "Eval-only mode: failing after artifact collection" >&2 + return "$eval_rc" + fi + fi + return $eval_rc } diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d98fb8e2b..d88941628 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,6 +31,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -40,7 +45,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/dsr1_fp4_b200_trt.sh index 036c2998e..7a9706d30 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt.sh @@ -77,6 +77,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -120,7 +126,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh index 2a0320e53..59e5a3930 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh @@ -76,10 +76,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then elif [[ $CONC == 128 && $DP_ATTENTION == "false" ]]; then PIECEWISE_CUDA_GRAPHS="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC == 64 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi fi if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -101,6 +97,12 @@ fi # end of set of configs using piecewise_cuda_graphs # Start 
GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + set -x # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -134,7 +136,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/dsr1_fp4_mi355x.sh index 58c1118eb..578a6c810 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x.sh @@ -30,6 +30,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 \ --attention-backend aiter \ ---kv-cache-dtype fp8_e4m3 > $SERVER_LOG 2>&1 & +--kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh index af1ab6aa4..1d557684e 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh 
b/benchmarks/single_node/dsr1_fp8_b200.sh index 7b4be6b2b..e6d8a0e9c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -63,6 +63,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -72,7 +77,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -95,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh index b5e499ecc..781869bcc 100755 --- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh @@ -56,6 +56,11 @@ SPECULATIVE_EAGLE_TOPK=1 SGLANG_ENABLE_SPEC_V2=1 +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -85,7 +90,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --speculative-num-steps $SPECULATIVE_NUM_STEPS \ --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \ --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -109,7 +114,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/dsr1_fp8_b200_trt.sh index 8df439973..139aae669 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt.sh @@ -37,14 +37,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then PIECEWISE_CUDA_GRAPHS="true" DELAY_BATCHING="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 256 ]]; then - CUDA_GRAPH_MAX_BATCH_SIZE=$(( $CONC / 8 )) - MOE_BACKEND="DEEPGEMM" - KV_CACHE_FREE_MEM_FRACTION=0.7 - elif [[ $CONC -ge 128 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 64 ]]; then PIECEWISE_CUDA_GRAPHS="true" @@ -100,6 +92,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -146,7 +144,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh index c60388848..79f84f8a1 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh @@ -45,10 +45,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -le 4 ]]; then PIECEWISE_CUDA_GRAPHS="false" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -le 8 ]]; then - PIECEWISE_CUDA_GRAPHS="false" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -le 16 ]]; then PIECEWISE_CUDA_GRAPHS="false" @@ -89,7 +85,15 @@ attention_dp_config: EOF fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # prep PW CUDA config per the documentation if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -104,10 +108,9 @@ if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then cat << EOF >> $EXTRA_CONFIG_FILE torch_compile_config: capture_num_tokens: [${CAPTURE_TOKENS_LIST%, }] - enable_piecewise_cuda_graph: true + enable_piecewise_cuda_graph: true EOF fi - # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -144,7 +147,7 @@ 
run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index fde2cfede..c820d180b 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -15,7 +15,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -pip3 install --user sentencepiece +pip3 install --user --break-system-packages sentencepiece hf download "$MODEL" SERVER_LOG=/workspace/server.log @@ -26,6 +26,12 @@ start_gpu_monitor export TORCH_CUDA_ARCH_LIST="9.0" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ @@ -35,7 +41,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -44,7 +50,7 @@ else --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! 
@@ -66,7 +72,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/dsr1_fp8_h200_trt.sh index 5d98aa75e..383b86065 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt.sh @@ -64,6 +64,12 @@ MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -94,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh index 0ecd48f02..9d0010903 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh @@ -80,6 +80,11 @@ fi MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -116,7 +121,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval 
--framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh b/benchmarks/single_node/dsr1_fp8_mi300x.sh index 41731427e..a5f161960 100644 --- a/benchmarks/single_node/dsr1_fp8_mi300x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi300x.sh @@ -36,6 +36,11 @@ export SGLANG_AITER_MLA_PERSIST=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -50,7 +55,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ ---disable-radix-cache > $SERVER_LOG 2>&1 & +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 6870fe060..ae1e930f0 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -29,6 +29,12 @@ export SGLANG_AITER_MLA_PERSIST=1 # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -41,7 +47,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -> 
$SERVER_LOG 2>&1 & +$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index 1d00957e4..d629437cf 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -27,6 +27,11 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -42,7 +47,7 @@ python3 -m sglang.launch_server \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ - --cuda-graph-max-bs "$CONC" > $SERVER_LOG 2>&1 & + --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh index dfb8fafdc..920efb6ff 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi @@ -80,4 +85,3 @@ stop_gpu_monitor set +x set -x -rm -rf ./utils/bench_serving\ 
diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index 5d09645c8..4ca4a215d 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -30,6 +30,11 @@ PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -49,7 +54,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --disable-radix-cache \ --stream-interval 30 \ ---model-loader-extra-config '{"enable_multithread_load": true}' > $SERVER_LOG 2>&1 & +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 9194bb870..7a985645f 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -22,6 +22,12 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ - > "$SERVER_LOG" 2>&1 & + $EVAL_CONTEXT_ARGS 
> "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -60,7 +66,7 @@ run_benchmark_serving \ # Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name if [ "${RUN_EVAL}" = "true" ]; then export MODEL_NAME=glm-5-fp8 - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index ee11463ce..3d82fd856 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -30,6 +30,11 @@ export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_nvfp4_b200.sh b/benchmarks/single_node/glm5_nvfp4_b200.sh new file mode 100755 index 000000000..182f363ad --- /dev/null +++ b/benchmarks/single_node/glm5_nvfp4_b200.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +# following https://huggingface.co/nvidia/GLM-5-NVFP4#usage recipe +# except using latest nightly at the time of writing +# since the recommended nightly image in that recipe doesn't exist. + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP \ +--data-parallel-size 1 --expert-parallel-size 1 \ +--tool-call-parser glm47 \ +--reasoning-parser glm45 \ +--quantization modelopt_fp4 \ +--cuda-graph-max-bs $CONC --max-running-requests $CONC \ +--mem-fraction-static 0.80 \ +--chunked-prefill-size 131072 \ +--stream-interval 30 \ +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 46fccca6a..f6a6f72e9 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -26,7 +26,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi cat > config.yaml << EOF @@ -77,7 +82,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh index 42fa96a94..c9ba2752c 100644 --- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh @@ -78,6 +78,12 @@ set -x MAX_NUM_TOKENS=20000 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + 
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -109,7 +115,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 314ec43c9..8d0e773a2 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -17,11 +17,18 @@ fi hf download "$MODEL" +MAX_MODEL_LEN=10240 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + cat > config.yaml << EOF no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 -max-model-len: 10240 +max-model-len: $MAX_MODEL_LEN EOF export PYTHONNOUSERSITE=1 @@ -37,8 +44,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--max-num-seqs=$CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -61,7 +67,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 251294a62..2a9359b96 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -29,7 +29,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi # Create config.yaml @@ -50,8 +55,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & + --max-num-seqs $CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -72,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/gptoss_fp4_h200_trt.sh index a96b311d8..41dede14b 100644 --- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_h200_trt.sh @@ -8,6 +8,7 @@ check_env_vars \ CONC \ ISL \ OSL \ + MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ DP_ATTENTION \ @@ -48,10 +49,19 @@ print_iter_log: true stream_interval: 20 EOF +MAX_NUM_TOKENS=20000 + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ ---max_num_tokens 20000 \ +--max_num_tokens $MAX_NUM_TOKENS \ +--max_seq_len=$MAX_MODEL_LEN \ --backend pytorch \ --extra_llm_api_options gptoss-config.yml \ --ep_size=$EP_SIZE \ @@ -82,7 +92,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh index f71aeb090..56a7823cf 100644 --- a/benchmarks/single_node/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh @@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, 
clocks every second) start_gpu_monitor @@ -73,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh index f71aeb090..56a7823cf 100644 --- a/benchmarks/single_node/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh @@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -73,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh index f23949739..37cb358ba 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh @@ -43,6 +43,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -74,7 +78,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval 
--port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh index cf71cbb3b..76bc87c0c 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -70,7 +75,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh index 422a74950..4818f246e 100644 --- a/benchmarks/single_node/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh @@ -26,6 +26,10 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,6 +42,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --reasoning-parser kimi_k2 \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +67,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index a8bd01442..c680529e2 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,6 +31,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. # Disable that features to avoid crashes. @@ -70,6 +75,7 @@ $EP \ --block-size=1 \ --no-enable-prefix-caching \ --trust-remote-code \ +--no-enable-prefix-caching \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -92,7 +98,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/kimik2.5_int4_b200.sh index 6468cc05c..df4c63f6b 100755 --- a/benchmarks/single_node/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/kimik2.5_int4_b200.sh @@ -26,6 +26,10 @@ export VLLM_USE_FLASHINFER_MOE_INT4=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -64,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/kimik2.5_int4_h200.sh index 37281f61e..766fe74a0 100755 --- a/benchmarks/single_node/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/kimik2.5_int4_h200.sh @@ -25,6 +25,11 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + # following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html recipe # Start GPU monitoring (power, temperature, clocks every second) @@ -40,6 +45,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tool-call-parser kimi_k2 \ --compilation_config.pass_config.fuse_allreduce_rms true \ --trust-remote-code \ +--no-enable-prefix-caching \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -64,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_mi300x.sh b/benchmarks/single_node/kimik2.5_int4_mi300x.sh new file mode 100755 index 000000000..a05baddeb --- /dev/null +++ b/benchmarks/single_node/kimik2.5_int4_mi300x.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# following AMD andy luo's recipe +# https://x.com/linluo77/status/2017024513595301985 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +export VLLM_ROCM_USE_AITER=1 +vllm serve $MODEL --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.95 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size=64 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ +--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh index e6b7629ea..a05baddeb 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh +++ b/benchmarks/single_node/kimik2.5_int4_mi325x.sh @@ -28,18 +28,24 @@ PORT=${PORT:-8888} # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor set -x +export VLLM_ROCM_USE_AITER=1 vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ ---disable-log-requests \ --trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/kimik2.5_int4_mi355x.sh index 935c6cd2e..5e40da700 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_int4_mi355x.sh @@ -26,6 +26,10 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -37,6 +41,7 @@ vllm serve $MODEL --port $PORT \ --max-model-len $MAX_MODEL_LEN \ --block-size=64 \ --trust-remote-code \ +--no-enable-prefix-caching \ --max-num-seqs 256 \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & @@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh index 2e5aa4b24..5ea1b8657 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh @@ -33,6 +33,10 @@ else EP=" " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -43,6 +47,7 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh index 90f5bd772..0f024ea9f 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh @@ -26,6 +26,11 @@ export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -42,7 +47,7 @@ $EP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --max-num-seqs 256 \ ---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code \ --compilation-config '{"cudagraph_mode":"PIECEWISE"}' > $SERVER_LOG 2>&1 & @@ -66,7 +71,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh index 4b613d88e..84e73b65c 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh @@ -22,6 +22,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -ge 1 ]; then EP=" --enable-expert-parallel" else @@ -37,7 +42,7 @@ vllm serve $MODEL --port $PORT \ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ 
---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh index 4dfaf6b80..d03f57c9b 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh @@ -28,6 +28,10 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,6 +42,7 @@ vllm serve $MODEL --port $PORT \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ --disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -60,7 +65,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh index e5d404036..aad72ad2f 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -30,16 +31,27 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "$EP_SIZE" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ ---disable-log-requests \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -62,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index cebbf72a0..adfb959cf 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -29,6 +29,11 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -45,6 +50,7 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ +--no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -67,7 +73,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/qwen3.5_bf16_b200.sh index 38785a104..86ce6b66f 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b200.sh @@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC MAX_RUNNING_REQUESTS=128 CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -79,7 +83,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh index ea10647d6..8aca9860a 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -52,7 +58,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh index ea10647d6..8aca9860a 100644 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -52,7 +58,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index f77390707..701695def 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -30,7 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -51,7 +56,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh index 39b020ecc..36e5d579d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b200.sh @@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC MAX_RUNNING_REQUESTS=128 CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi if [[ $TP -eq 8 ]]; then EXTRA_ARGS="--enable-flashinfer-allreduce-fusion" @@ -87,7 +91,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh index 1270c76a6..87933b166 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh @@ -48,6 +48,10 @@ SPECULATIVE_EAGLE_TOPK=1 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -88,7 +92,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git 
a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/qwen3.5_fp8_h200.sh index 2ae26b771..636a8ee92 100644 --- a/benchmarks/single_node/qwen3.5_fp8_h200.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200.sh @@ -23,6 +23,10 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN" +fi echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" @@ -76,7 +80,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh index 0640a20ab..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -32,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -53,7 +59,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh index 0640a20ab..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -32,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -53,7 +59,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index f77390707..701695def 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -20,6 +20,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -30,7 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -51,7 +56,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b85245458..967edc19c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,17 @@ +- config-keys: + - kimik2.5-int4-mi300x-vllm + description: + - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + description: + - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - config-keys: - dsr1-fp8-b200-dynamo-trt - dsr1-fp8-h200-dynamo-trt @@ -992,7 +1006,7 @@ - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - + - config-keys: - glm5-fp8-b200-sglang description: @@ -1067,6 +1081,13 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 + +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - config-keys: - kimik2.5-fp4-mi355x-vllm @@ -1085,3 +1106,117 @@ - "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093" - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 
+ +- config-keys: + - kimik2.5-int4-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER MLA, export VLLM_ROCM_USE_AITER=1, https://github.com/vllm-project/vllm/issues/35641" + - "Triton Fused Moe Tuning https://github.com/vllm-project/vllm/pull/35093" + - "Add --max-num-seqs 256, remove --disable-log-requests" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 + +- config-keys: + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 + +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 + +- config-keys: + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 + +- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp + - glm5-fp8-b200-sglang + - glm5-fp8-h200-sglang + - gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm 
+ - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang + description: + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970 + +- config-keys: + - glm5-nvfp4-b200-sglang + description: + - "Add GLM-5 NVFP4 single-node B200 SGLang benchmark (TP8, conc 4-64)" + - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization" + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 022fd7cb2..f8c614936 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -33,7 +33,7 @@ docker run --rm --init --network host --name $server_name \ -e 
NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 223264914..5100419b9 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 170a1bdc3..9d157a858 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,6 +1,6 @@ #!/usr/bin/bash -export HF_HUB_CACHE_MOUNT="/home/gharunner/gharunners/hf-hub-cache/" +export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 
MODEL_CODE="${EXP_NAME%%_*}" diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 4faf07338..4ebe62c41 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash -export HF_HUB_CACHE_MOUNT="/nvme_home/gharunner/gharunners/hf-hub-cache/" +export HF_HUB_CACHE_MOUNT="/home/gharunner/gharunners/hf-hub-cache/" export PORT=8888 PARTITION="compute" -SQUASH_FILE="/nvme_home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 32331a398..af030720e 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -439,6 +439,76 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path +def _fix_tokenizer_for_sglang(tokenizer, model_path): + """Fix transformers v5 tokenizer to match sglang server-side behavior. + + Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded. + Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer + and decoder from scratch using class-specific components, discarding the + originals from tokenizer.json. For models like DeepSeek-R1 that declare + LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer + architecture, v5 incorrectly replaces the original Sequence pre_tokenizer + with Metaspace, and the original ByteLevel decoder with Sequence. 
+ See: https://github.com/sgl-project/sglang/blob/9238bd08a2895fa3b7ec79ea567e5c27ac951343/python/sglang/srt/utils/hf_transformers_utils.py#L836 + + The sglang server applies fixes for this in hf_transformers_utils.py + (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the + benchmark client loads the tokenizer directly via AutoTokenizer without + these fixes. This mismatch causes the client to encode text differently + from the server -- e.g. a 7000-token prompt on the client becomes ~35000 + tokens on the server, leading to ~5x TTFT inflation and false performance + regressions in benchmarks. + + This function replicates the same fixes so the benchmark client tokenizes + identically to the sglang server. It is a no-op on transformers v4. + """ + import json + from pathlib import Path + + backend = getattr(tokenizer, "_tokenizer", None) + if backend is not None: + try: + from tokenizers import Tokenizer as RawTokenizer + tok_file = Path(model_path) / "tokenizer.json" + if tok_file.is_file(): + raw = RawTokenizer.from_file(str(tok_file)) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + except Exception: + pass + + try: + config_file = Path(model_path) / "tokenizer_config.json" + if config_file.is_file(): + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = 
defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() + except Exception: + pass + + return tokenizer + + def get_tokenizer( pretrained_model_name_or_path: str, tokenizer_mode: str = "auto", @@ -464,11 +534,12 @@ def get_tokenizer( return MistralTokenizer.from_pretrained( str(pretrained_model_name_or_path)) else: - return AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs, ) + return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path) ASYNC_REQUEST_FUNCS = { @@ -481,4 +552,4 @@ def get_tokenizer( "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, -} \ No newline at end of file +} diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index c3dddfcc6..e32d6d988 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -6,7 +6,7 @@ Quick graded QnA which measures model performance. Examples of test suites: - **gpqa**: Graduate level, Google-Proof multiple choice questions ## When? -At highest concurrency for highest TP and lowest TP, per GPU per model only for 1k8k. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py` +At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` ## Why? To verify how model outputs are affected by throughput optimizations. 
@@ -15,7 +15,7 @@ To verify how model outputs are affected by throughput optimizations. - If there was a tradeoff in accuracy for performance ## How? -- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`. ## Misc Following files are task definitions from lmeval, more info on changes within the files diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml index fb0f0a829..e748119cd 100644 --- a/utils/evals/gsm8k.yaml +++ b/utils/evals/gsm8k.yaml @@ -9,7 +9,7 @@ output_type: generate_until training_split: train fewshot_split: train test_split: test -doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_text: "Question: {{question}}\nEnd your response with the answer on the last line, formatted as: #### [number]\nAnswer:" doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" metric_list: - metric: exact_match diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json new file mode 100644 index 000000000..8ea0b71c0 --- /dev/null +++ b/utils/evals/thresholds.json @@ -0,0 +1,4 @@ +{ + "gsm8k": 0.85, + "gpqa_diamond_cot_n_shot": 0.30 +} diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py new file mode 100644 index 000000000..85433ec4b --- /dev/null +++ b/utils/evals/validate_scores.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Validate eval 
scores against minimum thresholds. + +Reads lm-eval results JSON files and checks that scored metrics meet the +required minimum. Thresholds are configured per-task in a JSON config file +(default: utils/evals/thresholds.json). + +Usage: + python3 utils/evals/validate_scores.py + python3 utils/evals/validate_scores.py --thresholds my_thresholds.json + python3 utils/evals/validate_scores.py --min-score 0.90 # flat threshold, no config +""" +import argparse +import glob +import json +import sys +from pathlib import Path + + +def load_thresholds(path: str) -> dict[str, float]: + """Load thresholds config. Returns {task_name: min_score}.""" + with open(path) as f: + return json.load(f) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate eval scores") + parser.add_argument( + "--min-score", type=float, default=0.85, + help="Fallback minimum score when no threshold config matches (default: 0.85)", + ) + parser.add_argument( + "--thresholds", default=None, + help="Path to thresholds JSON config (default: utils/evals/thresholds.json)", + ) + parser.add_argument( + "--metric-prefix", default="exact_match,", + help="Only check metrics whose name starts with this prefix (default: 'exact_match,')", + ) + parser.add_argument( + "--results-glob", default="results*.json", + help="Glob pattern for result files (default: 'results*.json')", + ) + args = parser.parse_args() + + # Load thresholds config + thresholds = {} + thresholds_path = args.thresholds + if thresholds_path is None: + default_path = Path(__file__).parent / "thresholds.json" + if default_path.exists(): + thresholds_path = str(default_path) + if thresholds_path: + try: + thresholds = load_thresholds(thresholds_path) + print(f"Loaded thresholds from {thresholds_path}") + except (json.JSONDecodeError, OSError) as e: + print(f"WARN: could not load thresholds from {thresholds_path}: {e}", file=sys.stderr) + + failed = False + checked = 0 + + for f in sorted(glob.glob(args.results_glob)): + 
with open(f) as fh: + data = json.load(fh) + for task, metrics in data.get("results", {}).items(): + min_score = thresholds.get(task, args.min_score) + for name, val in metrics.items(): + if not name.startswith(args.metric_prefix) or "stderr" in name: + continue + if not isinstance(val, (int, float)): + continue + checked += 1 + if val < min_score: + print( + f"FAIL: {task} {name} = {val:.4f} (< {min_score})", + file=sys.stderr, + ) + failed = True + else: + print(f"PASS: {task} {name} = {val:.4f} (>= {min_score})") + + if checked == 0: + print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr) + + return 1 if (failed or checked == 0) else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 850fecd6a..5d61dffa5 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -17,7 +17,6 @@ seq_len_stoi = { "1k1k": (1024, 1024), - "1k8k": (1024, 8192), "8k1k": (8192, 1024) } @@ -35,31 +34,22 @@ def seq_len_to_str(isl: int, osl: int) -> str: def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: """Eval selection policy: - - Only consider 1k8k (isl=1024, osl=8192). - - Single-node: for each unique (model, runner, framework, precision, isl, osl, - spec-decoding, dp-attn), mark highest TP with highest conc and lowest TP - with highest conc. + - Single-node: only consider 8k1k (isl=8192, osl=1024). + For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn): + - Mark all entries at the highest CONC (all TPs) + - Mark all entries at the median CONC (all TPs) - Multi-node: for each unique (model, runner, framework, precision, - spec-decoding, prefill-dp-attn, decode-dp-attn), prefer 8k1k entries; - fall back to 1k8k if unavailable (never 1k1k). Mark the entry with the - highest max concurrency. 
- - Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated - independently. + spec-decoding, prefill-dp-attn, decode-dp-attn), only 8k1k entries. + Mark the entry with the highest max concurrency. """ from collections import defaultdict - # Only run evals on 1k8k - target_isl, target_osl = seq_len_stoi["1k8k"] - - # --- Single-node eval selection --- - # Group entries by (model, runner, framework, precision, isl, osl) + # Only run evals on 8k1k + target_isl, target_osl = seq_len_stoi["8k1k"] + # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). # Only include entries that have a top-level TP (i.e., single-node schema). - # This avoids relying on structural hints like prefill/decode which may be - # reused by future single-node disaggregated modes. groups = defaultdict(list) for i, entry in enumerate(matrix_values): - # Skip entries without a top-level TP field if Fields.TP.value not in entry: continue @@ -78,40 +68,24 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: ) groups[key].append((i, entry)) - # For each group, find highest TP/highest conc and lowest TP/highest conc + # For each group, select entries at highest CONC and median CONC (all TPs) eval_indices = set() for key, entries in groups.items(): if not entries: continue - # Find min and max TP values - min_tp = min(e[Fields.TP.value] for _, e in entries) - max_tp = max(e[Fields.TP.value] for _, e in entries) - - # Find highest conc for highest TP - highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp] - if highest_tp_entries: - max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries) - for i, e in highest_tp_entries: - if e[Fields.CONC.value] == max_conc_highest_tp: - eval_indices.add(i) - - # Find highest conc for lowest TP (only if different from max_tp) - if min_tp != max_tp: - lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp] - if 
lowest_tp_entries: - max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries) - for i, e in lowest_tp_entries: - if e[Fields.CONC.value] == max_conc_lowest_tp: - eval_indices.add(i) + conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries)) + median_conc = conc_values[len(conc_values) // 2] + target_concs = {conc_values[-1], median_conc} + + for i, e in entries: + if e[Fields.CONC.value] in target_concs: + eval_indices.add(i) # --- Multi-node eval selection --- # For multi-node (disaggregated) entries, pick one representative per group. - # Prefer 8k1k; fall back to 1k8k if unavailable (never 1k1k). + # Only 8k1k entries are eligible (never 1k1k). # Within a group, pick the entry with the highest max concurrency. - # Multi-node: prefer 8k1k, fallback to 1k8k - mn_target_isl, mn_target_osl = seq_len_stoi["8k1k"] - fallback_isl, fallback_osl = seq_len_stoi["1k8k"] mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: @@ -136,14 +110,10 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: if not entries: continue - # Prefer 8k1k entries; fall back to 1k8k + # Only 8k1k entries are eligible for eval preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == mn_target_isl - and e.get(Fields.OSL.value) == mn_target_osl] - if not preferred: - preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == fallback_isl - and e.get(Fields.OSL.value) == fallback_osl] + if e.get(Fields.ISL.value) == target_isl + and e.get(Fields.OSL.value) == target_osl] if not preferred: continue @@ -806,9 +776,9 @@ def main(): ) eval_group = parent_parser.add_mutually_exclusive_group() eval_group.add_argument( - '--run-evals', + '--no-evals', action='store_true', - help='When specified, run evals on a subset of configs (in addition to all configs).' + help='When specified, skip evals (throughput benchmarks only).' 
) eval_group.add_argument( '--evals-only', @@ -1020,10 +990,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Handle eval options (mutually exclusive) - if args.run_evals or args.evals_only: + # Handle eval options (mutually exclusive: --no-evals or --evals-only) + if not args.no_evals: matrix_values = mark_eval_entries(matrix_values) - # IF --evals-only is specified, filter to only eval entries if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 84ecddd3d..1fecdd487 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -158,13 +158,11 @@ class TestSeqLenMappings: def test_seq_len_stoi_values(self): """Verify seq_len_stoi has expected mappings.""" assert seq_len_stoi["1k1k"] == (1024, 1024) - assert seq_len_stoi["1k8k"] == (1024, 8192) assert seq_len_stoi["8k1k"] == (8192, 1024) def test_seq_len_itos_reverse_mapping(self): """Verify seq_len_itos is reverse of stoi.""" assert seq_len_itos[(1024, 1024)] == "1k1k" - assert seq_len_itos[(1024, 8192)] == "1k8k" assert seq_len_itos[(8192, 1024)] == "8k1k" @@ -174,7 +172,6 @@ class TestSeqLenToStr: def test_known_sequence_lengths(self): """Known sequence lengths should return short name.""" assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" assert seq_len_to_str(8192, 1024) == "8k1k" def test_unknown_sequence_lengths(self): diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 6b4c7878c..9d231ad3c 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -90,60 +90,80 @@ def main(): }, } - all_results = [] - # Deduplicate repeated configs, if for some reason a config key appears multiple times - # in one commit, we don't want to run that config two times (there will just be twice as 
many - # data points for that config, which is not useful) - all_configs_to_run = set() + all_benchmark_results = [] + all_eval_results = [] + # Deduplicate repeated configs separately for benchmarks and evals. + # An evals-only entry should not prevent a later regular entry from + # generating benchmarks for the same config, and vice versa. + benchmark_configs_seen = set() + eval_configs_seen = set() for entry_data in changelog_data: entry = ChangelogEntry.model_validate(entry_data) - configs_to_run = get_config_keys_from_master( + all_configs = get_config_keys_from_master( entry.config_keys, load_config_files(MASTER_CONFIGS) ) - # Skip configs already processed - configs_to_run = [c for c in configs_to_run if c not in all_configs_to_run] - if not configs_to_run: - continue - all_configs_to_run.update(configs_to_run) - - # Use --evals-only if specified in changelog entry, otherwise --run-evals - eval_flag = "--evals-only" if entry.evals_only else "--run-evals" - - try: - result = subprocess.run( - [ + if not entry.evals_only: + # Generate benchmark entries (no evals) + benchmark_configs = [c for c in all_configs if c not in benchmark_configs_seen] + if benchmark_configs: + benchmark_configs_seen.update(benchmark_configs) + base_cmd = [ "python3", GENERATE_SWEEPS_PY_SCRIPT, "test-config", "--config-keys", - *configs_to_run, + *benchmark_configs, "--config-files", *MASTER_CONFIGS, - eval_flag - ], - capture_output=True, - text=True, - check=True, - ) - except subprocess.CalledProcessError as e: - print(e.stderr) - raise - - all_results.extend(json.loads(result.stdout)) + "--no-evals", + ] + try: + result = subprocess.run( + base_cmd, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + print(e.stderr) + raise + all_benchmark_results.extend(json.loads(result.stdout)) + + # Generate eval entries separately + eval_configs = [c for c in all_configs if c not in eval_configs_seen] + if eval_configs: + 
eval_configs_seen.update(eval_configs) + base_cmd = [ + "python3", + GENERATE_SWEEPS_PY_SCRIPT, + "test-config", + "--config-keys", + *eval_configs, + "--config-files", + *MASTER_CONFIGS, + "--evals-only", + ] + try: + eval_result = subprocess.run( + base_cmd, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + print(e.stderr) + raise + all_eval_results.extend(json.loads(eval_result.stdout)) - all_eval_results = [] - for result in all_results: + for result in all_benchmark_results: seq_len_str = seq_len_to_str(result["isl"], result["osl"]) if "prefill" in result and result["prefill"] is not None: final_results["multi_node"][seq_len_str].append(result) else: final_results["single_node"][seq_len_str].append(result) - if result.get("run-eval"): - all_eval_results.append(result) - final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None] final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None] From 8d26331c14abf9290ba9d54689fb70c3e1cec42c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 21:56:56 -0700 Subject: [PATCH 11/22] update multinode to singlenode --- .../workflows/benchmark-multinode-tmpl.yml | 1 + runners/launch_gb200-nv.sh | 90 ++++++++++--------- runners/launch_gb300-nv.sh | 90 ++++++++++--------- utils/matrix_logic/generate_sweep_configs.py | 64 +++++-------- 4 files changed, 117 insertions(+), 128 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index d529b7ccc..79799df2b 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -240,6 +240,7 @@ jobs: run: | rm -f meta_env.json || true rm -f results*.json || true + rm -f sample*.jsonl || true - name: Upload logs if: always() diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 
e1ecc76a0..075bf6500 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -228,57 +228,61 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" -else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 + fi - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + echo "Found logs directory: $LOGS_DIR" - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + 
concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done done - done -fi + fi -echo "All result files processed" + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi # Collect eval results if eval was requested -if [[ "${RUN_EVAL:-false}" == "true" ]]; then +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 079d5169e..f3a360b65 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -155,57 +155,61 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." -if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" -tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" -else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 + fi - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + echo "Found logs directory: $LOGS_DIR" - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done done - done -fi + fi -echo "All result files processed" + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi # Collect eval results if eval was requested -if [[ "${RUN_EVAL:-false}" == "true" 
]]; then +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 5d61dffa5..4487d07c1 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -44,18 +44,21 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: """ from collections import defaultdict - # Only run evals on 8k1k target_isl, target_osl = seq_len_stoi["8k1k"] - # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). - # Only include entries that have a top-level TP (i.e., single-node schema). - groups = defaultdict(list) + eval_indices = set() + + def _max_conc(ie): + c = ie[1][Fields.CONC.value] + return max(c) if isinstance(c, list) else c + + # Single-node: group by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn). + # Only 8k1k entries with a top-level TP (single-node schema). 
+ sn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value not in entry: continue - if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: continue - key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], @@ -64,65 +67,42 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: entry[Fields.ISL.value], entry[Fields.OSL.value], entry[Fields.SPEC_DECODING.value], - entry[Fields.DP_ATTN.value] + entry[Fields.DP_ATTN.value], ) - groups[key].append((i, entry)) - - # For each group, select entries at highest CONC and median CONC (all TPs) - eval_indices = set() - for key, entries in groups.items(): - if not entries: - continue + sn_groups[key].append((i, entry)) + for entries in sn_groups.values(): conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries)) median_conc = conc_values[len(conc_values) // 2] target_concs = {conc_values[-1], median_conc} - for i, e in entries: if e[Fields.CONC.value] in target_concs: eval_indices.add(i) - # --- Multi-node eval selection --- - # For multi-node (disaggregated) entries, pick one representative per group. - # Only 8k1k entries are eligible (never 1k1k). - # Within a group, pick the entry with the highest max concurrency. + # Multi-node: group by (model, runner, framework, precision, spec-decoding, prefill-dp, decode-dp). + # Only 8k1k entries with a prefill key (multi-node schema). + # Pick the entry with the highest max concurrency per group. 
mn_groups = defaultdict(list) for i, entry in enumerate(matrix_values): if Fields.TP.value in entry: - continue # single-node, already handled + continue if Fields.PREFILL.value not in entry: continue - - prefill_dp = entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value) - decode_dp = entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value) + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue key = ( entry[Fields.MODEL.value], entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], entry[Fields.SPEC_DECODING.value], - prefill_dp, - decode_dp, + entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value), + entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value), ) mn_groups[key].append((i, entry)) - for key, entries in mn_groups.items(): - if not entries: - continue - - # Only 8k1k entries are eligible for eval - preferred = [(i, e) for i, e in entries - if e.get(Fields.ISL.value) == target_isl - and e.get(Fields.OSL.value) == target_osl] - if not preferred: - continue - - # Pick entry with highest max concurrency - def _max_conc(ie): - c = ie[1][Fields.CONC.value] - return max(c) if isinstance(c, list) else c - best = max(preferred, key=_max_conc) - eval_indices.add(best[0]) + for entries in mn_groups.values(): + eval_indices.add(max(entries, key=_max_conc)[0]) # Mark the selected entries for i, entry in enumerate(matrix_values): From 0b271879e08a0284c90dc5f88d7cbafff53cb40d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 30 Mar 2026 22:03:39 -0700 Subject: [PATCH 12/22] hanging rm rf --- .github/workflows/benchmark-multinode-tmpl.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 79799df2b..29f0bff98 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -150,9 +150,6 @@ jobs: fi fi - - 
name: Clean up root-owned files from previous runs - run: sudo rm -rf benchmark_logs benchmark_artifacts 2>/dev/null || true - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} From 056a4156098b4c8ff11b47e68ff72ccb1484db1d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 06:54:57 -0700 Subject: [PATCH 13/22] debug --- runners/launch_gb200-nv.sh | 14 ++++++++------ runners/launch_gb300-nv.sh | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 075bf6500..22bf58665 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -228,17 +228,19 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" exit 1 fi - echo "Found logs directory: $LOGS_DIR" - - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index f3a360b65..2f56c3633 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -155,17 +155,19 @@ set -x echo "Job $JOB_ID completed!" echo "Collecting results..." +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
+else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" exit 1 fi - echo "Found logs directory: $LOGS_DIR" - - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) From 61f7d9babb5916df9dd8a5a62d80864a3eb0c353 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 07:20:20 -0700 Subject: [PATCH 14/22] update conc req --- benchmarks/multi_node/amd_utils/server.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9271c4382..2adbcd8df 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -504,13 +504,13 @@ if [ "$NODE_RANK" -eq 0 ]; then source /workspace/benchmarks/benchmark_lib.sh # Use max concurrency from benchmark config (conc values are x-separated) - EVAL_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 --concurrent-requests $EVAL_CONC" + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" else # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 --concurrent-requests "$EVAL_CONC" + run_eval --framework lm-eval --port 30000 # Set metadata env vars for append_lm_eval_summary export TP="${PREFILL_TP_SIZE}" From ffdd49b89f0235b84069c4a296670ecf159ca777 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 21:08:34 -0700 Subject: [PATCH 15/22] documentation --- 
.github/workflows/e2e-tests.yml | 2 +- utils/evals/EVALS.md | 43 +++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 6765113b2..eb1a97713 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -206,7 +206,7 @@ jobs: collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() }} + if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped') }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index e32d6d988..f729d5f24 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -6,21 +6,54 @@ Quick graded QnA which measures model performance. Examples of test suites: - **gpqa**: Graduate level, Google-Proof multiple choice questions ## When? -At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` +Evals run as **separate workflow jobs** from throughput benchmarks. The selection logic is in `mark_eval_entries()` of `utils/matrix_logic/generate_sweep_configs.py`. + +**Single-node**: At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. + +**Multi-node**: One entry per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) with the highest max concurrency, only for 8k1k. ## Why? -To verify how model outputs are affected by throughput optimizations. 
+To verify how model outputs are affected by throughput optimizations. - TP/Conc might affect model outputs - Check kernel implementations for correctness - If there was a tradeoff in accuracy for performance ## How? -- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +`run_eval` in `benchmarks/benchmark_lib.sh` runs EleutherAI/lm-evaluation-harness against the server's OpenAI-compatible endpoint. Concurrency is set via `EVAL_CONCURRENT_REQUESTS` env var (not a CLI flag). Results are collected by `utils/collect_eval_results.py` and published as a summary table. + +### Single-node +In eval-only mode (`EVAL_ONLY=true`), the benchmark script starts the server with expanded context length (via `compute_eval_context_length`), skips throughput, and runs lm-eval directly. Each framework handles the context expansion differently (`--context-length` for SGLang, `--max_seq_len` for TRT-LLM). 
+ +### Multi-node +Multi-node evals support three hardware paths: + +**MI355X (AMD)** — `benchmarks/multi_node/amd_utils/server.sh` +- Skips `bench.sh` when `EVAL_ONLY=true` +- Runs lm-eval via `run_eval` against the router on port 30000 +- Concurrency derived from max of `BENCH_MAX_CONCURRENCY` (x-separated values) +- Eval artifacts copied to `/run_logs/slurm_job-*/eval_results/` +- `runners/launch_mi355x-amds.sh` skips benchmark result collection when `EVAL_ONLY=true` and uses `find` to locate eval results + +**GB200/GB300 (NVIDIA)** — via [srt-slurm fork](https://github.com/Oseltamivir/srt-slurm) (`sa-submission-q1-2026` branch) +- `do_sweep.py` skips the benchmark stage when `EVAL_ONLY=true`, runs `_run_post_eval()` directly +- In eval-only mode, uses the full `wait_for_model()` health check (same as benchmark stage) since the benchmark health check was skipped +- `lm-eval` benchmark runner (`benchmarks/lm_eval.py`) sources InferenceX's `benchmark_lib.sh` from the mounted workspace (`/infmax-workspace`) +- Eval artifacts written to `/logs/eval_results/` inside the container, collected by launch scripts +- `runners/launch_gb200-nv.sh` and `launch_gb300-nv.sh` always collect server logs (for debugging) but skip benchmark result collection when `EVAL_ONLY=true` +- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/DP_ATTN`, `DECODE_TP/EP/DP_ATTN`, `MODEL_NAME`, `EVAL_CONC` + +### Workflow structure +- `e2e-tests.yml`: `test-sweep-evals` (single-node) and `test-sweep-multi-node-evals` (multi-node) +- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node) +- Both use their respective benchmark templates with `eval-only: true`, `run-eval: true` +- `collect-evals` depends on both eval jobs; `collect-results` only runs when benchmark jobs ran +- `process_changelog.py` splits eval results into `evals` (single-node) and 
`multinode_evals` + +### Score validation +`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails. ## Misc Following files are task definitions from lmeval, more info on changes within the files - `utils/evals/gsm8k.yaml` - `utils/evals/gpqa_diamond.yaml` - - From 7639f3da40cfe4caea3dc89cfdaf01bad0e5c51f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 31 Mar 2026 22:18:50 -0700 Subject: [PATCH 16/22] median instead of max --- .github/workflows/benchmark-multinode-tmpl.yml | 6 ++++++ .github/workflows/e2e-tests.yml | 1 + .github/workflows/run-sweep.yml | 1 + benchmarks/multi_node/amd_utils/server.sh | 8 ++++++-- utils/matrix_logic/generate_sweep_configs.py | 13 +++++++++++-- utils/matrix_logic/validation.py | 2 ++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 29f0bff98..ea086beb7 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,11 @@ on: type: boolean required: false default: false + eval-conc: + description: "Concurrency to use for eval requests (overrides default max-of-conc-list)" + type: string + required: false + default: "" ref: description: "Git ref (branch/sha) to checkout" required: false @@ -107,6 +112,7 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + EVAL_CONC: ${{ inputs.eval-conc }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index eb1a97713..487a4a0c3 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -141,6 +141,7 @@ jobs: decode-additional-settings: ${{ 
toJson(matrix.config.decode.additional-settings) }} run-eval: true eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} ref: ${{ inputs.ref }} test-sweep-single-node: diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index a87f7ee13..e3eaf1c3b 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -226,6 +226,7 @@ jobs: decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} run-eval: true eval-only: true + eval-conc: ${{ matrix.config.eval-conc }} collect-results: needs: diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 2adbcd8df..2d001fd53 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -503,8 +503,12 @@ if [ "$NODE_RANK" -eq 0 ]; then # Source eval functions from benchmark_lib.sh source /workspace/benchmarks/benchmark_lib.sh - # Use max concurrency from benchmark config (conc values are x-separated) - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 4487d07c1..9682c1423 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -40,12 +40,14 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: - Mark all entries at the median CONC (all TPs) - Multi-node: for each unique (model, runner, framework, precision, spec-decoding, prefill-dp-attn, 
decode-dp-attn), only 8k1k entries. - Mark the entry with the highest max concurrency. + Mark the entry with the highest max concurrency. Sets eval-conc to the + median of the conc list to avoid OOM during eval. """ from collections import defaultdict target_isl, target_osl = seq_len_stoi["8k1k"] eval_indices = set() + mn_eval_conc = {} # index -> chosen eval concurrency for multinode entries def _max_conc(ie): c = ie[1][Fields.CONC.value] @@ -102,11 +104,18 @@ def _max_conc(ie): mn_groups[key].append((i, entry)) for entries in mn_groups.values(): - eval_indices.add(max(entries, key=_max_conc)[0]) + best_idx, best_entry = max(entries, key=_max_conc) + eval_indices.add(best_idx) + # Set eval-conc to median of the conc list to avoid OOM during eval + conc = best_entry[Fields.CONC.value] + sorted_conc = sorted(conc) if isinstance(conc, list) else [conc] + mn_eval_conc[best_idx] = sorted_conc[len(sorted_conc) // 2] # Mark the selected entries for i, entry in enumerate(matrix_values): entry[Fields.RUN_EVAL.value] = i in eval_indices + if i in mn_eval_conc: + entry[Fields.EVAL_CONC.value] = mn_eval_conc[i] return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 2e8626abe..62a92c5ed 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -53,6 +53,7 @@ class Fields(Enum): # Eval RUN_EVAL = 'run-eval' + EVAL_CONC = 'eval-conc' """ @@ -126,6 +127,7 @@ class MultiNodeMatrixEntry(BaseModel): exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool run_eval: bool = Field(alias=Fields.RUN_EVAL.value) + eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: From 4ffd505ee2447fa8f87f50e4a50b45c0e1ab8427 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 06:47:39 -0700 Subject: [PATCH 17/22] config file guard --- runners/launch_b200-dgxc-slurm.sh | 7 +++++++ runners/launch_b300-nv.sh | 7 
+++++++ runners/launch_gb200-nv.sh | 8 ++++++++ runners/launch_gb300-nv.sh | 6 ++++++ runners/launch_h100-dgxc-slurm.sh | 7 +++++++ runners/launch_h200-dgxc-slurm.sh | 7 +++++++ 6 files changed, 42 insertions(+) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..282f597f4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -99,6 +99,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 68da9f2b7..b9d7612ef 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -100,6 +100,13 @@ echo "Running make setup..." make setup ARCH=x86_64 echo "Submitting job with srtctl..." + +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 22bf58665..d84c0ac13 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -107,6 +107,14 @@ PY fi +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" if [ -d "$SRT_REPO_DIR" ]; then diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 2f56c3633..91147d90d 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -102,6 +102,12 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..0cc03ae27 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -113,6 +113,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" sed -i "/^name:.*/a sbatch_directives:\n exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE" diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..f23f1f138 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -113,6 +113,13 @@ EOF make setup ARCH=x86_64 echo "Submitting job with srtctl..." + + if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." 
>&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 + fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) From 0d0e1e857cd152fdae665011df5159f5a6b45ddb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 07:21:13 -0700 Subject: [PATCH 18/22] h100/h200/b200/b300 evals --- runners/launch_b200-dgxc-slurm.sh | 88 +++++++++++++++++++------------ runners/launch_b300-nv.sh | 88 +++++++++++++++++++------------ runners/launch_h100-dgxc-slurm.sh | 88 +++++++++++++++++++------------ runners/launch_h200-dgxc-slurm.sh | 88 +++++++++++++++++++------------ 4 files changed, 220 insertions(+), 132 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 282f597f4..0bd734282 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -65,6 +65,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -98,6 +99,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -169,45 +173,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract 
configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." 
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index b9d7612ef..0b40a2a00 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -64,6 +64,7 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" +export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -99,6 +100,9 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=x86_64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -170,45 +174,63 @@ echo "Found logs directory: $LOGS_DIR" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . -# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed 
-n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi - echo "Copied 
result file to: $WORKSPACE_RESULT_FILE" - fi +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi -echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 0cc03ae27..29a7e1340 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -75,6 +75,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,6 +113,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -184,45 +188,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
- # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename 
+ # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." 
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index f23f1f138..dffef2d28 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -74,6 +74,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" + export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -112,6 +113,9 @@ EOF echo "Running make setup..." make setup ARCH=x86_64 + # Export eval-related env vars for srt-slurm post-benchmark eval + export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then @@ -183,45 +187,63 @@ EOF cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 
's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "All result files processed" + else + echo "EVAL_ONLY=true: Skipping benchmark result collection" + fi - echo 
"Copied result file to: $WORKSPACE_RESULT_FILE" - fi + # Collect eval results if eval was requested + if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] && cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" done - done + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi - echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." From bf615b962ba460aea7554fafd6e1627f8880873e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 13:23:02 -0700 Subject: [PATCH 19/22] Update repo --- runners/launch_b200-dgxc-slurm.sh | 2 +- runners/launch_b300-nv.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0bd734282..fb9bb7b22 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -36,7 +36,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 0b40a2a00..e4100de94 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -35,7 +35,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" 
cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q1-2026 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 29a7e1340..91cea74f3 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -41,7 +41,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index dffef2d28..3e7032314 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -40,7 +40,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q1-2026 From 28a75a2ee91dde844b23bfe02b4d2bc504da34c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 16:26:47 -0700 Subject: [PATCH 20/22] models_name --- runners/launch_b300-nv.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e4100de94..c718dcad0 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -16,11 +16,11 @@ fi # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared B300 cluster. 
if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2" + export MODEL_PATH="/data/models/dsr1-fp4" export SERVED_MODEL_NAME="deepseek-r1-fp4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528" + export MODEL_PATH="/data/models/dsr1-fp8" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else From 98a45e9877caf3a675467a3cdcde11fe8e32b579 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 1 Apr 2026 17:24:26 -0700 Subject: [PATCH 21/22] model config --- benchmarks/benchmark_lib.sh | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b3264cef0..d21f827ae 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -593,14 +593,23 @@ PY get_native_max_context_length() { local model_path="$1" + # Prefer MODEL_PATH (local model directory) when available, since the + # argument may be a served-model name that is neither a valid HF repo + # ID nor a local path (e.g. "deepseek-r1-fp4" on the B300 cluster). 
+ if [ -n "${MODEL_PATH:-}" ] && [ -d "${MODEL_PATH}" ]; then + model_path="${MODEL_PATH}" + fi python3 -c " -from transformers import AutoConfig -config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) -for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: - if hasattr(config, attr): - print(getattr(config, attr)) - break -else: +try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) + for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break + else: + print(0) +except Exception: print(0) " } From de5497449df975f2646c26e98ae391f3d63e321f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 2 Apr 2026 13:25:10 -0700 Subject: [PATCH 22/22] summary table --- .../workflows/benchmark-multinode-tmpl.yml | 9 +- .github/workflows/benchmark-tmpl.yml | 7 + benchmarks/benchmark_lib.sh | 35 +++- benchmarks/multi_node/amd_utils/job.slurm | 2 + benchmarks/multi_node/amd_utils/server.sh | 14 +- benchmarks/multi_node/amd_utils/submit.sh | 2 + utils/collect_eval_results.py | 183 ++++++++++++++---- 7 files changed, 212 insertions(+), 40 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index ea086beb7..4da79d5cd 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -163,6 +163,13 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} @@ -239,7 +246,7 @@ jobs: run: python3 utils/evals/validate_scores.py - name: Cleanup eval outputs (post-upload) - if: ${{ 
env.RUN_EVAL == 'true' || inputs.eval-only }} + if: ${{ always() && (inputs.run-eval || inputs.eval-only) }} run: | rm -f meta_env.json || true rm -f results*.json || true diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 797505eec..25bec61ee 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -140,6 +140,13 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d21f827ae..403484998 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -717,8 +717,32 @@ append_lm_eval_summary() { # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" local model_name="${MODEL_NAME:-$MODEL}" + local is_multinode_json="false" + if [ "${IS_MULTINODE:-false}" = "true" ]; then + is_multinode_json="true" + fi + + local prefill_tp="${PREFILL_TP:-${TP:-1}}" + local prefill_ep="${PREFILL_EP:-${EP_SIZE:-1}}" + local prefill_num_workers="${PREFILL_NUM_WORKERS:-1}" + local decode_tp="${DECODE_TP:-${TP:-1}}" + local decode_ep="${DECODE_EP:-${EP_SIZE:-1}}" + local decode_num_workers="${DECODE_NUM_WORKERS:-1}" + local dp_json="false" - if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + if [ "${DP_ATTENTION:-false}" = "true" ]; then dp_json="true"; fi + local prefill_dp_json="$dp_json" + if [ "${PREFILL_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + prefill_dp_json="true" + else + prefill_dp_json="false" + fi + local decode_dp_json="$dp_json" + if [ "${DECODE_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then + decode_dp_json="true" + else + decode_dp_json="false" + fi # Derive 
framework/precision from env, fallback to parsing RESULT_FILENAME # RESULT_FILENAME format (from workflow): @@ -743,6 +767,7 @@ append_lm_eval_summary() { fi cat > "${meta_json}" < str: return '' +def as_int(x: Any, default: int = 0) -> int: + """Convert a metadata field to int with a fallback.""" + try: + return int(x) + except Exception: + return default + + +def as_bool(x: Any, default: bool = False) -> bool: + """Parse a metadata boolean stored as bool/string/int.""" + if isinstance(x, bool): + return x + if x is None: + return default + return str(x).lower() == 'true' + + def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: """Build a result row from metadata and extracted metrics.""" + is_multinode = as_bool(meta.get('is_multinode'), False) + prefill_tp = as_int(meta.get('prefill_tp', meta.get('tp', 1)), 1) + prefill_ep = as_int(meta.get('prefill_ep', meta.get('ep', 1)), 1) + prefill_num_workers = as_int(meta.get('prefill_num_workers', 1), 1) + decode_tp = as_int(meta.get('decode_tp', meta.get('tp', 1)), 1) + decode_ep = as_int(meta.get('decode_ep', meta.get('ep', 1)), 1) + decode_num_workers = as_int(meta.get('decode_num_workers', 1), 1) + prefill_dp_attention = meta.get('prefill_dp_attention') + decode_dp_attention = meta.get('decode_dp_attention') + dp_attention = meta.get('dp_attention', 'none') + + if prefill_dp_attention is None: + prefill_dp_attention = dp_attention + if decode_dp_attention is None: + decode_dp_attention = dp_attention + + if is_multinode: + if prefill_dp_attention == decode_dp_attention: + dp_attention = prefill_dp_attention + else: + dp_attention = f"prefill={str(prefill_dp_attention).lower()},decode={str(decode_dp_attention).lower()}" + row = { + 'is_multinode': is_multinode, 'model_prefix': meta.get('infmax_model_prefix', 'unknown'), 'model': m.get('model') or meta.get('model', 'unknown'), 'hw': meta.get('hw', 'unknown').upper(), 'framework': meta.get('framework', 'unknown').lower(), 'precision': 
meta.get('precision', 'unknown').lower(), 'spec_decoding': meta.get('spec_decoding', 'unknown'), - 'tp': int(meta.get('tp', 1)), - 'ep': int(meta.get('ep', 1)), - 'conc': int(meta.get('conc', 0)), - 'dp_attention': str(meta.get('dp_attention', "none")).lower(), + 'tp': as_int(meta.get('tp', prefill_tp), prefill_tp), + 'ep': as_int(meta.get('ep', prefill_ep), prefill_ep), + 'prefill_tp': prefill_tp, + 'prefill_ep': prefill_ep, + 'prefill_num_workers': prefill_num_workers, + 'decode_tp': decode_tp, + 'decode_ep': decode_ep, + 'decode_num_workers': decode_num_workers, + 'conc': as_int(meta.get('conc', 0), 0), + 'dp_attention': str(dp_attention).lower(), + 'prefill_dp_attention': str(prefill_dp_attention).lower(), + 'decode_dp_attention': str(decode_dp_attention).lower(), 'task': m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), @@ -226,49 +275,111 @@ def main(): row = build_row(meta, m) rows.append(row) + single_node_rows = [r for r in rows if not r['is_multinode']] + multinode_rows = [r for r in rows if r['is_multinode']] + # Sort for stable output (default: by model_prefix) sort_by = sys.argv[3] if len(sys.argv) > 3 else 'model_prefix' - if sort_by == 'hw': - rows.sort(key=lambda r: ( - r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + single_node_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['tp'], r['ep'], r['conc'], )) - else: - rows.sort(key=lambda r: ( - r['model_prefix'], r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'], + )) + ) + multinode_sort_key = ( + (lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + 
r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], + )) + if sort_by == 'hw' + else (lambda r: ( + r['model_prefix'], r['hw'], r['framework'], r['precision'], + r.get('spec_decoding', ''), + r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'], + r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'], )) + ) + single_node_rows.sort(key=single_node_sort_key) + multinode_rows.sort(key=multinode_sort_key) if not rows: print('> No eval results found to summarize.') else: # Print table using tabulate MODEL_PREFIX = "Model Prefix" - headers = [ - MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION, - TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL - ] - - table_rows = [ - [ - r['model_prefix'], - r['hw'], - r['framework'].upper(), - r['precision'].upper(), - r['spec_decoding'], - r['tp'], - r['ep'], - r['conc'], - r['dp_attention'], - r['task'], - f"{pct(r['score'])}{se(r['score_se'])}", - f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", - f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", - r['n_eff'] or '', - r['model'] + + if single_node_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in single_node_rows + ] + print("### Single-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + if multinode_rows: + headers = [ + MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, + DECODE_TP, 
DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, + CONC, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL, + ] + table_rows = [ + [ + r['model_prefix'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['conc'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '', + r['model'], + ] + for r in multinode_rows ] - for r in rows - ] + if single_node_rows: + print("\n") + print("### Multi-Node Eval Results\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) - print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate out_path = Path(f'agg_eval_{exp_name}.json')