diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e84fc0da5..6890126cf 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -503,6 +503,38 @@ dsr1-fp8-mi355x-atom-mtp:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
+# Eval-only: fp8 disagg WITHOUT DPA (dp-attn: false on both prefill and decode) — isolates DPA as the variable
+dsr1-fp8-mi355x-sglang-disagg-nodpa-eval:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - spec-decoding: "none"
+      conc-list: [ 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false  # DPA disabled on the prefill side
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false  # DPA disabled on the decode side
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
 dsr1-fp8-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -814,7 +846,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
 
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1022,7 +1054,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index e2cda146b..4da79d5cd 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -77,6 +77,20 @@ on:
         required: false
         type: string
         default: "[]"
+      run-eval:  # Enables the eval stage; exported to the job as RUN_EVAL
+        type: boolean
+        required: false
+        default: false
+      eval-only:
+        description: "Run only evals (skip throughput benchmark)"
+        type: boolean
+        required: false
+        default: false
+      eval-conc:
+        description: "Concurrency to use for eval requests (overrides default max-of-conc-list)"
+        type: string
+        required: false
+        default: ""
       ref:
         description: "Git ref (branch/sha) to checkout"
         required: false
@@ -96,6 +110,9 @@ env:
   CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }}
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
+  RUN_EVAL: ${{ inputs.run-eval }}
+  EVAL_ONLY: ${{ inputs.eval-only }}
+  EVAL_CONC: ${{ inputs.eval-conc }}
   PYTHONDONTWRITEBYTECODE: '1'
   PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
 
@@ -116,7 +133,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 480
-    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}"
+    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | P(tp${{ inputs.prefill-tp }}/ep${{ inputs.prefill-ep }}/dp${{ inputs.prefill-dp-attn }}/nw${{ inputs.prefill-num-worker }}) D(tp${{ inputs.decode-tp }}/ep${{ inputs.decode-ep }}/dp${{ inputs.decode-dp-attn }}/nw${{ inputs.decode-num-worker }}) | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ join(fromJson(inputs.conc-list), 'x') }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
 
     steps:
       - name: Slurm cleanup (pre-run)
@@ -146,9 +163,17 @@ jobs:
           ref: ${{ inputs.ref || github.ref }}
           clean: false
 
+      - name: Cleanup stale eval outputs (pre-run)
+        if: ${{ inputs.run-eval || inputs.eval-only }}
+        run: |
+          rm -f meta_env.json || true
+          rm -f results*.json || true
+          rm -f sample*.jsonl || true
+
       - name: Launch multi-node job script
         env:
           RUNNER_NAME: ${{ runner.name }}
+          RUNNER_TYPE: ${{ inputs.runner }}
           # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner}
           RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }}
         run: |
@@ -159,16 +184,26 @@ jobs:
           export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }}
           export IS_MULTINODE=true
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
-          # Check if at least one result file was created
-          if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then
-            echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
-            echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)"
+          if [ "${{ inputs.eval-only }}" = "true" ]; then
+            echo "Eval-only mode: skipping benchmark result file check"
+            echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV  # read by the eval artifact name in later steps; exported before the check can fail
+            if ! ls results*.json 1>/dev/null 2>&1; then  # verify eval produced results
+              echo "Eval-only run failed: no results*.json files found." >&2
+              exit 1
+            fi
           else
-            echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2
-            exit 1
+            # Check if at least one result file was created
+            if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then
+              echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
+              echo "Found result files: $(ls ${RESULT_FILENAME}_*.json)"
+            else
+              echo "Run failed: No benchmark result files found for ${RESULT_FILENAME}_*.json" >&2
+              exit 1
+            fi
           fi
 
       - name: Process results
+        if: ${{ !inputs.eval-only }}
         env:
           RUNNER_TYPE: ${{ inputs.runner }}
         run: |
@@ -189,11 +224,34 @@ jobs:
           done
 
       - name: Upload results
+        if: ${{ !inputs.eval-only }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}_*.json
 
+      - name: Upload eval results (if any)
+        if: ${{ always() && (inputs.run-eval || inputs.eval-only) }}  # always(): still runs when an earlier step failed
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        with:
+          name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
+          path: |
+            meta_env.json
+            results*.json
+            sample*.jsonl
+          if-no-files-found: ignore  # a run may not have produced every eval file
+
+      - name: Verify eval scores
+        if: ${{ inputs.eval-only }}
+        run: python3 utils/evals/validate_scores.py
+
+      - name: Cleanup eval outputs (post-upload)
+        if: ${{ always() && (inputs.run-eval || inputs.eval-only) }}
+        run: |
+          rm -f meta_env.json || true
+          rm -f results*.json || true
+          rm -f sample*.jsonl || true
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 797505eec..25bec61ee 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -140,6 +140,13 @@ jobs:
           ref: ${{ inputs.ref || github.ref }}
           clean: false
 
+      - name: Cleanup stale eval outputs (pre-run)
+        if: ${{ inputs.run-eval || inputs.eval-only }}
+        run: |
+          rm -f meta_env.json || true
+          rm -f results*.json || true
+          rm -f sample*.jsonl || true
+
       - name: Launch job script
         env:
           RUNNER_NAME: ${{ runner.name }}
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index d6ecf76b0..487a4a0c3 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -38,6 +38,7 @@ jobs:
             single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
             multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
             eval-config: ${{ steps.get-jobs.outputs.eval-config }}
+            multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
         steps:
             - name: Checkout code (ref)
               if: ${{ inputs.ref && inputs.ref != '' }}
@@ -55,11 +56,13 @@ jobs:
                   CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
                   SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
-                  MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
+                  MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('run-eval', False)]))")
                   EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
+                  MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
                   echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
+                  echo "multi-node-eval-config=$MULTI_EVAL" >> $GITHUB_OUTPUT
 
     test-sweep-multi-node:
         needs: get-jobs
@@ -97,6 +100,48 @@ jobs:
             decode-ep: ${{ matrix.config.decode.ep }}
             decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
             decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
+            run-eval: false
+            ref: ${{ inputs.ref }}
+
+    test-sweep-multi-node-evals:
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.multi-node-eval-config != '[]' }}
+        uses: ./.github/workflows/benchmark-multinode-tmpl.yml
+        name: multi-node eval /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.multi-node-eval-config) }}
+        secrets: inherit
+        with:
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            exp-name: ${{ matrix.config.exp-name }}
+            conc-list: ${{ toJson(matrix.config.conc) }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+
+            prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
+            prefill-tp: ${{ matrix.config.prefill.tp }}
+            prefill-ep: ${{ matrix.config.prefill.ep }}
+            prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
+            prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
+
+            decode-num-worker: ${{ matrix.config.decode.num-worker }}
+            decode-tp: ${{ matrix.config.decode.tp }}
+            decode-ep: ${{ matrix.config.decode.ep }}
+            decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
+            decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
+            run-eval: true
+            eval-only: true
+            eval-conc: ${{ matrix.config.eval-conc }}
             ref: ${{ inputs.ref }}
 
     test-sweep-single-node:
@@ -162,15 +207,15 @@ jobs:
 
     collect-results:
         needs: [test-sweep-multi-node, test-sweep-single-node]
-        if: ${{ always() }}
+        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped') }}
         uses: ./.github/workflows/collect-results.yml
         secrets: inherit
         with:
             result-prefix: "bmk"
 
     collect-evals:
-        needs: [test-sweep-evals]
-        if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }}
+        needs: [test-sweep-evals, test-sweep-multi-node-evals]
+        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit
 
diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
index 4d61a918c..e3eaf1c3b 100644
--- a/.github/workflows/run-sweep.yml
+++ b/.github/workflows/run-sweep.yml
@@ -105,6 +105,7 @@ jobs:
             decode-ep: ${{ matrix.config.decode.ep }}
             decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
             decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
+            run-eval: false
 
     sweep-multi-node-8k1k:
         needs: setup
@@ -189,6 +190,44 @@ jobs:
             run-eval: true
             eval-only: true
 
+    sweep-multi-node-evals:
+        needs: setup
+        if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).multinode_evals) != 'null' }}
+        uses: ./.github/workflows/benchmark-multinode-tmpl.yml
+        name: multi-node eval /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.setup.outputs.search-space-config).multinode_evals }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            conc-list: ${{ toJson(matrix.config.conc) }}
+            spec-decoding: ${{ matrix.config.spec-decoding }}
+            disagg: ${{ matrix.config.disagg }}
+            prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
+            prefill-tp: ${{ matrix.config.prefill.tp }}
+            prefill-ep: ${{ matrix.config.prefill.ep }}
+            prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
+            prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
+            decode-num-worker: ${{ matrix.config.decode.num-worker }}
+            decode-tp: ${{ matrix.config.decode.tp }}
+            decode-ep: ${{ matrix.config.decode.ep }}
+            decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
+            decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
+            run-eval: true
+            eval-only: true
+            eval-conc: ${{ matrix.config.eval-conc }}
+
     collect-results:
         needs:
             [
@@ -205,8 +244,8 @@ jobs:
             result-prefix: "bmk"
 
     collect-evals:
-        needs: [sweep-evals, setup]
-        if: ${{ always() && needs.setup.result != 'skipped' && needs.sweep-evals.result != 'skipped' }}
+        needs: [sweep-evals, sweep-multi-node-evals, setup]
+        if: ${{ always() && needs.setup.result != 'skipped' && (needs.sweep-evals.result != 'skipped' || needs.sweep-multi-node-evals.result != 'skipped') }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit
 
diff --git a/AGENTS.md b/AGENTS.md
index 94c28e334..e64a903cd 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -37,8 +37,9 @@ InferenceX is an open-source, automated benchmarking system that continuously tr
 │   ├── workflows/           # GitHub Actions CI/CD
 │   │   ├── run-sweep.yml    # Main performance sweep
 │   │   ├── e2e-tests.yml    # End-to-end testing
-│   │   ├── benchmark-tmpl.yml  # Benchmark job template
-│   │   └── collect-evals.yml   # Eval results collection
+│   │   ├── benchmark-tmpl.yml           # Single-node benchmark job template
+│   │   ├── benchmark-multinode-tmpl.yml # Multi-node benchmark job template
+│   │   └── collect-evals.yml            # Eval results collection
 │   └── configs/             # Master configuration files
 │       ├── nvidia-master.yaml
 │       ├── amd-master.yaml
@@ -299,14 +300,27 @@ Evals run optional accuracy checks to ensure model outputs aren't degraded by in
 
 ### When Evals Run
 
-Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run at two concurrency levels per configuration group:
+Evals run as **separate workflow jobs** from throughput benchmarks (eval-only mode). The `EVAL_ONLY` flag skips throughput benchmarking and only runs lm-eval.
 
-- **Highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn)
-- **Lower-median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn)
+**Single-node** eval selection:
+- All TPs at **highest concurrency** and **median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn)
+- Only on `8k1k` sequence length
+
+**Multi-node** eval selection:
+- Entry with **highest max concurrency** per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn)
+- Only on `8k1k` sequence length
 
 This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`.
 
-**Note**: Evals only run on `8k1k` sequence length.
+**Workflow separation**: Eval jobs are independent from benchmark jobs:
+- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node)
+- `e2e-tests.yml`: `test-sweep-evals` and `test-sweep-multi-node-evals`
+- Both use their respective benchmark templates with `eval-only: true`
+- `collect-evals` depends only on eval jobs (plus `setup` in `run-sweep.yml`), not benchmark jobs
+
+**Multi-node eval infrastructure**:
+- AMD (MI355X): `server.sh` skips `bench.sh` when `EVAL_ONLY=true`, runs lm-eval directly
+- NVIDIA (GB200/GB300): Uses srt-slurm `infmax-eval` benchmark type with expanded `eval_context_length`
 
 ### Eval Framework: lm-eval
 
@@ -336,13 +350,13 @@ All benchmark scripts in `benchmarks/` follow one of two flows:
 
 ```bash
 # Combined mode (benchmark + eval):
-# 1. Start server
+# 1. Start server (with --context-length expansion if EVAL_ONLY=true)
 # 2. wait_for_server_ready
-# 3. run_benchmark_serving (throughput)
-# 4. Conditionally run evals:
+# 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true)
+# 4. Run evals:
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
-    append_lm_eval_summary
+    append_lm_eval_summary  # Writes meta_env.json and moves artifacts
 fi
 
 # Eval-only mode (EVAL_ONLY=true):
@@ -353,6 +367,16 @@ fi
 # 5. run_eval + append_lm_eval_summary
 ```
 
+**Multi-node AMD** (`benchmarks/multi_node/amd_utils/server.sh`):
+- Skips `bench.sh` when `EVAL_ONLY=true`
+- Runs lm-eval via `run_eval` against the router on port 30000
+- Copies eval artifacts to `/run_logs/slurm_job-*/eval_results/`
+
+**Multi-node NVIDIA** (GB200/GB300 via srt-slurm):
+- Uses `benchmark.type: "infmax-eval"` in srt-slurm config
+- `benchmark.eval_context_length` expands server context for eval
+- `infmax-eval` benchmark runner sources `benchmark_lib.sh` from `INFMAX_WORKSPACE`
+
 ### Key Eval Functions in `benchmarks/benchmark_lib.sh`
 
 | Function | Description |
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 535313252..403484998 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -506,13 +506,13 @@ _install_lm_eval_deps() {
     python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true
     local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476"
     if command -v git >/dev/null 2>&1; then
-        if ! python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \
+        if ! python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \
             "git+https://github.com/EleutherAI/lm-evaluation-harness.git@${lm_eval_ref}"; then
-            python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \
+            python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \
                 "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true
         fi
     else
-        python3 -m pip install -q --no-cache-dir --no-deps --break-system-packages \
+        python3 -m pip install -q --no-cache-dir --no-deps --force-reinstall --break-system-packages \
             "https://github.com/EleutherAI/lm-evaluation-harness/archive/${lm_eval_ref}.tar.gz" || true
     fi
 }
@@ -593,14 +593,23 @@ PY
 
 get_native_max_context_length() {
     local model_path="$1"
+    # Prints the native max context length of the model, or 0 when it cannot be determined.
+    # Prefers MODEL_PATH when it is a local directory, since the argument may be a served-model name
+    # that is neither a valid HF repo ID nor a local path (e.g. "deepseek-r1-fp4" on the B300 cluster).
+    if [ -n "${MODEL_PATH:-}" ] && [ -d "${MODEL_PATH}" ]; then
+        model_path="${MODEL_PATH}"
+    fi
     python3 -c "
-from transformers import AutoConfig
-config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True)
-for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']:
-    if hasattr(config, attr):
-        print(getattr(config, attr))
-        break
-else:
+try:
+    # The path is read from argv[1] rather than spliced into this source text,
+    # so quote characters in it cannot break or alter the snippet.
+    import sys
+    from transformers import AutoConfig
+    config = AutoConfig.from_pretrained(sys.argv[1], trust_remote_code=True)
+    attrs = ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']
+    # First attribute present on the config wins; print 0 when none is.
+    print(next((getattr(config, a) for a in attrs if hasattr(config, a)), 0))
+except Exception:
     print(0)
-"
+" "${model_path}"
 }
@@ -708,8 +717,32 @@ append_lm_eval_summary() {
     # Write minimal meta for collectors that expect it
     local meta_json="${out_dir}/meta_env.json"
     local model_name="${MODEL_NAME:-$MODEL}"
+    local is_multinode_json="false"
+    if [ "${IS_MULTINODE:-false}" = "true" ]; then
+        is_multinode_json="true"
+    fi
+
+    local prefill_tp="${PREFILL_TP:-${TP:-1}}"
+    local prefill_ep="${PREFILL_EP:-${EP_SIZE:-1}}"
+    local prefill_num_workers="${PREFILL_NUM_WORKERS:-1}"
+    local decode_tp="${DECODE_TP:-${TP:-1}}"
+    local decode_ep="${DECODE_EP:-${EP_SIZE:-1}}"
+    local decode_num_workers="${DECODE_NUM_WORKERS:-1}"
+
     local dp_json="false"
-    if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi
+    if [ "${DP_ATTENTION:-false}" = "true" ]; then dp_json="true"; fi
+    local prefill_dp_json="$dp_json"
+    if [ "${PREFILL_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then
+        prefill_dp_json="true"
+    else
+        prefill_dp_json="false"
+    fi
+    local decode_dp_json="$dp_json"
+    if [ "${DECODE_DP_ATTENTION:-${DP_ATTENTION:-false}}" = "true" ]; then
+        decode_dp_json="true"
+    else
+        decode_dp_json="false"
+    fi
 
     # Derive framework/precision from env, fallback to parsing RESULT_FILENAME
     # RESULT_FILENAME format (from workflow):
@@ -734,6 +767,7 @@ append_lm_eval_summary() {
     fi
     cat > "${meta_json}" <<META
 {
+  "is_multinode": ${is_multinode_json},
   "framework": "${fw:-unknown}",
   "precision": "${prec:-unknown}",
   "spec_decoding": "${SPEC_DECODING}",
@@ -741,6 +775,14 @@ append_lm_eval_summary() {
   "conc": ${CONC:-1},
   "ep": ${EP_SIZE:-1},
   "dp_attention": ${dp_json},
+  "prefill_tp": ${prefill_tp},
+  "prefill_ep": ${prefill_ep},
+  "prefill_dp_attention": ${prefill_dp_json},
+  "prefill_num_workers": ${prefill_num_workers},
+  "decode_tp": ${decode_tp},
+  "decode_ep": ${decode_ep},
+  "decode_dp_attention": ${decode_dp_json},
+  "decode_num_workers": ${decode_num_workers},
   "model": "${model_name:-}",
   "infmax_model_prefix": "${MODEL_PREFIX:-unknown}",
   "hw": "${RUNNER_TYPE:-unknown}",
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 6b0352f24..2f88250b5 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -285,6 +285,17 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY
 export DRY_RUN="${DRY_RUN:-0}"
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 
+# Eval-related env vars (threaded from submit.sh)
+export RUN_EVAL="${RUN_EVAL:-false}"
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+export EVAL_CONC="${EVAL_CONC:-}"
+export FRAMEWORK="${FRAMEWORK:-}"
+export PRECISION="${PRECISION:-}"
+export MODEL_PREFIX="${MODEL_PREFIX:-}"
+export RUNNER_TYPE="${RUNNER_TYPE:-}"
+export RESULT_FILENAME="${RESULT_FILENAME:-}"
+export SPEC_DECODING="${SPEC_DECODING:-}"
+
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
 export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"
@@ -389,6 +400,15 @@ exec sudo docker run --rm \
     -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \
     -e DRY_RUN=\$DRY_RUN \
     -e BENCHMARK_LOGS_DIR=/benchmark_logs \
+    -e RUN_EVAL=\$RUN_EVAL \
+    -e EVAL_ONLY=\$EVAL_ONLY \
+    -e EVAL_CONC=\$EVAL_CONC \
+    -e FRAMEWORK=\$FRAMEWORK \
+    -e PRECISION=\$PRECISION \
+    -e MODEL_PREFIX=\$MODEL_PREFIX \
+    -e RUNNER_TYPE=\$RUNNER_TYPE \
+    -e RESULT_FILENAME=\$RESULT_FILENAME \
+    -e SPEC_DECODING=\$SPEC_DECODING \
     --name \"$DOCKER_CONT_NAME\" \
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7f174b760..9ed395bb4 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -468,7 +468,9 @@ if [ "$NODE_RANK" -eq 0 ]; then
         ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
         ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
 
-    if [[ "$DRY_RUN" -eq 1 ]]; then
+    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+    elif [[ "$DRY_RUN" -eq 1 ]]; then
         echo "DRY RUN: $BENCH_CMD"
     else
         set -x
@@ -476,6 +478,85 @@ if [ "$NODE_RANK" -eq 0 ]; then
         set +x
     fi
 
+    # Run evaluation if requested (before killing router)
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        echo "Running lm-eval evaluation on Node 0..."
+
+        # Health check: verify the router is still serving before running eval.
+        # The throughput benchmark may have crashed/exhausted decode workers.
+        EVAL_HEALTH_OK=false
+        for _attempt in 1 2 3; do
+            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
+                EVAL_HEALTH_OK=true
+                break
+            fi
+            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
+            sleep 10
+        done
+
+        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
+            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
+        else
+            # Must run from repo root so utils/evals/${task}.yaml resolves
+            pushd /workspace
+
+            # Source eval functions from benchmark_lib.sh
+            source /workspace/benchmarks/benchmark_lib.sh
+
+            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
+            if [[ -n "${EVAL_CONC:-}" ]]; then
+                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
+            else
+                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+            fi
+
+            if [[ "$DRY_RUN" -eq 1 ]]; then
+                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})"
+            else
+                # Run lm-eval against the router on port 30000
+                run_eval --framework lm-eval --port 30000
+
+                # Set metadata env vars for append_lm_eval_summary
+                export TP="${PREFILL_TP_SIZE}"
+                export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                export EP_SIZE=1
+                [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
+                export PREFILL_TP="${PREFILL_TP_SIZE}"
+                export PREFILL_EP=1
+                [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
+                export PREFILL_NUM_WORKERS="${xP}"
+                export DECODE_TP="${DECODE_TP_SIZE}"
+                export DECODE_EP=1
+                [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
+                export DECODE_NUM_WORKERS="${yD}"
+                export DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
+                export ISL="${BENCH_INPUT_LEN}"
+                export OSL="${BENCH_OUTPUT_LEN}"
+                # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME
+                # are already set via Docker -e flags from job.slurm
+
+                append_lm_eval_summary
+                # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
+
+                # Copy eval artifacts to run_logs for NFS extraction by runner
+                EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
+                mkdir -p "$EVAL_COPY_DIR"
+                for f in meta_env.json; do
+                    [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
+                done
+                # Use find for glob patterns to avoid "no match" errors
+                find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+                find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+
+                echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+            fi
+
+            popd
+        fi
+    fi
+
     # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
     LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
     mkdir -p "$LOGS_OUTPUT"
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index 802106350..be22b8d33 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -105,6 +105,17 @@ export BENCH_NUM_PROMPTS_MULTIPLIER=10
 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
 
+# Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker); each is re-exported with a fallback ("false" for RUN_EVAL/EVAL_ONLY, empty for the rest)
+export RUN_EVAL="${RUN_EVAL:-false}"
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+export EVAL_CONC="${EVAL_CONC:-}"
+export FRAMEWORK="${FRAMEWORK:-}"
+export PRECISION="${PRECISION:-}"
+export MODEL_PREFIX="${MODEL_PREFIX:-}"
+export RUNNER_TYPE="${RUNNER_TYPE:-}"
+export RESULT_FILENAME="${RESULT_FILENAME:-}"
+export SPEC_DECODING="${SPEC_DECODING:-}"
+
 # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output.
 # SLURM writes output files on the batch node, so /tmp won't work (node-local).
 # Defaults to a sibling directory of the submit working directory.
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index 0d1bd40cc..fb9bb7b22 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -36,7 +36,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         rm -rf "$SRT_REPO_DIR"
     fi
 
-    git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+    git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR" || exit 1
     git checkout sa-submission-q1-2026
 
@@ -65,6 +65,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
 
     export ISL="$ISL"
     export OSL="$OSL"
+    export EVAL_ONLY="${EVAL_ONLY:-false}"
 
     # Create srtslurm.yaml for srtctl (used by both frameworks)
     SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}"
@@ -98,7 +99,17 @@ EOF
     echo "Running make setup..."
     make setup ARCH=x86_64
 
+    # Export eval-related env vars for srt-slurm post-benchmark eval
+    export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
     echo "Submitting job with srtctl..."
+
+    if [[ -z "$CONFIG_FILE" ]]; then
+        echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+        echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+        exit 1
+    fi
+
     # Override the job name in the config file with the runner name
     sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
@@ -162,45 +173,63 @@ EOF
     cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
     tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 
-    # Find all result subdirectories
-    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        # Find all result subdirectories
+        RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+        if [ -z "$RESULT_SUBDIRS" ]; then
+            echo "Warning: No result subdirectories found in $LOGS_DIR"
+        else
+            # Process results from all configurations
+            for result_subdir in $RESULT_SUBDIRS; do
+                echo "Processing result subdirectory: $result_subdir"
+
+                # Extract configuration info from directory name
+                CONFIG_NAME=$(basename "$result_subdir")
+
+                # Find all result JSON files
+                RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+                for result_file in $RESULT_FILES; do
+                    if [ -f "$result_file" ]; then
+                        # Extract metadata from filename
+                        # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                        filename=$(basename "$result_file")
+                        concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                        gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                        ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                        gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                        echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                        WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                        cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                        echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                    fi
+                done
+            done
+        fi
 
-    if [ -z "$RESULT_SUBDIRS" ]; then
-        echo "Warning: No result subdirectories found in $LOGS_DIR"
+        echo "All result files processed"
     else
-        # Process results from all configurations
-        for result_subdir in $RESULT_SUBDIRS; do
-            echo "Processing result subdirectory: $result_subdir"
-
-            # Extract configuration info from directory name
-            CONFIG_NAME=$(basename "$result_subdir")
-
-            # Find all result JSON files
-            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
-
-            for result_file in $RESULT_FILES; do
-                if [ -f "$result_file" ]; then
-                    # Extract metadata from filename
-                    # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                    filename=$(basename "$result_file")
-                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
-
-                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
-
-                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+        echo "EVAL_ONLY=true: Skipping benchmark result collection"
+    fi
 
-                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-                fi
+    # Collect eval artifacts when RUN_EVAL or EVAL_ONLY is set; only regular files are copied
+    if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+        EVAL_DIR="$LOGS_DIR/eval_results"
+        if [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue  # also skips the literal pattern left by an unmatched glob
+                cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
             done
-        done
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+        fi
     fi
 
-    echo "All result files processed"
-
     # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
     # from blocking the next job's checkout on this runner
     echo "Cleaning up srt-slurm outputs..."
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 68da9f2b7..c718dcad0 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -16,11 +16,11 @@ fi
 # The yaml files specify HuggingFace model IDs for portability, but we use
 # local paths to avoid repeated downloading on the shared B300 cluster.
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
-    export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2"
+    export MODEL_PATH="/data/models/dsr1-fp4"
     export SERVED_MODEL_NAME="deepseek-r1-fp4"
     export SRT_SLURM_MODEL_PREFIX="dsr1"
 elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
-    export MODEL_PATH="/scratch/models/deepseek-r1-0528"
+    export MODEL_PATH="/data/models/dsr1-fp8"
     export SERVED_MODEL_NAME="deepseek-r1-fp8"
     export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
 else
@@ -35,7 +35,7 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR" || exit 1
 git checkout sa-submission-q1-2026
 
@@ -64,6 +64,7 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX
 
 export ISL="$ISL"
 export OSL="$OSL"
+export EVAL_ONLY="${EVAL_ONLY:-false}"
 
 # Create srtslurm.yaml for srtctl
 SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}"
@@ -99,7 +100,17 @@ cat srtslurm.yaml
 echo "Running make setup..."
 make setup ARCH=x86_64
 
+# Export eval-related env vars for srt-slurm post-benchmark eval
+export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
 echo "Submitting job with srtctl..."
+
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
 # Override the job name in the config file with the runner name
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
 SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
@@ -163,45 +174,63 @@ echo "Found logs directory: $LOGS_DIR"
 cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
 tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 
-# Find all result subdirectories
-RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    # Find all result subdirectories
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        # Process results from all configurations
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            # Extract configuration info from directory name
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            # Find all result JSON files
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    # Extract metadata from filename
+                    # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                    filename=$(basename "$result_file")
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
 
-if [ -z "$RESULT_SUBDIRS" ]; then
-    echo "Warning: No result subdirectories found in $LOGS_DIR"
+    echo "All result files processed"
 else
-    # Process results from all configurations
-    for result_subdir in $RESULT_SUBDIRS; do
-        echo "Processing result subdirectory: $result_subdir"
-
-        # Extract configuration info from directory name
-        CONFIG_NAME=$(basename "$result_subdir")
-
-        # Find all result JSON files
-        RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
-
-        for result_file in $RESULT_FILES; do
-            if [ -f "$result_file" ]; then
-                # Extract metadata from filename
-                # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                filename=$(basename "$result_file")
-                concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
-
-                echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
-
-                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                cp "$result_file" "$WORKSPACE_RESULT_FILE"
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
 
-                echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-            fi
+# Collect eval artifacts when RUN_EVAL or EVAL_ONLY is set; only regular files are copied
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue  # also skips the literal pattern left by an unmatched glob
+            cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
         done
-    done
+    else
+        echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+    fi
 fi
 
-echo "All result files processed"
-
 # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
 # from blocking the next job's checkout on this runner
 echo "Cleaning up srt-slurm outputs..."
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index f8f0ef26e..d84c0ac13 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -50,6 +50,8 @@ NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" |
 enroot import -o $SQUASH_FILE docker://$IMAGE
 enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
 
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+
 export ISL="$ISL"
 export OSL="$OSL"
 
@@ -105,6 +107,14 @@ PY
 fi
 
 
+# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML.
+# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs.
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
 echo "Cloning srt-slurm repository..."
 SRT_REPO_DIR="srt-slurm"
 if [ -d "$SRT_REPO_DIR" ]; then
@@ -112,7 +122,7 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout sa-submission-q1-2026
 
@@ -164,6 +174,9 @@ cat srtslurm.yaml
 echo "Running make setup..."
 make setup ARCH=aarch64
 
+# Export eval-related env vars for srt-slurm post-benchmark eval
+export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
 echo "Submitting job with srtctl..."
 
 # Override the job name in the config file with the runner name
@@ -223,51 +236,71 @@ set -x
 echo "Job $JOB_ID completed!"
 echo "Collecting results..."
 
-if [ ! -d "$LOGS_DIR" ]; then
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
     echo "Warning: Logs directory not found at $LOGS_DIR"
-    exit 1
 fi
 
-echo "Found logs directory: $LOGS_DIR"
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
 
-cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
-tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+    # Find all result subdirectories
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
 
-# Find all result subdirectories
-RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        # Process results from all configurations
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            # Extract configuration info from directory name
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            # Find all result JSON files
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    # Extract metadata from filename
+                    # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                    filename=$(basename "$result_file")
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
 
-if [ -z "$RESULT_SUBDIRS" ]; then
-    echo "Warning: No result subdirectories found in $LOGS_DIR"
+    echo "All result files processed"
 else
-    # Process results from all configurations
-    for result_subdir in $RESULT_SUBDIRS; do
-        echo "Processing result subdirectory: $result_subdir"
-
-        # Extract configuration info from directory name
-        CONFIG_NAME=$(basename "$result_subdir")
-
-        # Find all result JSON files
-        RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
-
-        for result_file in $RESULT_FILES; do
-            if [ -f "$result_file" ]; then
-                # Extract metadata from filename
-                # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                filename=$(basename "$result_file")
-                concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
-
-                echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
-
-                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                cp "$result_file" "$WORKSPACE_RESULT_FILE"
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
 
-                echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-            fi
+# Collect eval artifacts when RUN_EVAL or EVAL_ONLY is set; only regular files are copied
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue  # also skips the literal pattern left by an unmatched glob
+            cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
         done
-    done
+    else
+        echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+    fi
 fi
-
-echo "All result files processed"
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index d71fd5af7..91147d90d 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -31,6 +31,8 @@ NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]
 srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
 srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE"
 
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+
 export ISL="$ISL"
 export OSL="$OSL"
 
@@ -41,7 +43,7 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout sa-submission-q1-2026
 
@@ -95,8 +97,17 @@ cat srtslurm.yaml
 echo "Running make setup..."
 make setup ARCH=aarch64
 
+# Export eval-related env vars for srt-slurm post-benchmark eval
+export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
 echo "Submitting job with srtctl..."
 
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
 # Override the job name in the config file with the runner name
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
 
@@ -150,54 +161,74 @@ set -x
 echo "Job $JOB_ID completed!"
 echo "Collecting results..."
 
-if [ ! -d "$LOGS_DIR" ]; then
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
     echo "Warning: Logs directory not found at $LOGS_DIR"
-    exit 1
 fi
 
-echo "Found logs directory: $LOGS_DIR"
-
-cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
-tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
 
-# Find all result subdirectories
-RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+    # Find all result subdirectories
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
 
-if [ -z "$RESULT_SUBDIRS" ]; then
-    echo "Warning: No result subdirectories found in $LOGS_DIR"
-else
-    # Process results from all configurations
-    for result_subdir in $RESULT_SUBDIRS; do
-        echo "Processing result subdirectory: $result_subdir"
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        # Process results from all configurations
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
 
-        # Extract configuration info from directory name
-        CONFIG_NAME=$(basename "$result_subdir")
+            # Extract configuration info from directory name
+            CONFIG_NAME=$(basename "$result_subdir")
 
-        # Find all result JSON files
-        RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+            # Find all result JSON files
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
 
-        for result_file in $RESULT_FILES; do
-            if [ -f "$result_file" ]; then
-                # Extract metadata from filename
-                # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                filename=$(basename "$result_file")
-                concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    # Extract metadata from filename
+                    # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                    filename=$(basename "$result_file")
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
 
-                echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
 
-                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                cp "$result_file" "$WORKSPACE_RESULT_FILE"
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
 
-                echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-            fi
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
         done
-    done
+    fi
+
+    echo "All result files processed"
+else
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
 fi
 
-echo "All result files processed"
+# Collect eval artifacts when RUN_EVAL or EVAL_ONLY is set; only regular files are copied
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue  # also skips the literal pattern left by an unmatched glob
+            cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
+        done
+    else
+        echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+    fi
+fi
 
 # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
 # from blocking the next job's checkout on this runner
diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh
index bb0335955..91cea74f3 100644
--- a/runners/launch_h100-dgxc-slurm.sh
+++ b/runners/launch_h100-dgxc-slurm.sh
@@ -41,7 +41,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         rm -rf "$SRT_REPO_DIR"
     fi
 
-    git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+    git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q1-2026
 
@@ -75,6 +75,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
 
     export ISL="$ISL"
     export OSL="$OSL"
+    export EVAL_ONLY="${EVAL_ONLY:-false}"
 
     # Create srtslurm.yaml for srtctl (used by both frameworks)
     SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}"
@@ -112,7 +113,17 @@ EOF
     echo "Running make setup..."
     make setup ARCH=x86_64
 
+    # Export eval-related env vars for srt-slurm post-benchmark eval
+    export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
     echo "Submitting job with srtctl..."
+
+    if [[ -z "$CONFIG_FILE" ]]; then
+        echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+        echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+        exit 1
+    fi
+
     # Override the job name in the config file with the runner name
     sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
     sed -i "/^name:.*/a sbatch_directives:\n  exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE"
@@ -177,45 +188,63 @@ EOF
     cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
     tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 
-    # Find all result subdirectories
-    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
-
-    if [ -z "$RESULT_SUBDIRS" ]; then
-        echo "Warning: No result subdirectories found in $LOGS_DIR"
-    else
-        # Process results from all configurations
-        for result_subdir in $RESULT_SUBDIRS; do
-            echo "Processing result subdirectory: $result_subdir"
-
-            # Extract configuration info from directory name
-            CONFIG_NAME=$(basename "$result_subdir")
-
-            # Find all result JSON files
-            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
-
-            for result_file in $RESULT_FILES; do
-                if [ -f "$result_file" ]; then
-                    # Extract metadata from filename
-                    # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                    filename=$(basename "$result_file")
-                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        # Find all result subdirectories
+        RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
 
-                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+        if [ -z "$RESULT_SUBDIRS" ]; then
+            echo "Warning: No result subdirectories found in $LOGS_DIR"
+        else
+            # Process results from all configurations
+            for result_subdir in $RESULT_SUBDIRS; do
+                echo "Processing result subdirectory: $result_subdir"
+
+                # Extract configuration info from directory name
+                CONFIG_NAME=$(basename "$result_subdir")
+
+                # Find all result JSON files
+                RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+                for result_file in $RESULT_FILES; do
+                    if [ -f "$result_file" ]; then
+                        # Extract metadata from filename
+                        # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                        filename=$(basename "$result_file")
+                        concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                        gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                        ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                        gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                        echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                        WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                        cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                        echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                    fi
+                done
+            done
+        fi
 
-                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+        echo "All result files processed"
+    else
+        echo "EVAL_ONLY=true: Skipping benchmark result collection"
+    fi
 
-                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-                fi
+    # Collect eval artifacts when RUN_EVAL or EVAL_ONLY is set; only regular files are copied
+    if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+        EVAL_DIR="$LOGS_DIR/eval_results"
+        if [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue  # also skips the literal pattern left by an unmatched glob
+                cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
             done
-        done
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+        fi
     fi
 
-    echo "All result files processed"
-
     # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
     # from blocking the next job's checkout on this runner
     echo "Cleaning up srt-slurm outputs..."
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 9b3b771a5..3e7032314 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -40,7 +40,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         rm -rf "$SRT_REPO_DIR"
     fi
 
-    git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
+    git clone https://github.com/Oseltamivir/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q1-2026
 
@@ -74,6 +74,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
 
     export ISL="$ISL"
     export OSL="$OSL"
+    export EVAL_ONLY="${EVAL_ONLY:-false}"
 
     # Create srtslurm.yaml for srtctl (used by both frameworks)
     SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}"
@@ -112,7 +113,17 @@ EOF
     echo "Running make setup..."
     make setup ARCH=x86_64
 
+    # Export eval-related env vars for srt-slurm post-benchmark eval
+    export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
     echo "Submitting job with srtctl..."
+
+    if [[ -z "$CONFIG_FILE" ]]; then
+        echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+        echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+        exit 1
+    fi
+
     # Override the job name in the config file with the runner name
     sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
@@ -176,45 +187,63 @@ EOF
     cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
     tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
 
-    # Find all result subdirectories
-    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
-
-    if [ -z "$RESULT_SUBDIRS" ]; then
-        echo "Warning: No result subdirectories found in $LOGS_DIR"
-    else
-        # Process results from all configurations
-        for result_subdir in $RESULT_SUBDIRS; do
-            echo "Processing result subdirectory: $result_subdir"
-
-            # Extract configuration info from directory name
-            CONFIG_NAME=$(basename "$result_subdir")
-
-            # Find all result JSON files
-            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
-
-            for result_file in $RESULT_FILES; do
-                if [ -f "$result_file" ]; then
-                    # Extract metadata from filename
-                    # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
-                    filename=$(basename "$result_file")
-                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
-                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
-                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
-                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        # Find all result subdirectories
+        RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
 
-                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+        if [ -z "$RESULT_SUBDIRS" ]; then
+            echo "Warning: No result subdirectories found in $LOGS_DIR"
+        else
+            # Process results from all configurations
+            for result_subdir in $RESULT_SUBDIRS; do
+                echo "Processing result subdirectory: $result_subdir"
+
+                # Extract configuration info from directory name
+                CONFIG_NAME=$(basename "$result_subdir")
+
+                # Find all result JSON files
+                RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+                for result_file in $RESULT_FILES; do
+                    if [ -f "$result_file" ]; then
+                        # Extract metadata from filename
+                        # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json"
+                        filename=$(basename "$result_file")
+                        concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                        gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                        ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                        gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                        echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                        WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                        cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                        echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                    fi
+                done
+            done
+        fi
 
-                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
-                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+        echo "All result files processed"
+    else
+        echo "EVAL_ONLY=true: Skipping benchmark result collection"
+    fi
 
-                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
-                fi
+    # Collect eval results if eval was requested
+    if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+        EVAL_DIR="$LOGS_DIR/eval_results"
+        if [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue  # skip unmatched glob / non-regular entries
+                cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
             done
-        done
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+        fi
     fi
 
-    echo "All result files processed"
-
     # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
     # from blocking the next job's checkout on this runner
     echo "Cleaning up srt-slurm outputs..."
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 52e28e9b8..aa55d35e5 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -51,6 +51,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     mkdir -p "$BENCHMARK_LOGS_DIR"
     sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
+    # Ensure root-owned files are cleaned up even on early exit to prevent
+    # EACCES errors when the next GH Actions job checks out on this runner
+    trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT
+
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
     if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then
         BENCHMARK_SUBDIR="multi_node"
@@ -101,33 +105,50 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement
     # Find the latest log directory that contains the data
 
-    cat > collect_latest_results.py <<'PY'
+    if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+        cat > collect_latest_results.py <<'PY'
 import os, sys
 sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
 for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
     print(path)
 PY
 
-    LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
-    if [ -z "$LOGS_DIR" ]; then
-        echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
-        exit 1
+        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
+        if [ -z "$LOGS_DIR" ]; then
+            echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
+            exit 1
+        fi
+
+        echo "Found logs directory: $LOGS_DIR"
+        ls -la "$LOGS_DIR"
+
+        # Result JSON are contained within the result directory
+        for result_file in $(find $LOGS_DIR -type f); do
+            # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json
+            file_name=$(basename $result_file)
+            if [ -f $result_file ]; then
+                # Copy the result file to workspace with a unique name
+                WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
+                echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
+                cp $result_file $WORKSPACE_RESULT_FILE
+            fi
+        done
     fi
 
-    echo "Found logs directory: $LOGS_DIR"
-    ls -la "$LOGS_DIR"
-
-    # Result JSON are contained within the result directory
-    for result_file in $(find $LOGS_DIR -type f); do
-        # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json
-        file_name=$(basename $result_file)
-        if [ -f $result_file ]; then
-            # Copy the result file to workspace with a unique name
-            WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
-            echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
-            cp $result_file $WORKSPACE_RESULT_FILE
+    # Extract eval results if eval was requested
+    if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+        # Find eval_results in the slurm job logs directory (first match only)
+        EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1)
+        if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then
+            echo "Extracting eval results from $EVAL_DIR"
+            for eval_file in "$EVAL_DIR"/*; do
+                [ -f "$eval_file" ] || continue  # skip unmatched glob / non-regular entries
+                cp "$eval_file" "$GITHUB_WORKSPACE/" && echo "Copied eval artifact: $(basename "$eval_file")"
+            done
+        else
+            echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs"
         fi
-    done
+    fi
 
     echo "All result files processed"
     # Use sync scancel to ensure nfs file handle is released in time
@@ -146,6 +167,9 @@ PY
         echo "Logs copied to $ARTIFACT_DIR for artifact upload"
     fi
 
+    # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup
+    sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
+
 else
 
     export HF_HUB_CACHE_MOUNT="/var/lib/hf-hub-cache/"
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 1c2f6429b..18917447e 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -10,7 +10,8 @@
 from summarize import (
     load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION,
     TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF,
-    SPEC_DECODING
+    SPEC_DECODING, PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS,
+    DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS
 )
 
 
@@ -160,19 +161,67 @@ def se(x: Any) -> str:
         return ''
 
 
+def as_int(x: Any, default: int = 0) -> int:
+    """Coerce a metadata field to int; return ``default`` if ``int(x)`` raises."""
+    try:
+        return int(x)
+    except Exception:
+        return default
+
+
+def as_bool(x: Any, default: bool = False) -> bool:
+    """Parse a metadata boolean stored as bool/int/string; ``None`` yields ``default``."""
+    if x is None:
+        return default
+    if isinstance(x, (bool, int)):
+        return bool(x)  # ints: any nonzero value counts as True
+    return str(x).strip().lower() in ('true', '1')
+
+
 def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
     """Build a result row from metadata and extracted metrics."""
+    is_multinode = as_bool(meta.get('is_multinode'), False)
+    prefill_tp = as_int(meta.get('prefill_tp', meta.get('tp', 1)), 1)
+    prefill_ep = as_int(meta.get('prefill_ep', meta.get('ep', 1)), 1)
+    prefill_num_workers = as_int(meta.get('prefill_num_workers', 1), 1)
+    decode_tp = as_int(meta.get('decode_tp', meta.get('tp', 1)), 1)
+    decode_ep = as_int(meta.get('decode_ep', meta.get('ep', 1)), 1)
+    decode_num_workers = as_int(meta.get('decode_num_workers', 1), 1)
+    prefill_dp_attention = meta.get('prefill_dp_attention')
+    decode_dp_attention = meta.get('decode_dp_attention')
+    dp_attention = meta.get('dp_attention', 'none')
+
+    if prefill_dp_attention is None:
+        prefill_dp_attention = dp_attention
+    if decode_dp_attention is None:
+        decode_dp_attention = dp_attention
+
+    if is_multinode:
+        if prefill_dp_attention == decode_dp_attention:
+            dp_attention = prefill_dp_attention
+        else:
+            dp_attention = f"prefill={str(prefill_dp_attention).lower()},decode={str(decode_dp_attention).lower()}"
+
     row = {
+        'is_multinode': is_multinode,
         'model_prefix': meta.get('infmax_model_prefix', 'unknown'),
         'model': m.get('model') or meta.get('model', 'unknown'),
         'hw': meta.get('hw', 'unknown').upper(),
         'framework': meta.get('framework', 'unknown').lower(),
         'precision': meta.get('precision', 'unknown').lower(),
         'spec_decoding': meta.get('spec_decoding', 'unknown'),
-        'tp': int(meta.get('tp', 1)),
-        'ep': int(meta.get('ep', 1)),
-        'conc': int(meta.get('conc', 0)),
-        'dp_attention': str(meta.get('dp_attention', "none")).lower(),
+        'tp': as_int(meta.get('tp', prefill_tp), prefill_tp),
+        'ep': as_int(meta.get('ep', prefill_ep), prefill_ep),
+        'prefill_tp': prefill_tp,
+        'prefill_ep': prefill_ep,
+        'prefill_num_workers': prefill_num_workers,
+        'decode_tp': decode_tp,
+        'decode_ep': decode_ep,
+        'decode_num_workers': decode_num_workers,
+        'conc': as_int(meta.get('conc', 0), 0),
+        'dp_attention': str(dp_attention).lower(),
+        'prefill_dp_attention': str(prefill_dp_attention).lower(),
+        'decode_dp_attention': str(decode_dp_attention).lower(),
         'task': m.get('task', 'unknown'),
         'em_strict': m.get('strict'),
         'em_strict_se': m.get('strict_se'),
@@ -226,49 +275,111 @@ def main():
             row = build_row(meta, m)
             rows.append(row)
 
+    single_node_rows = [r for r in rows if not r['is_multinode']]
+    multinode_rows = [r for r in rows if r['is_multinode']]
+
     # Sort for stable output (default: by model_prefix)
     sort_by = sys.argv[3] if len(sys.argv) > 3 else 'model_prefix'
-    if sort_by == 'hw':
-        rows.sort(key=lambda r: (
-            r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc']
+    single_node_sort_key = (
+        (lambda r: (
+            r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''),
+            r['tp'], r['ep'], r['conc'],
         ))
-    else:
-        rows.sort(key=lambda r: (
-            r['model_prefix'], r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc']
+        if sort_by == 'hw'
+        else (lambda r: (
+            r['model_prefix'], r['hw'], r['framework'], r['precision'],
+            r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'],
+        ))
+    )
+    multinode_sort_key = (
+        (lambda r: (
+            r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''),
+            r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'],
+            r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'],
+        ))
+        if sort_by == 'hw'
+        else (lambda r: (
+            r['model_prefix'], r['hw'], r['framework'], r['precision'],
+            r.get('spec_decoding', ''),
+            r['prefill_tp'], r['prefill_ep'], r['prefill_num_workers'],
+            r['decode_tp'], r['decode_ep'], r['decode_num_workers'], r['conc'],
         ))
+    )
+    single_node_rows.sort(key=single_node_sort_key)
+    multinode_rows.sort(key=multinode_sort_key)
 
     if not rows:
         print('> No eval results found to summarize.')
     else:
         # Print table using tabulate
         MODEL_PREFIX = "Model Prefix"
-        headers = [
-            MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION,
-            TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL
-        ]
-
-        table_rows = [
-            [
-                r['model_prefix'],
-                r['hw'],
-                r['framework'].upper(),
-                r['precision'].upper(),
-                r['spec_decoding'],
-                r['tp'],
-                r['ep'],
-                r['conc'],
-                r['dp_attention'],
-                r['task'],
-                f"{pct(r['score'])}{se(r['score_se'])}",
-                f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
-                f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
-                r['n_eff'] or '',
-                r['model']
+
+        if single_node_rows:
+            headers = [
+                MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING,
+                TP, EP, CONC, DP_ATTENTION,
+                TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL,
+            ]
+            table_rows = [
+                [
+                    r['model_prefix'],
+                    r['hw'],
+                    r['framework'].upper(),
+                    r['precision'].upper(),
+                    r['spec_decoding'],
+                    r['tp'],
+                    r['ep'],
+                    r['conc'],
+                    r['dp_attention'],
+                    r['task'],
+                    f"{pct(r['score'])}{se(r['score_se'])}",
+                    f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
+                    f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
+                    r['n_eff'] or '',
+                    r['model'],
+                ]
+                for r in single_node_rows
+            ]
+            print("### Single-Node Eval Results\n")
+            print(tabulate(table_rows, headers=headers, tablefmt="github"))
+
+        if multinode_rows:
+            headers = [
+                MODEL_PREFIX, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING,
+                PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS,
+                DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS,
+                CONC, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, MODEL,
+            ]
+            table_rows = [
+                [
+                    r['model_prefix'],
+                    r['hw'],
+                    r['framework'].upper(),
+                    r['precision'].upper(),
+                    r['spec_decoding'],
+                    r['prefill_tp'],
+                    r['prefill_ep'],
+                    r['prefill_dp_attention'],
+                    r['prefill_num_workers'],
+                    r['decode_tp'],
+                    r['decode_ep'],
+                    r['decode_dp_attention'],
+                    r['decode_num_workers'],
+                    r['conc'],
+                    r['task'],
+                    f"{pct(r['score'])}{se(r['score_se'])}",
+                    f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
+                    f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
+                    r['n_eff'] or '',
+                    r['model'],
+                ]
+                for r in multinode_rows
             ]
-            for r in rows
-        ]
+            if single_node_rows:
+                print("\n")
+            print("### Multi-Node Eval Results\n")
+            print(tabulate(table_rows, headers=headers, tablefmt="github"))
 
-        print(tabulate(table_rows, headers=headers, tablefmt="github"))
 
     # Write JSON aggregate
     out_path = Path(f'agg_eval_{exp_name}.json')
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index e32d6d988..f729d5f24 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -6,21 +6,54 @@ Quick graded QnA which measures model performance. Examples of test suites:
 - **gpqa**: Graduate level, Google-Proof multiple choice questions
 
 ## When?
-At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py`
+Evals run as **separate workflow jobs** from throughput benchmarks. The selection logic is in `mark_eval_entries()` of `utils/matrix_logic/generate_sweep_configs.py`.
+
+**Single-node**: At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k.
+
+**Multi-node**: One entry per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) with the highest max concurrency, only for 8k1k.
 
 ## Why?
-To verify how model outputs are affected by throughput optimizations. 
+To verify how model outputs are affected by throughput optimizations.
 - TP/Conc might affect model outputs
 - Check kernel implementations for correctness
 - If there was a tradeoff in accuracy for performance
 
 ## How?
-- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
+`run_eval` in `benchmarks/benchmark_lib.sh` runs EleutherAI/lm-evaluation-harness against the server's OpenAI-compatible endpoint. Concurrency is set via `EVAL_CONCURRENT_REQUESTS` env var (not a CLI flag). Results are collected by `utils/collect_eval_results.py` and published as a summary table.
+
+### Single-node
+In eval-only mode (`EVAL_ONLY=true`), the benchmark script starts the server with expanded context length (via `compute_eval_context_length`), skips throughput, and runs lm-eval directly. Each framework handles the context expansion differently (`--context-length` for SGLang, `--max_seq_len` for TRT-LLM).
+
+### Multi-node
+Multi-node evals support three hardware paths:
+
+**MI355X (AMD)** — `benchmarks/multi_node/amd_utils/server.sh`
+- Skips `bench.sh` when `EVAL_ONLY=true`
+- Runs lm-eval via `run_eval` against the router on port 30000
+- Concurrency derived from max of `BENCH_MAX_CONCURRENCY` (x-separated values)
+- Eval artifacts copied to `/run_logs/slurm_job-*/eval_results/`
+- `runners/launch_mi355x-amds.sh` skips benchmark result collection when `EVAL_ONLY=true` and uses `find` to locate eval results
+
+**GB200/GB300 (NVIDIA)** — via [srt-slurm fork](https://github.com/Oseltamivir/srt-slurm) (`sa-submission-q1-2026` branch)
+- `do_sweep.py` skips the benchmark stage when `EVAL_ONLY=true`, runs `_run_post_eval()` directly
+- In eval-only mode, uses the full `wait_for_model()` health check (same as benchmark stage) since the benchmark health check was skipped
+- `lm-eval` benchmark runner (`benchmarks/lm_eval.py`) sources InferenceX's `benchmark_lib.sh` from the mounted workspace (`/infmax-workspace`)
+- Eval artifacts written to `/logs/eval_results/` inside the container, collected by launch scripts
+- `runners/launch_gb200-nv.sh` and `launch_gb300-nv.sh` always collect server logs (for debugging) but skip benchmark result collection when `EVAL_ONLY=true`
+- Env vars threaded: `RUN_EVAL`, `EVAL_ONLY`, `FRAMEWORK`, `PRECISION`, `MODEL_PREFIX`, `RUNNER_TYPE`, `RESULT_FILENAME`, `SPEC_DECODING`, `ISL`, `OSL`, `PREFILL_TP/EP/DP_ATTN`, `DECODE_TP/EP/DP_ATTN`, `MODEL_NAME`, `EVAL_CONC`
+
+### Workflow structure
+- `e2e-tests.yml`: `test-sweep-evals` (single-node) and `test-sweep-multi-node-evals` (multi-node)
+- `run-sweep.yml`: `sweep-evals` (single-node) and `sweep-multi-node-evals` (multi-node)
+- Both use their respective benchmark templates with `eval-only: true`, `run-eval: true`
+- `collect-evals` depends on both eval jobs; `collect-results` only runs when benchmark jobs ran
+- `process_changelog.py` splits eval results into `evals` (single-node) and `multinode_evals`
+
+### Score validation
+`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
 
 ## Misc
 Following files are task definitions from lmeval, more info on changes within the files
 - `utils/evals/gsm8k.yaml`
 - `utils/evals/gpqa_diamond.yaml`
 
-
-
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index 2a336c960..9682c1423 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -33,26 +33,34 @@ def seq_len_to_str(isl: int, osl: int) -> str:
     return seq_len_itos.get((isl, osl), f"{isl}_{osl}")
 
 def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
-    """Eval selection policy (single-node only):
-    - Only consider 8k1k (isl=8192, osl=1024).
-    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn):
+    """Eval selection policy:
+    - Single-node: only consider 8k1k (isl=8192, osl=1024).
+      For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn):
         - Mark all entries at the highest CONC (all TPs)
         - Mark all entries at the median CONC (all TPs)
+    - Multi-node: for each unique (model, runner, framework, precision,
+      spec-decoding, prefill-dp-attn, decode-dp-attn), only 8k1k entries.
+      Mark the entry with the highest max concurrency. Sets eval-conc to the
+      median of the conc list to avoid OOM during eval.
     """
     from collections import defaultdict
 
-    # Only run evals on 8k1k
     target_isl, target_osl = seq_len_stoi["8k1k"]
-    # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn).
-    # Only include entries that have a top-level TP (i.e., single-node schema).
-    groups = defaultdict(list)
+    eval_indices = set()
+    mn_eval_conc = {}  # index -> chosen eval concurrency for multinode entries
+
+    def _max_conc(ie):
+        c = ie[1][Fields.CONC.value]
+        return max(c) if isinstance(c, list) else c
+
+    # Single-node: group by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn).
+    # Only 8k1k entries with a top-level TP (single-node schema).
+    sn_groups = defaultdict(list)
     for i, entry in enumerate(matrix_values):
         if Fields.TP.value not in entry:
             continue
-
         if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl:
             continue
-
         key = (
             entry[Fields.MODEL.value],
             entry[Fields.RUNNER.value],
@@ -61,27 +69,53 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
             entry[Fields.ISL.value],
             entry[Fields.OSL.value],
             entry[Fields.SPEC_DECODING.value],
-            entry[Fields.DP_ATTN.value]
+            entry[Fields.DP_ATTN.value],
         )
-        groups[key].append((i, entry))
-
-    # For each group, select entries at highest CONC and median CONC (all TPs)
-    eval_indices = set()
-    for key, entries in groups.items():
-        if not entries:
-            continue
+        sn_groups[key].append((i, entry))
 
+    for entries in sn_groups.values():
         conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries))
         median_conc = conc_values[len(conc_values) // 2]
         target_concs = {conc_values[-1], median_conc}
-
         for i, e in entries:
             if e[Fields.CONC.value] in target_concs:
                 eval_indices.add(i)
 
+    # Multi-node: group by (model, runner, framework, precision, spec-decoding, prefill-dp, decode-dp).
+    # Only 8k1k entries with a prefill key (multi-node schema).
+    # Pick the entry with the highest max concurrency per group.
+    mn_groups = defaultdict(list)
+    for i, entry in enumerate(matrix_values):
+        if Fields.TP.value in entry:
+            continue
+        if Fields.PREFILL.value not in entry:
+            continue
+        if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl:
+            continue
+        key = (
+            entry[Fields.MODEL.value],
+            entry[Fields.RUNNER.value],
+            entry[Fields.FRAMEWORK.value],
+            entry[Fields.PRECISION.value],
+            entry[Fields.SPEC_DECODING.value],
+            entry.get(Fields.PREFILL.value, {}).get(Fields.DP_ATTN.value),
+            entry.get(Fields.DECODE.value, {}).get(Fields.DP_ATTN.value),
+        )
+        mn_groups[key].append((i, entry))
+
+    for entries in mn_groups.values():
+        best_idx, best_entry = max(entries, key=_max_conc)
+        eval_indices.add(best_idx)
+        # Set eval-conc to median of the conc list to avoid OOM during eval
+        conc = best_entry[Fields.CONC.value]
+        sorted_conc = sorted(conc) if isinstance(conc, list) else [conc]
+        mn_eval_conc[best_idx] = sorted_conc[len(sorted_conc) // 2]
+
     # Mark the selected entries
     for i, entry in enumerate(matrix_values):
         entry[Fields.RUN_EVAL.value] = i in eval_indices
+        if i in mn_eval_conc:
+            entry[Fields.EVAL_CONC.value] = mn_eval_conc[i]
 
     return matrix_values
 
@@ -557,9 +591,18 @@ def generate_test_config_sweep(args, all_config_data):
         runner = val[Fields.RUNNER.value]
         disagg = val.get(Fields.DISAGG.value, False)
 
+        # Build seq-len filter if --seq-lens was provided
+        seq_lens_filter = None
+        if getattr(args, 'seq_lens', None):
+            seq_lens_filter = {seq_len_stoi[s] for s in args.seq_lens}
+
         for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]:
             isl = seq_len_config[Fields.ISL.value]
             osl = seq_len_config[Fields.OSL.value]
+
+            if seq_lens_filter and (isl, osl) not in seq_lens_filter:
+                continue
+
             seq_len_str = seq_len_to_str(isl, osl)
 
             for bmk in seq_len_config[Fields.SEARCH_SPACE.value]:
@@ -905,6 +948,13 @@ def main():
         required=False,
         help='Only include these concurrency values. Values must exist in the config conc-range/list.'
     )
+    test_config_keys_parser.add_argument(
+        '--seq-lens',
+        nargs='+',
+        choices=list(seq_len_stoi.keys()),
+        required=False,
+        help='Only include these sequence length configurations (e.g., 1k1k 8k1k)'
+    )
     test_config_keys_parser.add_argument(
         '-h', '--help',
         action='help',
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index 697d97de6..62a92c5ed 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -53,6 +53,7 @@ class Fields(Enum):
 
     # Eval
     RUN_EVAL = 'run-eval'
+    EVAL_CONC = 'eval-conc'
 
 
 """
@@ -126,6 +127,7 @@ class MultiNodeMatrixEntry(BaseModel):
     exp_name: str = Field(alias=Fields.EXP_NAME.value)
     disagg: bool
     run_eval: bool = Field(alias=Fields.RUN_EVAL.value)
+    eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value)
 
 
 def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict:
@@ -361,6 +363,7 @@ class ChangelogMatrixEntry(BaseModel):
     multi_node: dict[str, list[MultiNodeMatrixEntry]
                      ] = Field(default_factory=dict)
     evals: list[SingleNodeMatrixEntry] = Field(default_factory=list)
+    multinode_evals: list[MultiNodeMatrixEntry] = Field(default_factory=list)
     changelog_metadata: ChangelogMetadata
 
 
diff --git a/utils/process_changelog.py b/utils/process_changelog.py
index 7da19d030..9d231ad3c 100644
--- a/utils/process_changelog.py
+++ b/utils/process_changelog.py
@@ -82,6 +82,7 @@ def main():
         "single_node": defaultdict(list),
         "multi_node": defaultdict(list),
         "evals": [],
+        "multinode_evals": [],
         "changelog_metadata": {
             "base_ref": args.base_ref,
             "head_ref": args.head_ref,
@@ -163,7 +164,8 @@ def main():
         else:
             final_results["single_node"][seq_len_str].append(result)
 
-    final_results["evals"] = all_eval_results
+    final_results["evals"] = [e for e in all_eval_results if "prefill" not in e or e.get("prefill") is None]
+    final_results["multinode_evals"] = [e for e in all_eval_results if "prefill" in e and e.get("prefill") is not None]
 
     # Validate final results structure
     validated = ChangelogMatrixEntry.model_validate(final_results)