diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e84fc0da5..adacf0203 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } +dsr1-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x model: deepseek-ai/DeepSeek-R1-0528 @@ -1231,3 +1249,347 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp8-mi325x-sglang-disagg: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "none" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: 
+ # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X + # or if only the multi-node distributed init is broken with Broadcom Thor 2) + - spec-decoding: "none" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 
16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp8-mi325x-sglang-disagg-mtp: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + 
num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init) + - spec-decoding: "mtp" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + 
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 1251e459d..f61e81e36 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -75,6 +75,11 @@ mi325x: - 'mi325x-amd_1' - 'mi325x-amd_2' - 'mi325x-amd_3' +mi325x-disagg: +- 'mi325x-amd_0' +- 'mi325x-amd_1' +- 'mi325x-amd_2' +- 'mi325x-amd_3' mi355x: - 'mi355x-amds_0' - 'mi355x-amds_1' diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..99f2d0238 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 elif [[ $NODENAME == mia1* ]]; then export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it + export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8 else echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 exit 1 @@ -101,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." 
fi @@ -114,6 +122,11 @@ else elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." echo " This is normal for clusters without QoS or outside Docker containers." diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b0352f24..523bfd7c5 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml -if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" +# MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528). +# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/). +_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" + +# Validate the yaml key exists as a top-level key in models.yaml +if ! 
grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then + echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi -echo "Model found: $MODEL_NAME" +echo "Model found: $_MODEL_YAML_KEY" # All models use server.sh as the entrypoint RUN_FILE="server.sh" @@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}" echo "REPO Directory is ${DI_REPO_DIR}" echo "USER_NAME is ${USER_NAME}" -# Get the RDMA priority and DSCP value from the NIC +# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully) if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 - exit 1 + echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2 fi # Reduce log spam @@ -296,8 +299,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } @@ -318,6 +321,54 @@ srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Optional: Pre-stage model to local NVMe for faster loading +# ============================================================================= +# LOCAL_MODEL_CACHE_DIR: mount point for fast local storage (NVMe/SSD) on compute nodes. +# Set per-cluster via the runner/launch script. When set, model weights are rsync'd +# from shared storage to local NVMe before Docker starts. 
This is idempotent — +# subsequent runs skip files already cached locally. +# +# If unset or the local path doesn't exist, the model is served directly from +# shared storage (NFS/Lustre) as before. +if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then + LOCAL_MODEL_FULL="${LOCAL_MODEL_CACHE_DIR}/${MODEL_NAME}" + echo "[cache] Pre-staging model to local NVMe on all nodes..." + echo "[cache] Source: $MODEL_PATH" + echo "[cache] Dest: $LOCAL_MODEL_FULL" + + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + set -euo pipefail + SRC="'"$MODEL_PATH"'" + DST="'"$LOCAL_MODEL_FULL"'" + CACHE_DIR="'"${LOCAL_MODEL_CACHE_DIR}"'" + + # Create destination directory + sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR" + sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true + + echo "[cache] $(hostname): Syncing model to local NVMe..." + START=$(date +%s) + + rclone sync "$SRC/" "$DST/" \ + --transfers 32 \ + --checkers 32 \ + --links \ + --progress + + ELAPSED=$(( $(date +%s) - START )) + SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) + echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)" + ' 2>&1 + + if [[ $? -eq 0 ]]; then + echo "[cache] Model pre-staged successfully. Updating MODEL_DIR." + MODEL_DIR="${LOCAL_MODEL_CACHE_DIR}" + else + echo "[cache] WARNING: Local caching failed on some nodes. Falling back to shared storage." 
+ fi +fi + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -357,7 +408,7 @@ exec sudo docker run --rm \ --privileged \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ + $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ @@ -373,6 +424,7 @@ exec sudo docker run --rm \ -e xP=\$xP \ -e yD=\$yD \ -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \ -e IPADDRS=\$IPADDRS \ -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..960cbb6e7 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -72,11 +72,12 @@ fi # Load model config via inline Python (PyYAML is available in SGLang containers) # Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") # is done here in Python to avoid bash glob-expanding the * characters. +_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" eval "$(python3 -c " import yaml, sys, os config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' +model_name = '${_MODEL_YAML_KEY}' with open(config_path) as f: models = yaml.safe_load(f) @@ -212,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi +# DP attention forces chunked_prefill_size to 1024 inside SGLang, which must be +# <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Bump the decode dispatch +# token limit when DP is enabled to satisfy this assertion. 
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=1024 +fi + # ============================================================================= # Cluster Topology Configuration # ============================================================================= diff --git a/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh new file mode 100755 index 000000000..6a7314ab4 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers 
delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index ae1e930f0..dc594a854 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -26,6 +26,14 @@ hf download $MODEL export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 +# MTP (speculative decoding) flags +MTP_ARGS="" +CHAT_TEMPLATE_ARGS="" +if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then + MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 3 --speculative-num-draft-tokens 4" + CHAT_TEMPLATE_ARGS="--use-chat-template" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -47,7 +55,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +$MTP_ARGS $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -64,7 +72,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + $CHAT_TEMPLATE_ARGS # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 967edc19c..d059c439b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,18 @@ +- config-keys: + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi325x-sglang-mtp + - dsr1-fp8-mi325x-sglang-disagg + - dsr1-fp8-mi325x-sglang-disagg-mtp + description: + - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA" + - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache" + - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" + - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x" + - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k" + - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 + - config-keys: - kimik2.5-int4-mi300x-vllm description: diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..6ac64f3d8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -3,38 +3,198 @@ export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" export PORT=8888 -PARTITION="compute" -SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" - -set -x +# Local NVMe cache for model weights (set to empty to disable) +# MI325X nodes have 8x 3.5TB NVMe drives; /local-nvme must be set up +# via: sudo bash utils/setup_local_nvme.sh /local-nvme 
+export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}" -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +PARTITION="compute" -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" +# Detect benchmark subdir from where the script lives. +# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh); +# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh). +SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" +SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh" +if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then + BENCHMARK_SUBDIR="multi_node" + SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}" +elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then + BENCHMARK_SUBDIR="single_node" + SCRIPT_NAME="${SCRIPT_NAME_BASE}" +else + echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found" exit 1 fi -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE +# ============================================================================= +# Multi-node disaggregated path: sbatch + Docker via submit.sh +# ============================================================================= +if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then + + scancel_sync() { + local jobid=$1 + local timeout=${2:-600} + local interval=10 + local start + start=$(date +%s) + + echo "[scancel_sync] Requesting cancel of job $jobid" + 
scancel "$jobid" || true + + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do + local now + now=$(date +%s) + if (( now - start >= timeout )); then + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" + return 1 + fi + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." + sleep "$interval" + done + echo "[scancel_sync] job $jobid exited" + return 0 + } + + set -x + + export SLURM_ACCOUNT="$USER" + export SLURM_PARTITION="$PARTITION" + export SLURM_JOB_NAME="benchmark-sglang-disagg.job" + + export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}" + + # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup. + if [[ -z "${MODEL_YAML_KEY:-}" ]]; then + export MODEL_YAML_KEY="${MODEL##*/}" + fi + + # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container. + # Auto-resolved from HF hub cache layout so no symlink is needed. + if [[ -z "${MODEL_NAME:-}" ]]; then + _HF_DIR="models--$(echo "${MODEL}" | tr '/' '--')" + _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) + if [[ -n "${_SNAPSHOT}" ]]; then + export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" + else + export MODEL_NAME="${MODEL_YAML_KEY}" + fi + fi + + export GPUS_PER_NODE=8 + + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + mkdir -p "$BENCHMARK_LOGS_DIR" + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + + sleep 10 + + while ! ls "$LOG_FILE" &>/dev/null; do + if ! 
squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + sleep 5 + done + + set +x + + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + POLL_PID=$! + + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + + wait $POLL_PID + + set -x + + cat > collect_latest_results.py <<'PY' +import os, sys +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + for result_file in $(find $LOGS_DIR -type f); do + file_name=$(basename $result_file) + if [ -f $result_file ]; then + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + + echo "All result files processed" + set +x + scancel_sync $JOB_ID + set -x + echo "Canceled the slurm job $JOB_ID" + + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$ARTIFACT_DIR" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true + echo "Logs copied to $ARTIFACT_DIR for artifact upload" fi -" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh - -scancel $JOB_ID + +# ============================================================================= +# Single-node path: enroot via salloc + srun +# ============================================================================= +else + + SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + + JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 + fi + + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f 
\"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-remap-root \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash benchmarks/single_node/${SCRIPT_NAME} + + scancel $JOB_ID + +fi diff --git a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh new file mode 100755 index 000000000..30ec87d6a --- /dev/null +++ b/scripts/manual-test-mi325x.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "${BASH_SOURCE[0]}")/.." + +export GITHUB_WORKSPACE=$(pwd) +export RUNNER_NAME=mi325x-amd-manual + +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export EXP_NAME=dsr1_1k1k +export PRECISION=fp8 +export FRAMEWORK=sglang-disagg + +export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + +export ISL=1024 +export OSL=1024 +export CONC_LIST="1024 512 256 128 64 32 16 8 4 2 1" +export SPEC_DECODING=none +export RANDOM_RANGE_RATIO=1 + +export PREFILL_NODES=1 +export PREFILL_NUM_WORKERS=1 +export PREFILL_TP=4 +export PREFILL_EP=1 +export PREFILL_DP_ATTN=false + +export DECODE_NODES=1 +export DECODE_NUM_WORKERS=1 +export DECODE_TP=8 +export DECODE_EP=1 +export DECODE_DP_ATTN=false + +bash runners/launch_mi325x-amd.sh + +#model files are here: +#/nfsdata/sa/gharunner/gharunners/hf-hub-cache/models--deepseek-ai--DeepSeek-R1-0528 \ No newline at end of file diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh new file mode 100755 index 000000000..0b1480231 --- /dev/null +++ b/utils/cache_model_locally.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe. 
+# +# Syncs a model directory from NFS/shared storage to fast local NVMe before +# the inference server starts, using rclone for high-parallelism transfers. +# +# Usage: +# source utils/cache_model_locally.sh +# cache_model_locally "/nfs/hub/models--org--repo" "/local-nvme/hub/models--org--repo" +# +# Or as a standalone script: +# bash utils/cache_model_locally.sh /nfs/hub/models--org--repo /local-nvme/hub/models--org--repo +# +# Features: +# - Uses rclone sync with 32 parallel transfers for maximum throughput +# - Preserves HuggingFace cache symlink structure (--links) +# - Idempotent: rclone skips files already present and identical +# - Works with both HF hub cache layout and flat model directories +# +# Environment variables: +# CACHE_TRANSFERS — number of parallel rclone transfers (default: 32) +# CACHE_CHECKERS — number of parallel rclone checkers (default: 32) +# CACHE_DRY_RUN — set to 1 to print what would be synced without copying + +set -euo pipefail + +CACHE_TRANSFERS="${CACHE_TRANSFERS:-32}" +CACHE_CHECKERS="${CACHE_CHECKERS:-32}" +CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}" + +cache_model_locally() { + local src="${1:?Usage: cache_model_locally }" + local dst="${2:?Usage: cache_model_locally }" + + if [[ ! -d "$src" ]]; then + echo "[cache] ERROR: Source path does not exist: $src" >&2 + return 1 + fi + + echo "[cache] Syncing model to local storage..." 
+  echo "[cache]   Source: $src"
+  echo "[cache]   Dest:   $dst"
+  echo "[cache]   Transfers: $CACHE_TRANSFERS, Checkers: $CACHE_CHECKERS"
+
+  mkdir -p "$dst"
+
+  local start_time
+  start_time=$(date +%s)
+
+  local rclone_opts=(--transfers "$CACHE_TRANSFERS" --checkers "$CACHE_CHECKERS" --links --progress)
+  if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then
+    rclone_opts+=(--dry-run)
+  fi
+
+  rclone sync "$src/" "$dst/" "${rclone_opts[@]}"
+
+  local elapsed=$(( $(date +%s) - start_time ))
+  local size
+  size=$(du -sh "$dst" 2>/dev/null | cut -f1)
+
+  echo "[cache] Done in ${elapsed}s — $size cached at $dst"
+  echo "$dst"
+  return 0
+}
+
+# If run as a standalone script (not sourced), execute with args
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <src> <dst>" >&2
+    echo "  Env: CACHE_TRANSFERS=$CACHE_TRANSFERS CACHE_CHECKERS=$CACHE_CHECKERS" >&2
+    exit 1
+  fi
+  cache_model_locally "$1" "$2"
+fi
diff --git a/utils/setup_local_nvme.sh b/utils/setup_local_nvme.sh
new file mode 100755
index 000000000..03b81e8a4
--- /dev/null
+++ b/utils/setup_local_nvme.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# setup_local_nvme.sh — Format and mount local NVMe drives for model caching.
+#
+# Detects unformatted/unmounted NVMe drives and sets up a mount point for
+# caching model weights locally. Designed to be run once per node (idempotent).
+#
+# Usage (run on each compute node, requires root):
+#   sudo bash utils/setup_local_nvme.sh [mount_point]
+#
+# Default mount point: /local-nvme
+#
+# This script:
+#   1. Finds the first available NVMe drive that is not the boot device
+#   2. Formats it with ext4 if not already formatted
+#   3. Mounts it at the specified mount point
+#   4. Adds an fstab entry for persistence across reboots
+#
+# For RAID-0 across multiple NVMe drives (maximum throughput), use:
+#   sudo bash utils/setup_local_nvme.sh --raid [mount_point]
+
+set -euo pipefail
+
+USE_RAID=false
+MOUNT_POINT="/local-nvme"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --raid) USE_RAID=true; shift ;;
+    *) MOUNT_POINT="$1"; shift ;;
+  esac
+done
+
+if [[ $EUID -ne 0 ]]; then
+  echo "ERROR: This script must be run as root (sudo)" >&2
+  exit 1
+fi
+
+echo "[nvme-setup] Mount point: $MOUNT_POINT"
+
+# Already mounted?
+if mountpoint -q "$MOUNT_POINT" 2>/dev/null; then
+  echo "[nvme-setup] $MOUNT_POINT is already mounted:"
+  df -h "$MOUNT_POINT"
+  exit 0
+fi
+
+# Find NVMe drives that are not part of the root filesystem
+ROOT_DEV=$(findmnt -n -o SOURCE / | sed 's/[0-9]*$//' | sed 's/p$//')
+NVME_DRIVES=()
+for dev in /dev/nvme*n1; do
+  [[ -b "$dev" ]] || continue
+  # Skip if this drive is part of root
+  if [[ "$dev" == "$ROOT_DEV"* ]]; then
+    echo "[nvme-setup] Skipping $dev (root device)"
+    continue
+  fi
+  # Skip if already mounted
+  if mount | grep -q "^$dev "; then
+    echo "[nvme-setup] Skipping $dev (already mounted)"
+    continue
+  fi
+  # Skip if part of an md array
+  if grep -q "$(basename "$dev")" /proc/mdstat 2>/dev/null; then
+    echo "[nvme-setup] Skipping $dev (part of md array)"
+    continue
+  fi
+  NVME_DRIVES+=("$dev")
+done
+
+if [[ ${#NVME_DRIVES[@]} -eq 0 ]]; then
+  echo "[nvme-setup] No available NVMe drives found."
+  exit 1
+fi
+
+echo "[nvme-setup] Found ${#NVME_DRIVES[@]} available NVMe drives: ${NVME_DRIVES[*]}"
+
+if [[ "$USE_RAID" == true ]] && [[ ${#NVME_DRIVES[@]} -gt 1 ]]; then
+  # RAID-0 for maximum throughput
+  MD_DEV="/dev/md10"
+  echo "[nvme-setup] Creating RAID-0 array across ${#NVME_DRIVES[@]} drives..."
+
+  if [[ -b "$MD_DEV" ]]; then
+    echo "[nvme-setup] $MD_DEV already exists, using it"
+  else
+    mdadm --create "$MD_DEV" --level=0 --raid-devices=${#NVME_DRIVES[@]} "${NVME_DRIVES[@]}" --run
+  fi
+
+  TARGET_DEV="$MD_DEV"
+else
+  # Single drive (use the first available)
+  TARGET_DEV="${NVME_DRIVES[0]}"
+  echo "[nvme-setup] Using single drive: $TARGET_DEV"
+fi
+
+# Format if needed
+if ! blkid "$TARGET_DEV" | grep -q 'TYPE="ext4"'; then
+  echo "[nvme-setup] Formatting $TARGET_DEV with ext4..."
+  mkfs.ext4 -F -L local-nvme "$TARGET_DEV"
+else
+  echo "[nvme-setup] $TARGET_DEV already has ext4 filesystem"
+fi
+
+# Mount
+mkdir -p "$MOUNT_POINT"
+mount -o noatime,discard "$TARGET_DEV" "$MOUNT_POINT"
+
+# Set permissions so non-root users can write
+chmod 1777 "$MOUNT_POINT"
+
+# Add fstab entry if not present
+if ! grep -qE "[[:space:]]$MOUNT_POINT[[:space:]]" /etc/fstab; then
+  UUID=$(blkid -s UUID -o value "$TARGET_DEV")
+  echo "UUID=$UUID $MOUNT_POINT ext4 noatime,discard,nofail 0 2" >> /etc/fstab
+  echo "[nvme-setup] Added fstab entry"
+fi
+
+echo "[nvme-setup] Done:"
+df -h "$MOUNT_POINT"