diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e84fc0da5..adacf0203 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } +dsr1-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x model: deepseek-ai/DeepSeek-R1-0528 @@ -1231,3 +1249,347 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp8-mi325x-sglang-disagg: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "none" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: 
+ # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X + # or if only the multi-node distributed init is broken with Broadcom Thor 2) + - spec-decoding: "none" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 
16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp8-mi325x-sglang-disagg-mtp: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + 
num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init) + - spec-decoding: "mtp" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + 
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 1251e459d..f61e81e36 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -75,6 +75,11 @@ mi325x: - 'mi325x-amd_1' - 'mi325x-amd_2' - 'mi325x-amd_3' +mi325x-disagg: +- 'mi325x-amd_0' +- 'mi325x-amd_1' +- 'mi325x-amd_2' +- 'mi325x-amd_3' mi355x: - 'mi355x-amds_0' - 'mi355x-amds_1' diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..99f2d0238 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 elif [[ $NODENAME == mia1* ]]; then export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it + export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8 else echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 exit 1 @@ -101,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." 
fi @@ -114,6 +122,11 @@ else elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." echo " This is normal for clusters without QoS or outside Docker containers." diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b0352f24..523bfd7c5 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml -if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" +# MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528). +# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/). +_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" + +# Validate the yaml key exists as a top-level key in models.yaml +if ! 
grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then + echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi -echo "Model found: $MODEL_NAME" +echo "Model found: $_MODEL_YAML_KEY" # All models use server.sh as the entrypoint RUN_FILE="server.sh" @@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}" echo "REPO Directory is ${DI_REPO_DIR}" echo "USER_NAME is ${USER_NAME}" -# Get the RDMA priority and DSCP value from the NIC +# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully) if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 - exit 1 + echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2 fi # Reduce log spam @@ -296,8 +299,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } @@ -318,6 +321,54 @@ srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Optional: Pre-stage model to local NVMe for faster loading +# ============================================================================= +# LOCAL_MODEL_CACHE_DIR: mount point for fast local storage (NVMe/SSD) on compute nodes. +# Set per-cluster via the runner/launch script. When set, model weights are rsync'd +# from shared storage to local NVMe before Docker starts. 
This is idempotent — +# subsequent runs skip files already cached locally. +# +# If unset or the local path doesn't exist, the model is served directly from +# shared storage (NFS/Lustre) as before. +if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then + LOCAL_MODEL_FULL="${LOCAL_MODEL_CACHE_DIR}/${MODEL_NAME}" + echo "[cache] Pre-staging model to local NVMe on all nodes..." + echo "[cache] Source: $MODEL_PATH" + echo "[cache] Dest: $LOCAL_MODEL_FULL" + + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + set -euo pipefail + SRC="'"$MODEL_PATH"'" + DST="'"$LOCAL_MODEL_FULL"'" + CACHE_DIR="'"${LOCAL_MODEL_CACHE_DIR}"'" + + # Create destination directory + sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR" + sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true + + echo "[cache] $(hostname): Syncing model to local NVMe..." + START=$(date +%s) + + rclone sync "$SRC/" "$DST/" \ + --transfers 32 \ + --checkers 32 \ + --links \ + --progress + + ELAPSED=$(( $(date +%s) - START )) + SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) + echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)" + ' 2>&1 + + if [[ $? -eq 0 ]]; then + echo "[cache] Model pre-staged successfully. Updating MODEL_DIR." + MODEL_DIR="${LOCAL_MODEL_CACHE_DIR}" + else + echo "[cache] WARNING: Local caching failed on some nodes. Falling back to shared storage." 
+ fi +fi + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -357,7 +408,7 @@ exec sudo docker run --rm \ --privileged \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ + $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ @@ -373,6 +424,7 @@ exec sudo docker run --rm \ -e xP=\$xP \ -e yD=\$yD \ -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \ -e IPADDRS=\$IPADDRS \ -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..960cbb6e7 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -72,11 +72,12 @@ fi # Load model config via inline Python (PyYAML is available in SGLang containers) # Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") # is done here in Python to avoid bash glob-expanding the * characters. +_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" eval "$(python3 -c " import yaml, sys, os config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' +model_name = '${_MODEL_YAML_KEY}' with open(config_path) as f: models = yaml.safe_load(f) @@ -212,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi +# DP attention forces chunked_prefill_size to 1024 inside SGLang, which must be +# <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Bump the decode dispatch +# token limit when DP is enabled to satisfy this assertion. 
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=1024 +fi + # ============================================================================= # Cluster Topology Configuration # ============================================================================= diff --git a/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh new file mode 100755 index 000000000..6a7314ab4 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers 
delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index ae1e930f0..dc594a854 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -26,6 +26,14 @@ hf download $MODEL export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 +# MTP (speculative decoding) flags +MTP_ARGS="" +CHAT_TEMPLATE_ARGS="" +if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then + MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 3 --speculative-num-draft-tokens 4" + CHAT_TEMPLATE_ARGS="--use-chat-template" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -47,7 +55,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +$MTP_ARGS $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -64,7 +72,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + $CHAT_TEMPLATE_ARGS # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 967edc19c..d059c439b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,18 @@ +- config-keys: + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi325x-sglang-mtp + - dsr1-fp8-mi325x-sglang-disagg + - dsr1-fp8-mi325x-sglang-disagg-mtp + description: + - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA" + - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache" + - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" + - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x" + - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k" + - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 + - config-keys: - kimik2.5-int4-mi300x-vllm description: diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..6ac64f3d8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -3,38 +3,198 @@ export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" export PORT=8888 -PARTITION="compute" -SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" - -set -x +# Local NVMe cache for model weights (set to empty to disable) +# MI325X nodes have 8x 3.5TB NVMe drives; /local-nvme must be set up +# via: sudo bash utils/setup_local_nvme.sh /local-nvme 
+export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}" -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +PARTITION="compute" -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" +# Detect benchmark subdir from where the script lives. +# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh); +# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh). +SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" +SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh" +if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then + BENCHMARK_SUBDIR="multi_node" + SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}" +elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then + BENCHMARK_SUBDIR="single_node" + SCRIPT_NAME="${SCRIPT_NAME_BASE}" +else + echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found" exit 1 fi -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE +# ============================================================================= +# Multi-node disaggregated path: sbatch + Docker via submit.sh +# ============================================================================= +if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then + + scancel_sync() { + local jobid=$1 + local timeout=${2:-600} + local interval=10 + local start + start=$(date +%s) + + echo "[scancel_sync] Requesting cancel of job $jobid" + 
scancel "$jobid" || true + + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do + local now + now=$(date +%s) + if (( now - start >= timeout )); then + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" + return 1 + fi + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." + sleep "$interval" + done + echo "[scancel_sync] job $jobid exited" + return 0 + } + + set -x + + export SLURM_ACCOUNT="$USER" + export SLURM_PARTITION="$PARTITION" + export SLURM_JOB_NAME="benchmark-sglang-disagg.job" + + export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}" + + # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup. + if [[ -z "${MODEL_YAML_KEY:-}" ]]; then + export MODEL_YAML_KEY="${MODEL##*/}" + fi + + # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container. + # Auto-resolved from HF hub cache layout so no symlink is needed. + if [[ -z "${MODEL_NAME:-}" ]]; then + _HF_DIR="models--$(echo "${MODEL}" | tr '/' '--')" + _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) + if [[ -n "${_SNAPSHOT}" ]]; then + export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" + else + export MODEL_NAME="${MODEL_YAML_KEY}" + fi + fi + + export GPUS_PER_NODE=8 + + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + mkdir -p "$BENCHMARK_LOGS_DIR" + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + + sleep 10 + + while ! ls "$LOG_FILE" &>/dev/null; do + if ! 
squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + sleep 5 + done + + set +x + + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + POLL_PID=$! + + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + + wait $POLL_PID + + set -x + + cat > collect_latest_results.py <<'PY' +import os, sys +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + for result_file in $(find $LOGS_DIR -type f); do + file_name=$(basename $result_file) + if [ -f $result_file ]; then + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + + echo "All result files processed" + set +x + scancel_sync $JOB_ID + set -x + echo "Canceled the slurm job $JOB_ID" + + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$ARTIFACT_DIR" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true + echo "Logs copied to $ARTIFACT_DIR for artifact upload" fi -" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh - -scancel $JOB_ID + +# ============================================================================= +# Single-node path: enroot via salloc + srun +# ============================================================================= +else + + SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + + JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 + fi + + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f 
\"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-remap-root \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash benchmarks/single_node/${SCRIPT_NAME} + + scancel $JOB_ID + +fi diff --git a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh new file mode 100755 index 000000000..30ec87d6a --- /dev/null +++ b/scripts/manual-test-mi325x.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "${BASH_SOURCE[0]}")/.." + +export GITHUB_WORKSPACE=$(pwd) +export RUNNER_NAME=mi325x-amd-manual + +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export EXP_NAME=dsr1_1k1k +export PRECISION=fp8 +export FRAMEWORK=sglang-disagg + +export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + +export ISL=1024 +export OSL=1024 +export CONC_LIST="1024 512 256 128 64 32 16 8 4 2 1" +export SPEC_DECODING=none +export RANDOM_RANGE_RATIO=1 + +export PREFILL_NODES=1 +export PREFILL_NUM_WORKERS=1 +export PREFILL_TP=4 +export PREFILL_EP=1 +export PREFILL_DP_ATTN=false + +export DECODE_NODES=1 +export DECODE_NUM_WORKERS=1 +export DECODE_TP=8 +export DECODE_EP=1 +export DECODE_DP_ATTN=false + +bash runners/launch_mi325x-amd.sh + +#model files are here: +#/nfsdata/sa/gharunner/gharunners/hf-hub-cache/models--deepseek-ai--DeepSeek-R1-0528 \ No newline at end of file diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh new file mode 100755 index 000000000..0b1480231 --- /dev/null +++ b/utils/cache_model_locally.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe. 
+# +# Syncs a model directory from NFS/shared storage to fast local NVMe before +# the inference server starts, using rclone for high-parallelism transfers. +# +# Usage: +# source utils/cache_model_locally.sh +# cache_model_locally "/nfs/hub/models--org--repo" "/local-nvme/hub/models--org--repo" +# +# Or as a standalone script: +# bash utils/cache_model_locally.sh /nfs/hub/models--org--repo /local-nvme/hub/models--org--repo +# +# Features: +# - Uses rclone sync with 32 parallel transfers for maximum throughput +# - Preserves HuggingFace cache symlink structure (--links) +# - Idempotent: rclone skips files already present and identical +# - Works with both HF hub cache layout and flat model directories +# +# Environment variables: +# CACHE_TRANSFERS — number of parallel rclone transfers (default: 32) +# CACHE_CHECKERS — number of parallel rclone checkers (default: 32) +# CACHE_DRY_RUN — set to 1 to print what would be synced without copying + +set -euo pipefail + +CACHE_TRANSFERS="${CACHE_TRANSFERS:-32}" +CACHE_CHECKERS="${CACHE_CHECKERS:-32}" +CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}" + +cache_model_locally() { + local src="${1:?Usage: cache_model_locally }" + local dst="${2:?Usage: cache_model_locally }" + + if [[ ! -d "$src" ]]; then + echo "[cache] ERROR: Source path does not exist: $src" >&2 + return 1 + fi + + echo "[cache] Syncing model to local storage..." 
+  echo "[cache]   Source: $src"
+  echo "[cache]   Dest:   $dst"
+  echo "[cache]   Transfers: $CACHE_TRANSFERS, Checkers: $CACHE_CHECKERS"
+
+  mkdir -p "$dst"
+
+  local start_time
+  start_time=$(date +%s)
+
+  local rclone_opts=(--transfers "$CACHE_TRANSFERS" --checkers "$CACHE_CHECKERS" --links --progress)
+  if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then
+    rclone_opts+=(--dry-run)
+  fi
+
+  rclone sync "$src/" "$dst/" "${rclone_opts[@]}"
+
+  local elapsed=$(( $(date +%s) - start_time ))
+  local size
+  size=$(du -sh "$dst" 2>/dev/null | cut -f1)
+
+  echo "[cache] Done in ${elapsed}s — $size cached at $dst"
+  echo "$dst"
+  return 0
+}
+
+# If run as a standalone script (not sourced), execute with args
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <src> <dst>" >&2
+    echo "  Env: CACHE_TRANSFERS=$CACHE_TRANSFERS CACHE_CHECKERS=$CACHE_CHECKERS" >&2
+    exit 1
+  fi
+  cache_model_locally "$1" "$2"
+fi
diff --git a/utils/setup_local_nvme.sh b/utils/setup_local_nvme.sh
new file mode 100755
index 000000000..03b81e8a4
--- /dev/null
+++ b/utils/setup_local_nvme.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# setup_local_nvme.sh — Format and mount local NVMe drives for model caching.
+#
+# Detects unformatted/unmounted NVMe drives and sets up a mount point for
+# caching model weights locally. Designed to be run once per node (idempotent).
+#
+# Usage (run on each compute node, requires root):
+#   sudo bash utils/setup_local_nvme.sh [mount_point]
+#
+# Default mount point: /local-nvme
+#
+# This script:
+#   1. Finds the first available NVMe drive that is not the boot device
+#   2. Formats it with ext4 if not already formatted
+#   3. Mounts it at the specified mount point
+#   4. Adds an fstab entry for persistence across reboots
+#
+# For RAID-0 across multiple NVMe drives (maximum throughput), use:
+#   sudo bash utils/setup_local_nvme.sh --raid [mount_point]
+
+set -euo pipefail
+
+USE_RAID=false
+MOUNT_POINT="/local-nvme"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --raid) USE_RAID=true; shift ;;
+    *) MOUNT_POINT="$1"; shift ;;
+  esac
+done
+
+if [[ $EUID -ne 0 ]]; then
+  echo "ERROR: This script must be run as root (sudo)" >&2
+  exit 1
+fi
+
+echo "[nvme-setup] Mount point: $MOUNT_POINT"
+
+# Already mounted?
+if mountpoint -q "$MOUNT_POINT" 2>/dev/null; then
+  echo "[nvme-setup] $MOUNT_POINT is already mounted:"
+  df -h "$MOUNT_POINT"
+  exit 0
+fi
+
+# Find NVMe drives that are not part of the root filesystem
+ROOT_DEV=$(findmnt -n -o SOURCE / | sed 's/[0-9]*$//' | sed 's/p$//')
+NVME_DRIVES=()
+for dev in /dev/nvme*n1; do
+  [[ -b "$dev" ]] || continue
+  # Skip if this drive is part of root
+  if [[ "$dev" == "$ROOT_DEV"* ]]; then
+    echo "[nvme-setup] Skipping $dev (root device)"
+    continue
+  fi
+  # Skip if already mounted
+  if mount | grep -q "^$dev "; then
+    echo "[nvme-setup] Skipping $dev (already mounted)"
+    continue
+  fi
+  # Skip if part of an md array
+  if grep -q "$(basename "$dev")" /proc/mdstat 2>/dev/null; then
+    echo "[nvme-setup] Skipping $dev (part of md array)"
+    continue
+  fi
+  NVME_DRIVES+=("$dev")
+done
+
+if [[ ${#NVME_DRIVES[@]} -eq 0 ]]; then
+  echo "[nvme-setup] No available NVMe drives found."
+  exit 1
+fi
+
+echo "[nvme-setup] Found ${#NVME_DRIVES[@]} available NVMe drives: ${NVME_DRIVES[*]}"
+
+if [[ "$USE_RAID" == true ]] && [[ ${#NVME_DRIVES[@]} -gt 1 ]]; then
+  # RAID-0 for maximum throughput
+  MD_DEV="/dev/md10"
+  echo "[nvme-setup] Creating RAID-0 array across ${#NVME_DRIVES[@]} drives..."
+
+  if [[ -b "$MD_DEV" ]]; then
+    echo "[nvme-setup] $MD_DEV already exists, using it"
+  else
+    mdadm --create "$MD_DEV" --level=0 --raid-devices=${#NVME_DRIVES[@]} "${NVME_DRIVES[@]}" --run
+  fi
+
+  TARGET_DEV="$MD_DEV"
+else
+  # Single drive (use the first available)
+  TARGET_DEV="${NVME_DRIVES[0]}"
+  echo "[nvme-setup] Using single drive: $TARGET_DEV"
+fi
+
+# Format if needed
+if ! blkid "$TARGET_DEV" | grep -q 'TYPE="ext4"'; then
+  echo "[nvme-setup] Formatting $TARGET_DEV with ext4..."
+  mkfs.ext4 -F -L local-nvme "$TARGET_DEV"
+else
+  echo "[nvme-setup] $TARGET_DEV already has ext4 filesystem"
+fi
+
+# Mount
+mkdir -p "$MOUNT_POINT"
+mount -o noatime,discard "$TARGET_DEV" "$MOUNT_POINT"
+
+# Set permissions so non-root users can write
+chmod 1777 "$MOUNT_POINT"
+
+# Add fstab entry if not present
+if ! grep -qE "[[:space:]]$MOUNT_POINT[[:space:]]" /etc/fstab; then
+  UUID=$(blkid -s UUID -o value "$TARGET_DEV")
+  echo "UUID=$UUID $MOUNT_POINT ext4 noatime,discard,nofail 0 2" >> /etc/fstab
+  echo "[nvme-setup] Added fstab entry"
+fi
+
+echo "[nvme-setup] Done:"
+df -h "$MOUNT_POINT"