Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@ dsr1-fp8-mi300x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1 (FP8) on MI300X — single-node SGLang with MTP speculative
# decoding. Same sweep as the non-MTP dsr1-fp8-mi300x-sglang entry above,
# with spec-decoding: mtp added to every search-space point.
dsr1-fp8-mi300x-sglang-mtp:
  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi300x
  precision: fp8
  framework: sglang
  multinode: false
  seq-len-configs:
    # isl/osl = input/output sequence lengths (tokens) for each benchmark run.
    - isl: 1024
      osl: 1024
      search-space:
        # Concurrency swept from conc-start to conc-end at fixed TP8.
        - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }

dsr1-fp8-mi325x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -1272,3 +1290,187 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


# DeepSeek-R1 (FP8) on MI300X — multinode SGLang with disaggregated
# prefill/decode workers (no speculative decoding: DECODE_MTP_SIZE=0).
# Each search-space entry describes one prefill/decode topology plus the
# explicit list of concurrency levels to benchmark.
dsr1-fp8-mi300x-sglang-disagg:
  image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi300x-disagg
  precision: fp8
  framework: sglang-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        # — 3 nodes total (PREFILL_NODES=1 + DECODE_NODES=2).
        - spec-decoding: "none"
          conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=0"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        # — 2 nodes total; smaller prefill TP for the low-load regime.
        - spec-decoding: "none"
          conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=0"

    # Long-input variant: same two topologies at isl=8192.
    - isl: 8192
      osl: 1024
      search-space:
        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        - spec-decoding: "none"
          conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=0"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        - spec-decoding: "none"
          conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=0"

# DeepSeek-R1 (FP8) on MI300X — multinode SGLang with disaggregated
# prefill/decode AND MTP speculative decoding on the decode side
# (DECODE_MTP_SIZE=3). Mirrors dsr1-fp8-mi300x-sglang-disagg otherwise.
dsr1-fp8-mi300x-sglang-disagg-mtp:
  image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi300x-disagg
  precision: fp8
  framework: sglang-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        # — 3 nodes total (PREFILL_NODES=1 + DECODE_NODES=2).
        - spec-decoding: "mtp"
          conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=3"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        - spec-decoding: "mtp"
          conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=3"

    # Long-input variant: same two topologies at isl=8192.
    - isl: 8192
      osl: 1024
      search-space:
        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        # NOTE(review): this conc-list includes 2, unlike the corresponding
        # non-MTP 8192 block (which stops at 4) — confirm the asymmetry is
        # intentional.
        - spec-decoding: "mtp"
          conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=3"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        - spec-decoding: "mtp"
          conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=3"
4 changes: 4 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ mi300x:
- 'mi300x-amds_1'
- 'mi300x-amds_2'
- 'mi300x-amds_3'
# Node pool for MI300X disaggregated (prefill/decode) multinode jobs.
# Three hosts, matching the largest topology used by the *-disagg configs
# in amd-master.yaml (PREFILL_NODES=1 + DECODE_NODES=2).
# NOTE(review): mi300x-amds_1 is present in the mi300x pool above but
# excluded here — confirm this is intentional (e.g. host reserved/unhealthy).
mi300x-disagg:
  - 'mi300x-amds_0'
  - 'mi300x-amds_2'
  - 'mi300x-amds_3'
mi325x:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
Expand Down
16 changes: 16 additions & 0 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ if [[ -z "$IBDEVICES" ]]; then
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
elif [[ $NODENAME == mia1* ]]; then
export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
elif [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI300X cluster: Broadcom RoCE (bnxt_re); all 8 devices present
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
else
echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
exit 1
Expand Down Expand Up @@ -101,6 +107,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
fi
Expand All @@ -114,6 +125,11 @@ else
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
echo " This is normal for clusters without QoS or outside Docker containers."
Expand Down
38 changes: 28 additions & 10 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
exit 1
fi

# Validate MODEL_NAME exists as a top-level key in models.yaml
if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
echo "Error: Model '$MODEL_NAME' not found in models.yaml"
# MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>).
_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"

# Validate the yaml key exists as a top-level key in models.yaml
if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
echo "Available models:"
grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /'
exit 1
fi
echo "Model found: $MODEL_NAME"
echo "Model found: $_MODEL_YAML_KEY"

# All models use server.sh as the entrypoint
RUN_FILE="server.sh"
Expand Down Expand Up @@ -133,6 +137,20 @@ check_model_path() {
fi
}

# If MODEL_NAME is a plain name (not already a HF cache path), try to resolve
# the HF hub cache layout on this node: models--{org}--{repo}/snapshots/<hash>
# This handles clusters where the cache is node-local and can't be resolved
# from the job launcher (which may run on a different host).
if [[ "$MODEL_NAME" != models--* ]] && [[ "$MODEL_NAME" != *snapshots* ]]; then
_HF_ORG_REPO="${MODEL_YAML_KEY:-$MODEL_NAME}"
_HF_DIR="models--$(echo "${_HF_ORG_REPO}" | tr '/' '--')"
_SNAPSHOT=$(ls "${MODEL_DIR}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
if [[ -n "${_SNAPSHOT}" ]]; then
MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
echo "Resolved MODEL_NAME from local HF cache: ${MODEL_NAME}"
fi
fi

# Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
Expand Down Expand Up @@ -249,10 +267,9 @@ echo "NNODES is ${NNODES}"
echo "REPO Directory is ${DI_REPO_DIR}"
echo "USER_NAME is ${USER_NAME}"

# Get the RDMA priority and DSCP value from the NIC
# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
if ! command -v nicctl >/dev/null 2>&1; then
echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
exit 1
echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2
fi

# Reduce log spam
Expand Down Expand Up @@ -296,8 +313,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)

cleanup() {
echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
# clean up the logs folder
sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
# NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks
timeout --kill-after=5 30 sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true

echo "[${SLURM_JOB_ID}] cleanup done."
}
Expand Down Expand Up @@ -357,7 +374,7 @@ exec sudo docker run --rm \
--privileged \
-v ${MODEL_DIR}:/models \
-v \$HOME/.ssh:/root/.ssh \
-v $(which nicctl):/usr/sbin/nicctl \
$(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
--shm-size 128G \
-v /tmp:/run_logs \
-v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
Expand All @@ -373,6 +390,7 @@ exec sudo docker run --rm \
-e xP=\$xP \
-e yD=\$yD \
-e MODEL_NAME=\$MODEL_NAME \
-e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
-e IPADDRS=\$IPADDRS \
-e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
-e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
Expand Down
10 changes: 9 additions & 1 deletion benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,12 @@ fi
# Load model config via inline Python (PyYAML is available in SGLang containers)
# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
# is done here in Python to avoid bash glob-expanding the * characters.
_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
eval "$(python3 -c "
import yaml, sys, os

config_path = '${MODELS_YAML}'
model_name = '${MODEL_NAME}'
model_name = '${_MODEL_YAML_KEY}'

with open(config_path) as f:
models = yaml.safe_load(f)
Expand Down Expand Up @@ -212,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
fi

# DP attention forces chunked_prefill_size to 1024 inside SGLang, which must be
# <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Bump the decode dispatch
# token limit when DP is enabled to satisfy this assertion.
if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then
MORI_MAX_DISPATCH_TOKENS_DECODE=1024
fi

# =============================================================================
# Cluster Topology Configuration
# =============================================================================
Expand Down
Loading
Loading