Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
894b08e
[AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X
chunfangamd Mar 11, 2026
1c4ad3d
[AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support
chunfangamd Mar 11, 2026
04ab30d
[AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API
chunfangamd Mar 11, 2026
99ce774
[AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster)
chunfangamd Mar 12, 2026
d16bd21
[AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0
chunfangamd Mar 12, 2026
cf4b88c
[AMD] Use public vLLM base image with runtime dependency install
chunfangamd Mar 12, 2026
1b46ce5
[AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg d…
chunfangamd Mar 13, 2026
585ddb4
[AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware p…
chunfangamd Mar 13, 2026
69fcdbd
[AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh
ichbinblau Mar 17, 2026
d214e79
Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but…
ichbinblau Mar 17, 2026
3ffcc74
[AMD] Fix vLLM disagg hang: READ mode support + safety timeouts
chunfangamd Mar 19, 2026
9129ead
Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs)
chunfangamd Mar 21, 2026
728f91a
[AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client ha…
chunfangamd Mar 22, 2026
a163fd6
[AMD] Fix vLLM disagg Slurm job never terminating after benchmark com…
chunfangamd Mar 22, 2026
cb52c29
[AMD] Enable MoRI-IO READ mode by default for vLLM disagg
chunfangamd Mar 22, 2026
25a0310
[AMD] Fix CI checkout failure caused by root-owned __pycache__ files
chunfangamd Mar 22, 2026
5bbc954
[AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS
chunfangamd Mar 23, 2026
89ae516
[AMD] Fix KV reaper deadlock on high-ISL disagg workloads
chunfangamd Mar 23, 2026
f611f47
[AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,…
ichbinblau Mar 24, 2026
bec9c09
Merge branch 'main' into chun-oren-theresa/vllm_disagg
chunfangamd Mar 25, 2026
72a0002
feat: add MiniMax M2.5 PD disaggregation recipe (1P2D, MoRI-EP + MoRI…
ChuanLi1101 Apr 2, 2026
bb6bd0e
feat: add Dockerfile and runtime patch for MiniMax M2.5 WideEP + MoRI
ChuanLi1101 Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,156 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
# - "DECODE_MTP_SIZE=0"


# CI recipe: DeepSeek-R1 FP8 on MI355X using vLLM prefill/decode disaggregation.
# Topology is 1P2D — 1 prefill node (co-located with the proxy) + 2 decode nodes
# = 3 nodes total — swept over three ISL/OSL shapes at the listed concurrencies.
# PREFILL_NODES / DECODE_NODES / VLLM_MORIIO_CONNECTOR_READ_MODE are consumed by
# benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh and its submit.sh.
# NOTE(review): indentation below looks flattened by page extraction — the live
# YAML nests these keys under the recipe name; verify against the repo file.
dsr1-fp8-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:v0.17.1
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi355x-disagg
precision: fp8
framework: vllm-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
# Decode uses Expert Parallelism (ep: 8); prefill runs EP-off (ep: 1).
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

- isl: 8192
osl: 1024
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

- isl: 1024
osl: 8192
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"


# CI recipe: MiniMax-M2.5 FP8 on MI355X, same 1P2D vLLM-disagg topology and
# sweep grid as dsr1-fp8-mi355x-vllm-disagg above. Unlike that recipe, the
# image is a locally built container (see docker/minimax-m25-disagg/build.sh),
# not a public registry tag.
# NOTE(review): indentation below looks flattened by page extraction — verify
# nesting against the repo file.
minimaxm25-fp8-mi355x-vllm-disagg:
image: minimax-m25-disagg:latest # custom build: docker/minimax-m25-disagg/build.sh
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm25
runner: mi355x-disagg
precision: fp8
framework: vllm-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
# Decode uses Expert Parallelism (ep: 8); prefill runs EP-off (ep: 1).
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

- isl: 8192
osl: 1024
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

- isl: 1024
osl: 8192
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"


dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Submit the vLLM disaggregated prefill/decode benchmark (DeepSeek-R1 FP8 on
# MI355X) to Slurm. All configuration arrives via environment variables set by
# the CI harness (see .github/configs/amd-master.yaml, recipe
# dsr1-fp8-mi355x-vllm-disagg). On success the Slurm job id is printed on
# stdout; on submission failure the script exits 1.

source "$(dirname "$0")/../benchmark_lib.sh"

# Fail fast if any required harness-provided variable is missing.
check_env_vars \
  CONC_LIST \
  ISL \
  OSL \
  IMAGE \
  SPEC_DECODING \
  MODEL_PATH \
  PREFILL_NUM_WORKERS \
  PREFILL_TP \
  PREFILL_EP \
  PREFILL_DP_ATTN \
  DECODE_NUM_WORKERS \
  DECODE_TP \
  DECODE_EP \
  DECODE_DP_ATTN \
  PREFILL_NODES \
  DECODE_NODES \
  RANDOM_RANGE_RATIO

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

set -x

cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1

export TIME_LIMIT="08:00:00"
# Re-export so submit.sh (a child process) inherits them. Plain `export NAME`
# is equivalent to the former `export NAME=$NAME` self-assignments.
# NOTE(review): MODEL_NAME is exported here but is not listed in
# check_env_vars above — confirm the harness always sets it.
export MODEL_PATH MODEL_NAME
export CONTAINER_IMAGE="$IMAGE"

# submit.sh takes literal "true"/"false" flags (same EP/DP contract as
# dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh): expert parallelism
# is enabled only when the EP degree exceeds 1; data-parallel attention
# mirrors the *_DP_ATTN boolean as-is.
if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
  export PREFILL_ENABLE_EP=false
else
  export PREFILL_ENABLE_EP=true
fi

if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
  export PREFILL_ENABLE_DP=true
else
  export PREFILL_ENABLE_DP=false
fi

if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
  export DECODE_ENABLE_EP=false
else
  export DECODE_ENABLE_EP=true
fi

if [[ "$DECODE_DP_ATTN" == "true" ]]; then
  export DECODE_ENABLE_DP=true
else
  export DECODE_ENABLE_DP=false
fi

# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST
# (passed as an empty string when unset). Every expansion is quoted (SC2086)
# and the submission's exit status is tested directly instead of via $?.
if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
  "$PREFILL_NUM_WORKERS" \
  "$DECODE_NODES" \
  "$DECODE_NUM_WORKERS" \
  "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
  "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
  "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
  "$PREFILL_TP" "$DECODE_TP" \
  "$RANDOM_RANGE_RATIO" \
  "${NODELIST:-}"); then
  echo "Failed to submit job" >&2
  exit 1
fi

echo "$JOB_ID"
77 changes: 77 additions & 0 deletions benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Submit the vLLM disaggregated prefill/decode benchmark (MiniMax-M2.5 FP8 on
# MI355X) to Slurm. All configuration arrives via environment variables set by
# the CI harness (see .github/configs/amd-master.yaml, recipe
# minimaxm25-fp8-mi355x-vllm-disagg). On success the Slurm job id is printed
# on stdout; on submission failure the script exits 1.

source "$(dirname "$0")/../benchmark_lib.sh"

# Fail fast if any required harness-provided variable is missing.
check_env_vars \
  CONC_LIST \
  ISL \
  OSL \
  IMAGE \
  SPEC_DECODING \
  MODEL_PATH \
  PREFILL_NUM_WORKERS \
  PREFILL_TP \
  PREFILL_EP \
  PREFILL_DP_ATTN \
  DECODE_NUM_WORKERS \
  DECODE_TP \
  DECODE_EP \
  DECODE_DP_ATTN \
  PREFILL_NODES \
  DECODE_NODES \
  RANDOM_RANGE_RATIO

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

set -x

cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1

export TIME_LIMIT="08:00:00"
# Re-export so submit.sh (a child process) inherits them. Plain `export NAME`
# is equivalent to the former `export NAME=$NAME` self-assignments.
# NOTE(review): MODEL_NAME is exported here but is not listed in
# check_env_vars above — confirm the harness always sets it.
export MODEL_PATH MODEL_NAME
export CONTAINER_IMAGE="$IMAGE"

# submit.sh takes literal "true"/"false" flags: expert parallelism is enabled
# only when the EP degree exceeds 1; data-parallel attention mirrors the
# *_DP_ATTN boolean as-is.
if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
  export PREFILL_ENABLE_EP=false
else
  export PREFILL_ENABLE_EP=true
fi

if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
  export PREFILL_ENABLE_DP=true
else
  export PREFILL_ENABLE_DP=false
fi

if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
  export DECODE_ENABLE_EP=false
else
  export DECODE_ENABLE_EP=true
fi

if [[ "$DECODE_DP_ATTN" == "true" ]]; then
  export DECODE_ENABLE_DP=true
else
  export DECODE_ENABLE_DP=false
fi

# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST
# (passed as an empty string when unset). Every expansion is quoted (SC2086)
# and the submission's exit status is tested directly instead of via $?.
if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
  "$PREFILL_NUM_WORKERS" \
  "$DECODE_NODES" \
  "$DECODE_NUM_WORKERS" \
  "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
  "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
  "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
  "$PREFILL_TP" "$DECODE_TP" \
  "$RANDOM_RANGE_RATIO" \
  "${NODELIST:-}"); then
  echo "Failed to submit job" >&2
  exit 1
fi

echo "$JOB_ID"
75 changes: 75 additions & 0 deletions benchmarks/multi_node/vllm_disagg_utils/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
# vLLM Disaggregated Benchmark Runner
#
# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh)
# so that the CI pipeline can collect and process results.
#
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
#        <model_dir> <model_name> <log_path> <isl> <osl> \
#        <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>

# --- Positional arguments -------------------------------------------------
n_prefill=$1
n_decode=$2
prefill_gpus=$3
decode_gpus=$4
model_path=$5
model_name=$6
# MODEL_PATH may already be set by the caller; otherwise derive it from args 5+6.
MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
log_path=$7

# Sweep parameters, all with defaults so the script runs standalone.
chosen_isl=${8:-1024}
chosen_osl=${9:-1024}
concurrency_list=${10:-"512x1"}
chosen_req_rate=${11:-inf}
random_range_ratio=${12:-0.8}
num_prompts_multiplier=${13:-10}

# The concurrency sweep is encoded as an 'x'-separated list, e.g. "8x16x32".
IFS='x' read -r -a sweep <<< "$concurrency_list"

ROUTER_PORT="${ROUTER_PORT:-30000}"

echo "Config ${chosen_isl}; ${chosen_osl}; ${sweep[0]}; ${chosen_req_rate}"

profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}"
mkdir -p "$profile_folder"

# Provides run_benchmark_serving.
source "$(dirname "$0")/../../benchmark_lib.sh"

REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"

# Total GPU count only feeds the result-file name; compute it once.
total_gpus=$((prefill_gpus + decode_gpus))

for cc in "${sweep[@]}"; do

  export_file="${profile_folder}/concurrency_${cc}_req_rate_${chosen_req_rate}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}"

  # Issue enough prompts to keep the target concurrency busy, floored at 16.
  num_prompts=$((cc * num_prompts_multiplier))
  if (( num_prompts < 16 )); then
    num_prompts=16
  fi

  echo "profile_folder: $profile_folder"
  echo "max_concurrency: $cc"
  echo "chosen_req_rate: $chosen_req_rate"
  echo "MODEL_PATH: $MODEL_PATH"
  echo "ROUTER_PORT: $ROUTER_PORT"
  echo "chosen_isl: $chosen_isl"
  echo "chosen_osl: $chosen_osl"
  echo "num_prompts: $num_prompts"
  echo "export_file: $export_file"

  run_benchmark_serving \
    --bench-serving-dir "$REPO_ROOT" \
    --model "$MODEL_PATH" \
    --port "$ROUTER_PORT" \
    --backend openai \
    --input-len "$chosen_isl" \
    --output-len "$chosen_osl" \
    --random-range-ratio "$random_range_ratio" \
    --num-prompts "$num_prompts" \
    --max-concurrency "$cc" \
    --result-filename "$export_file" \
    --result-dir /workspace/

  echo "-----------------------------------------"
  echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
  sleep 10
done
Loading