Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
dc27fa1
Update nvidia-master.yaml
kedarpotdar-nv Mar 26, 2026
9ee612f
vllm version bump
kedarpotdar-nv Mar 26, 2026
a27edee
add perf changelog
kedarpotdar-nv Mar 26, 2026
a33dc21
update search space and configs
kedarpotdar-nv Mar 27, 2026
ecaac19
Merge branch 'main' into nv/minimax-vllm018
kedarpotdar-nv Mar 27, 2026
89acdf2
fix typo in VLLM_USE_DEEP_GEMM
kedarpotdar-nv Mar 27, 2026
f12f56d
Merge origin/main and resolve conflicts
github-actions[bot] Mar 28, 2026
83706ae
Remove ISL 1024 / OSL 8192 seq-len config for minimaxm2.5-fp8-b200-vllm
github-actions[bot] Mar 29, 2026
e405b60
update image
kedarpotdar-nv Mar 30, 2026
e4333e9
Merge branch 'main' into nv/minimax-vllm018
kedarpotdar-nv Mar 30, 2026
534927b
update config and remove DEEPGEMM flag
kedarpotdar-nv Apr 1, 2026
b885b69
Merge branch 'main' into nv/minimax-vllm018
kedarpotdar-nv Apr 1, 2026
6073a5e
test tep
kedarpotdar-nv Apr 1, 2026
4eb4c92
Merge branch 'main' into nv/minimax-vllm018
kedarpotdar-nv Apr 1, 2026
f6d81f7
fix typo in ep bash script
kedarpotdar-nv Apr 2, 2026
d172646
add max cudagraph size
kedarpotdar-nv Apr 3, 2026
97b25f1
upgrade to vllm 0.19
kedarpotdar-nv Apr 3, 2026
beb6a6b
typo
kedarpotdar-nv Apr 3, 2026
9af9992
revert h200 change
kedarpotdar-nv Apr 3, 2026
f1cc537
Merge branch 'main' into nv/minimax-vllm018
kedarpotdar-nv Apr 3, 2026
0d734c1
fix: update perf-changelog version to v0.19.0
github-actions[bot] Apr 3, 2026
4f15004
Remove commented-out tp:8 search-space entry
github-actions[bot] Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3101,7 +3101,7 @@ gptoss-fp4-b200-vllm:
- { tp: 8, conc-start: 4, conc-end: 4 }

minimaxm2.5-fp8-b200-vllm:
image: vllm/vllm-openai:v0.17.0-cu130
image: vllm/vllm-openai:v0.19.0-cu130
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: b200
Expand All @@ -3112,13 +3112,15 @@ minimaxm2.5-fp8-b200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 2, conc-start: 4, conc-end: 512 }
- { tp: 2, ep: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 512 }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 256 }

gptoss-fp4-h100-vllm:
image: vllm/vllm-openai:v0.18.0
Expand Down
12 changes: 7 additions & 5 deletions benchmarks/single_node/minimaxm2.5_fp8_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ hf download "$MODEL"
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export VLLM_USE_FLASHINFER_MOE_FP8=0
export VLLM_MOE_USE_DEEP_GEMM=0
export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl

if [ "$EP_SIZE" -ge 1 ]; then
if [ "$EP_SIZE" -gt 1 ]; then
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch lol

EP=" --enable-expert-parallel"
else
EP=" "
Expand All @@ -44,10 +43,13 @@ set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
$EP \
--gpu-memory-utilization 0.95 \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size=32 \
--no-enable-prefix-caching \
--kv-cache-dtype fp8 \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--stream-interval 20 --no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
12 changes: 11 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,7 @@
description:
- "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966

- config-keys:
# NVIDIA single-node
- dsr1-fp4-b200-sglang
Expand Down Expand Up @@ -1235,3 +1235,13 @@
- "New model support on ATOM framework"
- "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963

- config-keys:
- minimaxm2.5-fp8-b200-vllm
description:
- "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200"
- "Add expert-parallel search-space entries (tp2 ep2 conc 4-256, tp4 ep4 conc 16-64) to the ISL 1024 / OSL 1024 config and raise conc-end for the existing tp2/tp4 entries"
- "Remove ISL 1024 / OSL 8192 seq-len config"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947


Loading