Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
362 changes: 362 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1 (FP8) served by SGLang on the `mi325x` runner, single node
# (`multinode: false`), with `spec-decoding: mtp` on every search-space entry.
# Both sequence-length profiles sweep the same TP8 concurrency range (4 to 64).
dsr1-fp8-mi325x-sglang-mtp:
  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi325x
  precision: fp8
  framework: sglang
  multinode: false
  seq-len-configs:
    # isl 1024 / osl 1024
    - isl: 1024
      osl: 1024
      search-space:
        - tp: 8
          conc-start: 4
          conc-end: 64
          spec-decoding: mtp
    # isl 8192 / osl 1024
    - isl: 8192
      osl: 1024
      search-space:
        - tp: 8
          conc-start: 4
          conc-end: 64
          spec-decoding: mtp

dsr1-fp8-mi355x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -1231,3 +1249,347 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsr1-fp8-mi325x-sglang-disagg:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're missing the perf-changelog.yaml entry too.

image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
# - spec-decoding: "none"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=0"

# # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
# - spec-decoding: "none"
# conc-list: [ 768, 512, 256 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
# Speculative decoding is off for this entry (DECODE_MTP_SIZE=0 below).
- spec-decoding: "none"
  conc-list: [256, 128, 64, 32, 16, 8, 4]
  # Prefill side: one TP8 worker, no EP, no DP attention.
  prefill:
    num-worker: 1
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "PREFILL_NODES=1"
  # Decode side: two TP8 workers, no EP, no DP attention.
  decode:
    num-worker: 2
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "DECODE_NODES=2"
      - "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
# Speculative decoding is off for this entry (DECODE_MTP_SIZE=0 below).
- spec-decoding: "none"
  conc-list: [64, 32, 16, 8, 4, 2, 1]
  # Prefill side: one TP4 worker, no EP, no DP attention.
  prefill:
    num-worker: 1
    tp: 4
    ep: 1
    dp-attn: false
    additional-settings:
      - "PREFILL_NODES=1"
  # Decode side: one TP8 worker, no EP, no DP attention.
  decode:
    num-worker: 1
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "DECODE_NODES=1"
      - "DECODE_MTP_SIZE=0"

# Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X
# or if only the multi-node distributed init is broken with Broadcom Thor 2)
# NOTE(review): this is a diagnostic entry rather than a curve point, and it
# enables EP8 + DP attention on decode even though the DISABLED note in this
# search-space says EP8/DP configs fail at runtime on MI325X. Confirm it
# should stay enabled before merge.
- spec-decoding: "none"
  conc-list: [512, 256, 128, 64]
  # Prefill side: one TP4 worker, no EP, no DP attention.
  prefill:
    num-worker: 1
    tp: 4
    ep: 1
    dp-attn: false
    additional-settings:
      - "PREFILL_NODES=1"
  # Decode side: one TP8 worker with EP8 and DP attention, on one decode node.
  decode:
    num-worker: 1
    tp: 8
    ep: 8
    dp-attn: true
    additional-settings:
      - "DECODE_NODES=1"
      - "DECODE_MTP_SIZE=0"

- isl: 8192
osl: 1024
search-space:
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
# - spec-decoding: "none"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "PREFILL_NODES=2"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=1"
# - "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
# Speculative decoding is off for this entry (DECODE_MTP_SIZE=0 below).
- spec-decoding: "none"
  conc-list: [256, 128, 64, 32, 16, 8, 4]
  # Prefill side: one TP8 worker, no EP, no DP attention.
  prefill:
    num-worker: 1
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "PREFILL_NODES=1"
  # Decode side: two TP8 workers, no EP, no DP attention.
  decode:
    num-worker: 2
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "DECODE_NODES=2"
      - "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure that TP4 is on the Pareto frontier here? Do you have a graph?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You only have the TP4 curve, and you have "hide non-optimal" enabled? Can you run the rest of the 24 data points?

# Speculative decoding is off for this entry (DECODE_MTP_SIZE=0 below).
- spec-decoding: "none"
  conc-list: [64, 32, 16, 8, 4, 2, 1]
  # Prefill side: one TP4 worker, no EP, no DP attention.
  prefill:
    num-worker: 1
    tp: 4
    ep: 1
    dp-attn: false
    additional-settings:
      - "PREFILL_NODES=1"
  # Decode side: one TP8 worker, no EP, no DP attention.
  decode:
    num-worker: 1
    tp: 8
    ep: 1
    dp-attn: false
    additional-settings:
      - "DECODE_NODES=1"
      - "DECODE_MTP_SIZE=0"


# DeepSeek-R1 (FP8) on the `mi325x-disagg` runner: multi-node, disaggregated
# prefill/decode under `sglang-disagg`, with `spec-decoding: "mtp"` on every
# enabled search-space entry.
#
# NOTE(review): the image lives under a personal ghcr.io namespace, whereas the
# single-node configs in this file use lmsysorg/sglang images. Confirm this is
# the intended long-term source.
# NOTE(review): enabled entries set DECODE_MTP_SIZE=3, while the disabled
# EP8/DP entries carry DECODE_MTP_SIZE=1. Confirm the value before re-enabling.
dsr1-fp8-mi325x-sglang-disagg-mtp:
  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi325x-disagg
  precision: fp8
  framework: sglang-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # MTP configurations
        # DISABLED: EP8/DP configs fail at runtime on MI325X
        # (MoRI/RDMA issue with Broadcom Thor 2)
        # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
        # - spec-decoding: "mtp"
        #   conc-list: [512, 1024]
        #   prefill:
        #     num-worker: 1
        #     tp: 8
        #     ep: 1
        #     dp-attn: false
        #     additional-settings:
        #       - "PREFILL_NODES=1"
        #   decode:
        #     num-worker: 1
        #     tp: 8
        #     ep: 8
        #     dp-attn: true
        #     additional-settings:
        #       - "DECODE_NODES=2"
        #       - "DECODE_MTP_SIZE=1"

        # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
        # - spec-decoding: "mtp"
        #   conc-list: [768, 512, 256]
        #   prefill:
        #     num-worker: 1
        #     tp: 8
        #     ep: 1
        #     dp-attn: false
        #     additional-settings:
        #       - "PREFILL_NODES=1"
        #   decode:
        #     num-worker: 2
        #     tp: 8
        #     ep: 8
        #     dp-attn: true
        #     additional-settings:
        #       - "DECODE_NODES=2"
        #       - "DECODE_MTP_SIZE=1"

        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        - spec-decoding: "mtp"
          conc-list: [256, 128, 64, 32, 16, 8, 4]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=3"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        - spec-decoding: "mtp"
          conc-list: [64, 32, 16, 8, 4, 2, 1]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=3"

        # Single-node EP8/DP decode with MTP
        # (test: isolates EP/DP vs multi-node init)
        # NOTE(review): diagnostic entry that enables EP8 + DP attention despite
        # the DISABLED note above. Confirm it should stay enabled before merge.
        - spec-decoding: "mtp"
          conc-list: [512, 256, 128, 64]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=3"

    - isl: 8192
      osl: 1024
      search-space:
        # MTP configurations
        # DISABLED: EP8/DP configs fail at runtime on MI325X
        # (MoRI/RDMA issue with Broadcom Thor 2)
        # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
        # - spec-decoding: "mtp"
        #   conc-list: [512, 1024]
        #   prefill:
        #     num-worker: 2
        #     tp: 8
        #     ep: 8
        #     dp-attn: true
        #     additional-settings:
        #       - "PREFILL_NODES=2"
        #   decode:
        #     num-worker: 1
        #     tp: 8
        #     ep: 8
        #     dp-attn: true
        #     additional-settings:
        #       - "DECODE_NODES=1"
        #       - "DECODE_MTP_SIZE=1"

        # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
        # NOTE(review): this list goes down to 2, while the 1k/1k
        # "Bottom of curve" list in this stanza stops at 4. Confirm intentional.
        - spec-decoding: "mtp"
          conc-list: [256, 128, 64, 32, 16, 8, 4, 2]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"
              - "DECODE_MTP_SIZE=3"

        # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
        - spec-decoding: "mtp"
          conc-list: [64, 32, 16, 8, 4, 2, 1]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=1"
              - "DECODE_MTP_SIZE=3"
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ mi325x:
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
# Runner pool selected by configs that set `runner: mi325x-disagg`
# (the multi-node disaggregated MI325X stanzas in amd-master.yaml).
# NOTE(review): these labels overlap the `mi325x` pool entries visible in this
# hunk (mi325x-amd_1..3). Presumably both pools share the same machines;
# confirm that concurrent single-node and disagg jobs cannot collide.
mi325x-disagg:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi355x:
- 'mi355x-amds_0'
- 'mi355x-amds_1'
Expand Down
Loading
Loading