Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e332f90
add agentic trace replay benchmark infrastructure
cquil11 Apr 1, 2026
28991eb
remove deprecated GpuTransferCollector from metrics collector
cquil11 Apr 1, 2026
695ec2e
modularize metrics collector with backend auto-detection
cquil11 Apr 1, 2026
6a41d49
remove unused Protocol import
cquil11 Apr 1, 2026
c137677
add LMCache agentic trace benchmark for H100
cquil11 Apr 1, 2026
ee76767
add H100 LMCache trace sweep config
cquil11 Apr 1, 2026
839ba0f
fix LMCache benchmark: use fixed-schedule replay, remove ignore_eos
cquil11 Apr 1, 2026
fc8e3cf
remove --fixed-schedule: use concurrency mode per Samuel's recommenda…
cquil11 Apr 1, 2026
6bbbfa9
update yaml
cquil11 Apr 2, 2026
a2e4fe6
fix H100 runner: add SCRIPT_SUFFIX support
cquil11 Apr 2, 2026
fee0278
fix: mkdir RESULT_DIR before trace conversion
cquil11 Apr 2, 2026
769532c
add H200 LMCache trace benchmark and config
cquil11 Apr 2, 2026
02876af
update yaml
cquil11 Apr 2, 2026
2134fd8
fix H200-nb runner: add SCRIPT_SUFFIX support
cquil11 Apr 2, 2026
ab2812a
fix all H200 runners: add SCRIPT_SUFFIX support
cquil11 Apr 2, 2026
5aa993f
fix all runners: add SCRIPT_SUFFIX support
cquil11 Apr 2, 2026
d5dd151
reduce multiturn artifact size: upload only files needed for post-pro…
cquil11 Apr 2, 2026
bd4ec30
add exclusive
cquil11 Apr 2, 2026
a12cc9d
add exclusive
cquil11 Apr 2, 2026
af49d11
add exclusive
cquil11 Apr 2, 2026
48ef44d
use aiperf summary CSV instead of per-record JSONL for post-processing
cquil11 Apr 2, 2026
4f106b8
debug
cquil11 Apr 2, 2026
cfb25fb
fix LMCache traces: convert system role to developer for vLLM v0.18+
cquil11 Apr 2, 2026
ede9bde
revert system->developer role conversion in LMCache traces
cquil11 Apr 2, 2026
a7ac440
fix MetricsCollector missing gpu_transfer_collector attribute
cquil11 Apr 2, 2026
db87b95
fix LMCache traces: strip null fields to pass vLLM Pydantic validation
cquil11 Apr 2, 2026
07ce85d
use hf download for LMCache traces instead of datasets.load_dataset
cquil11 Apr 2, 2026
195ca66
add B200 FP4 multiturn benchmark script using aiperf
cquil11 Apr 2, 2026
09e6ec1
add entry for b200 ds
cquil11 Apr 2, 2026
951326a
add expert parallel support to B200 FP4 aiperf script
cquil11 Apr 2, 2026
0100fa1
skip LMCache trace entries with empty messages
cquil11 Apr 2, 2026
110dfa4
fix: prioritize aiperf summary CSV over malformed client CSV
cquil11 Apr 2, 2026
c64e644
fix aiperf CSV parser: handle multi-section format with different col…
cquil11 Apr 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .github/configs/multiturn-agentic-trace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Sweep matrix for the multi-turn agentic-trace benchmark.
#
# Top-level keys name a hardware / precision / model (and backend variant)
# combination; each nested tp<N> key is a tensor-parallel size carrying the
# sweep parameters used at that size:
#   users   - concurrent-client counts to sweep
#   offload - KV offload modes to run ("on" / "off"); quoted so YAML does
#             not coerce them into booleans
#   ep      - expert-parallel size (only present for the MoE entry below)
h200-fp8-llama70b:
  tp2:
    users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32]
    offload: ["on", "off"]
  tp4:
    users: [2, 4, 6, 8, 16, 24, 32, 40, 48, 56]
    offload: ["on", "off"]
  tp8:
    users: [2, 4, 6, 8, 16, 32, 48, 64, 80, 128, 256]
    offload: ["on", "off"]

mi355x-fp8-llama70b:
  tp2:
    users: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 256]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512]
    offload: ["on", "off"]

# LMCache variants use the same model but a different KV-cache backend;
# sweep points differ from the plain h200 entry above.
h200-fp8-llama70b-lmcache:
  tp2:
    users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256]
    offload: ["on", "off"]

# NOTE: identical sweep points to h200-fp8-llama70b-lmcache — presumably
# intentional (same model, different GPU); confirm before diverging one.
h100-fp8-llama70b-lmcache:
  tp2:
    users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64]
    offload: ["on", "off"]
  tp4:
    users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128]
    offload: ["on", "off"]
  tp8:
    users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256]
    offload: ["on", "off"]

# MoE model entry: carries an expert-parallel size (ep) per TP size in
# addition to the usual sweep parameters.
b200-fp4-dsr1-weka-trace:
  tp4:
    ep: 4
    users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256]
    offload: ["on", "off"]
  tp8:
    ep: 8
    users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512]
    offload: ["on", "off"]
181 changes: 181 additions & 0 deletions .github/workflows/benchmark-multiturn-tmpl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Reusable (workflow_call) template: runs one multi-turn benchmark
# configuration on a self-hosted runner, checks the result status file
# written by the launch script, and uploads result artifacts and server logs.
name: Template - Multi-Turn Benchmark
on:
  workflow_call:
    inputs:
      # Self-hosted runner label to schedule the job on.
      runner:
        required: true
        type: string
      # Container image used by the launch script.
      image:
        required: true
        type: string
      model:
        required: true
        type: string
      precision:
        required: false
        type: string
        default: 'fp4'
      # Experiment name; also used in the job display name.
      exp-name:
        required: true
        type: string
      # Tensor-parallel size (string so it passes through workflow_call).
      tp:
        required: true
        type: string
      # Concurrent-client count for this run.
      users:
        required: true
        type: string
      offload-mode:
        description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching"
        required: true
        type: string
      duration:
        required: false
        type: string
        default: ''
      request-rate:
        description: "Request rate per client (Poisson, req/s). 0 = no delay."
        required: false
        type: string
        default: '0'
      total-cpu-dram-gb:
        required: false
        type: string
        default: '300'
      script-suffix:
        description: "Suffix appended to benchmark script name (e.g. '_lmcache')"
        required: false
        type: string
        default: ''
      ep:
        description: "Expert parallelism size (for MoE models)"
        required: false
        type: string
        default: '0'
      ref:
        description: "Git ref (branch/sha) to checkout"
        required: false
        type: string

# Inputs are re-exported as env vars for the launch/benchmark scripts.
env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
  EXP_NAME: ${{ inputs.exp-name }}
  MODEL: ${{ inputs.model }}
  IMAGE: ${{ inputs.image }}
  PRECISION: ${{ inputs.precision }}
  FRAMEWORK: 'vllm'
  TP: ${{ inputs.tp }}
  EP_SIZE: ${{ inputs.ep }}
  USERS: ${{ inputs.users }}
  OFFLOAD_MODE: ${{ inputs.offload-mode }}
  DURATION: ${{ inputs.duration }}
  REQUEST_RATE: ${{ inputs.request-rate }}
  TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
  SCRIPT_SUFFIX: ${{ inputs.script-suffix }}
  SPEC_DECODING: 'off'

permissions:
  contents: read

jobs:
  benchmark:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 180
    name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}"
    steps:
      # The cleanup script is anchored so the post-run step can reuse it
      # verbatim via the *resource-cleanup alias below.
      # NOTE(review): YAML anchors in workflow files require GitHub Actions'
      # anchor/alias support (added 2025) — confirm the Actions service or
      # GHES version in use supports them.
      - name: Resource cleanup (pre-run)
        run: &resource-cleanup |
          # Cleanup Docker resources
          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
            echo "[Docker] Cleaning up resources ..."
            docker ps -aq | xargs -r docker rm -f
            docker network prune -f
            # Poll until every container is actually gone before proceeding.
            while [ -n "$(docker ps -aq)" ]; do
              docker ps -a
              sleep 5
            done
          fi

          # Cleanup SLURM resources
          if command -v squeue >/dev/null 2>&1; then
            # Runners in this list submit SLURM jobs named after the runner;
            # cancel by job name. All other runners cancel by current user.
            if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then
              echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
              scancel --name="${{ runner.name }}" || true
              # Wait for scancel to take effect before continuing.
              while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
                squeue --name="${{ runner.name }}"
                sleep 5
              done
            else
              echo "[Slurm] Cleaning up jobs for user: $USER ..."
              scancel -u "$USER" || true
              while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
                squeue -u "$USER"
                sleep 5
              done
            fi
          fi

      # A crashed previous run can leave git lock files that break checkout.
      - name: Clean stale git locks
        run: find . -name 'index.lock' -delete 2>/dev/null || true

      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0
          ref: ${{ inputs.ref || github.ref }}
          submodules: true


      - name: Launch job script
        env:
          RUNNER_NAME: ${{ runner.name }}
          RESULT_DIR: /workspace/results
        run: |
          # ${RUNNER_NAME%%_*} strips the first underscore and everything
          # after it, so e.g. "h100-cw_2" selects runners/launch_h100-cw.sh.
          bash ./runners/launch_${RUNNER_NAME%%_*}.sh

          # The runner script doesn't propagate exit codes (scancel masks them).
          # Check status.txt to determine if the benchmark actually succeeded.
          # NOTE(review): status is read from ./results while RESULT_DIR is
          # /workspace/results — presumably the launch script maps one to the
          # other; confirm against the runners/launch_*.sh scripts.
          if [ ! -f results/status.txt ]; then
            echo "Run failed: results/status.txt not found." >&2
            exit 1
          fi
          STATUS=$(cat results/status.txt)
          if [ "$STATUS" != "SUCCESS" ]; then
            echo "Run failed: status=$STATUS" >&2
            cat results/benchmark.log 2>/dev/null || true
            exit 1
          fi

      # Upload even on failure so partial results/logs are inspectable.
      # Only files needed for post-processing are listed, to keep the
      # artifact small; missing files are ignored rather than fatal.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
          path: |
            results/metrics_client_metrics.csv
            results/metrics_server_metrics.csv
            results/metrics_plots.png
            results/benchmark.log
            results/config.yaml
            results/vllm_command.txt
            results/benchmark_command.txt
            results/benchmark_metadata.json
            results/metrics_workload.png
            results/aiperf_artifacts/profile_export_aiperf.csv
            results/workload_distribution_summary.txt
            results/workload_distribution_plots.png
            results/trace_replay/detailed_results.csv
            results/status.txt
          if-no-files-found: ignore

      # Server log goes in a separate artifact so the main results artifact
      # stays small and the (potentially large) log can be fetched on demand.
      - name: Upload server logs
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
          path: results/server.log
          if-no-files-found: ignore

      # Same cleanup as the pre-run step (shared via the YAML alias), run
      # unconditionally so a failed benchmark cannot leak jobs/containers.
      - name: Resource cleanup (post-run)
        if: always()
        run: *resource-cleanup
Loading
Loading